import officeParser from 'officeparser';
import WordExtractor from 'word-extractor';
import * as XLSX from 'xlsx';
import { execSync } from 'child_process';
import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';
import { randomUUID } from 'crypto';

/** MIME types that `processDocument` returns untouched as `type: 'native'`. */
export const GEMINI_NATIVE_TYPES = [
  'application/pdf',
  'text/plain',
  'text/markdown',
  'text/csv',
  'text/html',
  'text/css',
  'text/javascript',
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
  'image/heic',
  'image/heif',
];

/** Textual MIME types that `processDocument` decodes as UTF-8 and relabels as `text/plain`. */
export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [
  'text/xml',
  'application/xml',
  'application/json',
  'application/javascript',
  'text/x-python',
  'text/x-java-source',
  'text/x-c',
  'text/x-c++',
  'text/x-typescript',
  'text/yaml',
  'application/x-yaml',
];

/** MIME types that officeparser can extract text from (modern Office formats). */
export const OFFICEPARSER_TYPES = [
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
  'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx
  'application/vnd.oasis.opendocument.text', // .odt
  'application/vnd.oasis.opendocument.presentation', // .odp
  'application/vnd.oasis.opendocument.spreadsheet', // .ods
  'application/rtf', // .rtf
];

// Legacy Office formats requiring specialized handling
export const LEGACY_DOC_TYPE = 'application/msword'; // .doc (word-extractor)
export const LEGACY_XLS_TYPE = 'application/vnd.ms-excel'; // .xls (xlsx/SheetJS)
export const LEGACY_PPT_TYPE = 'application/vnd.ms-powerpoint'; // .ppt (LibreOffice conversion, then officeparser)

/** Image types that can use OCR for text extraction. */
export const OCR_CAPABLE_TYPES = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];

/** Every MIME type `processDocument` accepts. */
export const SUPPORTED_TYPES = [
  ...GEMINI_NATIVE_TYPES,
  ...OFFICEPARSER_TYPES,
  ...TYPES_TO_CONVERT_TO_PLAIN_TEXT,
  LEGACY_DOC_TYPE,
  LEGACY_XLS_TYPE,
  LEGACY_PPT_TYPE,
];

/** Lower-case file extension (with leading dot) -> MIME type, used when an upload is labelled as generic binary. */
const EXTENSION_TO_MIME: Record<string, string> = {
  '.txt': 'text/plain',
  '.md': 'text/markdown',
  '.markdown': 'text/markdown',
  '.csv': 'text/csv',
  '.html': 'text/html',
  '.htm': 'text/html',
  '.xml': 'application/xml',
  '.json': 'application/json',
  '.js': 'text/javascript',
  '.mjs': 'text/javascript',
  '.ts': 'text/x-typescript',
  '.tsx': 'text/x-typescript',
  '.jsx': 'text/javascript',
  '.py': 'text/x-python',
  '.java': 'text/x-java-source',
  '.c': 'text/x-c',
  '.h': 'text/x-c',
  '.cpp': 'text/x-c++',
  '.hpp': 'text/x-c++',
  '.cc': 'text/x-c++',
  '.css': 'text/css',
  '.yaml': 'text/yaml',
  '.yml': 'text/yaml',
};

/**
 * Maps a reported MIME type onto one of `SUPPORTED_TYPES` where possible.
 *
 * Matching is done on the lower-cased type with any parameters removed, so
 * `"Text/HTML; charset=utf-8"` resolves to `"text/html"`.
 *
 * @param mimeType - MIME type as reported by the caller; may carry parameters.
 * @param filename - Optional file name; its extension is consulted only when
 *   the type is `application/octet-stream`.
 * @returns A supported MIME type when one can be determined, `text/plain` for
 *   any other `text/*` type, otherwise `mimeType` exactly as it was passed in.
 */
export function normalizeMimeType(mimeType: string, filename?: string): string {
  // Media types are case-insensitive and may be followed by ";"-separated parameters.
  const essence = (mimeType.split(';', 1)[0] ?? '').trim().toLowerCase();

  if (SUPPORTED_TYPES.includes(essence)) {
    return essence;
  }

  if (essence === 'application/octet-stream' && filename) {
    const ext = filename.toLowerCase().match(/\.[^.]+$/)?.[0];
    const mapped = ext ? EXTENSION_TO_MIME[ext] : undefined;
    if (mapped) {
      return mapped;
    }
  }

  // Unknown text subtypes are still readable as plain text.
  if (essence.startsWith('text/')) {
    return 'text/plain';
  }

  return mimeType;
}

/** Result of `processDocument`: either the original bytes or extracted text. */
export interface ProcessedDocument {
  /** `'native'` when `content` is the untouched input buffer, `'text'` when it is extracted text. */
  type: 'native' | 'text';
  content: Buffer | string;
  /** MIME type describing `content` (`text/plain` for extracted text). */
  mimeType: string;
}

/** Options accepted by `processDocument`. */
export interface ProcessOptions {
  /** Request OCR: for OCR-capable images, and for images embedded in Office documents. */
  useOcr?: boolean;
}

/** True when `mimeType` is listed in `SUPPORTED_TYPES` (exact match). */
export function isSupportedType(mimeType: string): boolean {
  return SUPPORTED_TYPES.includes(mimeType);
}

/** True when `mimeType` is listed in `GEMINI_NATIVE_TYPES`. */
export function isGeminiNative(mimeType: string): boolean {
  return GEMINI_NATIVE_TYPES.includes(mimeType);
}

/** True when `mimeType` is listed in `OCR_CAPABLE_TYPES`. */
export function isOcrCapable(mimeType: string): boolean {
  return OCR_CAPABLE_TYPES.includes(mimeType);
}

/** True when `mimeType` is listed in `OFFICEPARSER_TYPES`. */
export function needsOfficeParser(mimeType: string): boolean {
  return OFFICEPARSER_TYPES.includes(mimeType);
}

/** True when `mimeType` is listed in `TYPES_TO_CONVERT_TO_PLAIN_TEXT`. */
export function needsPlainTextConversion(mimeType: string): boolean {
  return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType);
}

/** True for legacy `.doc` files. */
export function needsWordExtractor(mimeType: string): boolean {
  return mimeType === LEGACY_DOC_TYPE;
}

/** True for legacy `.xls` files. */
export function needsSheetJS(mimeType: string): boolean {
  return mimeType === LEGACY_XLS_TYPE;
}

/**
 * True for legacy `.ppt` files.
 *
 * Despite the name, `processDocument` handles these through LibreOffice
 * conversion, not SheetJS; the name is kept for existing callers.
 */
export function needsSheetJSPPT(mimeType: string): boolean {
  return mimeType === LEGACY_PPT_TYPE;
}

/**
 * Extracts text from a buffer with officeparser.
 *
 * @param buffer - File contents.
 * @param useOcr - When true, attachments are extracted with OCR enabled and
 *   each attachment's OCR text is appended on its own line.
 * @returns The extracted text.
 */
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
  const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
  const ast = await officeParser.parseOffice(buffer, config);
  let text = ast.toText();

  if (useOcr && ast.attachments) {
    for (const attachment of ast.attachments) {
      if (attachment.ocrText) {
        text += '\n' + attachment.ocrText;
      }
    }
  }

  return text;
}

/** Extracts the body text of a legacy `.doc` buffer using word-extractor. */
async function extractWithWordExtractor(buffer: Buffer): Promise<string> {
  const extractor = new WordExtractor();
  const doc = await extractor.extract(buffer);
  return doc.getBody();
}

/**
 * Renders every non-empty sheet of a workbook as CSV.
 *
 * @returns Sheets joined by blank lines, each preceded by a `--- <sheet name> ---` header.
 */
function extractWithSheetJS(buffer: Buffer): string {
  const workbook = XLSX.read(buffer, { type: 'buffer' });
  const textParts: string[] = [];

  for (const sheetName of workbook.SheetNames) {
    const sheet = workbook.Sheets[sheetName];
    const csv = XLSX.utils.sheet_to_csv(sheet);
    if (csv.trim()) {
      textParts.push(`--- ${sheetName} ---\n${csv}`);
    }
  }

  return textParts.join('\n\n');
}

// Map legacy extensions to their modern equivalents for LibreOffice conversion
const LEGACY_TO_MODERN: Record<string, string> = {
  '.ppt': 'pptx',
  '.doc': 'docx',
  '.xls': 'xlsx',
};

/**
 * Converts a legacy Office file to its modern format with LibreOffice, then
 * extracts text from the converted file via `extractWithOfficeParser`.
 *
 * @param buffer - Contents of the legacy file.
 * @param extension - Extension including the leading dot (e.g. `'.ppt'`);
 *   extensions missing from `LEGACY_TO_MODERN` are converted to PDF.
 * @param useOcr - Forwarded to `extractWithOfficeParser`.
 * @returns The extracted text.
 * @throws Error when LibreOffice is missing, the conversion fails, or no
 *   output file is produced.
 */
async function extractWithLibreOffice(
  buffer: Buffer,
  extension: string,
  useOcr: boolean = false
): Promise<string> {
  // A random id keeps concurrent conversions from colliding in the shared temp dir.
  const tempId = randomUUID();
  const tempDir = tmpdir();
  const inputPath = join(tempDir, `input-${tempId}${extension}`);
  const modernExt = LEGACY_TO_MODERN[extension] ?? 'pdf';
  // LibreOffice writes <input basename>.<target ext> into --outdir.
  const outputPath = join(tempDir, `input-${tempId}.${modernExt}`);

  try {
    writeFileSync(inputPath, buffer);

    try {
      // NOTE(review): execSync blocks the event loop for up to the 60 s timeout.
      execSync(
        `libreoffice --headless --convert-to ${modernExt} --outdir "${tempDir}" "${inputPath}"`,
        { timeout: 60000, stdio: 'pipe' }
      );
    } catch (execError) {
      // LibreOffice not available or conversion failed
      const error = execError as Error & { code?: string };
      if (error.code === 'ENOENT' || error.message?.includes('not found')) {
        throw new Error(
          `Legacy ${extension} files require LibreOffice for text extraction. ` +
            `Please convert to .${modernExt} format or ensure LibreOffice is installed.`
        );
      }
      throw new Error(`LibreOffice conversion failed: ${error.message}`);
    }

    if (!existsSync(outputPath)) {
      throw new Error('LibreOffice conversion produced no output');
    }

    // Same officeparser/OCR path as natively supported Office formats.
    return await extractWithOfficeParser(readFileSync(outputPath), useOcr);
  } finally {
    // Best-effort cleanup: either file may not exist if an earlier step failed.
    try { unlinkSync(inputPath); } catch { /* ignore */ }
    try { unlinkSync(outputPath); } catch { /* ignore */ }
  }
}

/**
 * Turns an uploaded file into either pass-through bytes or extracted text.
 *
 * Handlers are tried in this order: OCR for images (only when
 * `options.useOcr` is set), native pass-through, officeparser formats,
 * legacy `.doc`, legacy `.xls`, legacy `.ppt`, and finally plain UTF-8
 * decoding for the remaining textual types.
 *
 * @param buffer - File contents.
 * @param mimeType - Must match an entry of `SUPPORTED_TYPES` exactly; run
 *   `normalizeMimeType` first for loosely labelled uploads.
 * @param options - See `ProcessOptions`.
 * @returns The processed document.
 * @throws Error when `mimeType` is unsupported or no handler matches it, and
 *   any error raised by the underlying extractor.
 */
export async function processDocument(
  buffer: Buffer,
  mimeType: string,
  options: ProcessOptions = {}
): Promise<ProcessedDocument> {
  if (!isSupportedType(mimeType)) {
    throw new Error(`Unsupported file type: ${mimeType}`);
  }

  const asText = (text: string): ProcessedDocument => ({
    type: 'text',
    content: text,
    mimeType: 'text/plain',
  });

  // Images with OCR requested - extract text. Checked before the native
  // branch because these image types are also Gemini-native.
  if (isOcrCapable(mimeType) && options.useOcr) {
    return asText(await extractWithOfficeParser(buffer, true));
  }

  // Gemini-native types (including images without OCR) - pass through
  if (isGeminiNative(mimeType)) {
    return { type: 'native', content: buffer, mimeType };
  }

  // Office documents - extract text, OCR extracts text from embedded images
  if (needsOfficeParser(mimeType)) {
    return asText(await extractWithOfficeParser(buffer, options.useOcr));
  }

  // Legacy .doc files (Word 97-2003)
  if (needsWordExtractor(mimeType)) {
    return asText(await extractWithWordExtractor(buffer));
  }

  // Legacy .xls files (Excel 97-2003)
  if (needsSheetJS(mimeType)) {
    return asText(extractWithSheetJS(buffer));
  }

  // Legacy .ppt files (PowerPoint 97-2003) - use LibreOffice for conversion
  if (needsSheetJSPPT(mimeType)) {
    return asText(await extractWithLibreOffice(buffer, '.ppt', options.useOcr));
  }

  if (needsPlainTextConversion(mimeType)) {
    return asText(buffer.toString('utf-8'));
  }

  throw new Error(`No extraction handler for: ${mimeType}`);
}