// File viewer metadata (not part of the module): 346 lines, 9.7 KiB, TypeScript.
import officeParser from 'officeparser';
import WordExtractor from 'word-extractor';
import * as XLSX from 'xlsx';
import { execFile, execSync } from 'child_process';
import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
import { readFile, unlink, writeFile } from 'fs/promises';
import { tmpdir } from 'os';
import { join } from 'path';
import { randomUUID } from 'crypto';
import { promisify } from 'util';

/**
 * MIME types that `processDocument` returns untouched as `native` content.
 *
 * The one exception is an OCR-capable image processed with `useOcr` set, which
 * is converted to text instead of being passed through.
 */
export const GEMINI_NATIVE_TYPES = [
  // Documents and text
  'application/pdf',
  'text/plain',
  'text/markdown',
  'text/csv',
  'text/html',
  'text/css',
  'text/javascript',
  // Images
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
  'image/heic',
  'image/heif',
];

/**
 * Textual MIME types (markup, data and source code) that `processDocument`
 * decodes as UTF-8 and returns as `text/plain` rather than passing through.
 */
export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [
  'text/xml',
  'application/xml',
  'application/json',
  'application/javascript',
  'text/x-python',
  'text/x-java-source',
  'text/x-c',
  'text/x-c++',
  'text/x-typescript',
  'text/yaml',
  'application/x-yaml',
];

/**
 * MIME types whose text is extracted with officeparser (modern Office,
 * OpenDocument and RTF formats).
 */
export const OFFICEPARSER_TYPES = [
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
  'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx
  'application/vnd.oasis.opendocument.text', // .odt
  'application/vnd.oasis.opendocument.presentation', // .odp
  'application/vnd.oasis.opendocument.spreadsheet', // .ods
  'application/rtf', // .rtf
];

// Legacy (Office 97-2003) binary formats; each one needs its own extraction path.
export const LEGACY_DOC_TYPE = 'application/msword'; // .doc (word-extractor)
export const LEGACY_XLS_TYPE = 'application/vnd.ms-excel'; // .xls (xlsx/SheetJS)
export const LEGACY_PPT_TYPE = 'application/vnd.ms-powerpoint'; // .ppt (converted to .pptx with LibreOffice)

/**
 * Image MIME types that can go through OCR. When `useOcr` is set,
 * `processDocument` extracts text from these instead of passing them through.
 */
export const OCR_CAPABLE_TYPES = [
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
];

/**
 * Every MIME type `processDocument` accepts: the union of the pass-through,
 * officeparser, plain-text and legacy Office lists.
 */
export const SUPPORTED_TYPES = [
  ...GEMINI_NATIVE_TYPES,
  ...OFFICEPARSER_TYPES,
  ...TYPES_TO_CONVERT_TO_PLAIN_TEXT,
  LEGACY_DOC_TYPE,
  LEGACY_XLS_TYPE,
  LEGACY_PPT_TYPE,
];

/**
 * Fallback lookup from file extension to MIME type, used by
 * `normalizeMimeType` when the reported type is too generic to act on.
 * Keys are lowercase and include the leading dot.
 */
const EXTENSION_TO_MIME: Record<string, string> = {
  // Plain text and markup
  '.txt': 'text/plain',
  '.md': 'text/markdown',
  '.markdown': 'text/markdown',
  '.csv': 'text/csv',
  '.html': 'text/html',
  '.htm': 'text/html',
  '.xml': 'application/xml',
  '.json': 'application/json',
  // Source code
  '.js': 'text/javascript',
  '.mjs': 'text/javascript',
  '.ts': 'text/x-typescript',
  '.tsx': 'text/x-typescript',
  '.jsx': 'text/javascript',
  '.py': 'text/x-python',
  '.java': 'text/x-java-source',
  '.c': 'text/x-c',
  '.h': 'text/x-c',
  '.cpp': 'text/x-c++',
  '.hpp': 'text/x-c++',
  '.cc': 'text/x-c++',
  '.css': 'text/css',
  // Configuration
  '.yaml': 'text/yaml',
  '.yml': 'text/yaml',
};

/**
 * Maps a reported MIME type onto one this module can process, where possible.
 *
 * Resolution order:
 * 1. A type already in `SUPPORTED_TYPES` is returned unchanged.
 * 2. Parameters and casing are stripped (`Application/JSON; charset=utf-8`
 *    becomes `application/json`); if that bare type is supported it is returned.
 * 3. A generic `application/octet-stream` or empty type falls back to the
 *    filename extension via `EXTENSION_TO_MIME`.
 * 4. Any other `text/*` type is treated as `text/plain`.
 * 5. Anything else is returned exactly as given, so callers can report it.
 *
 * @param mimeType - MIME type as reported by the uploader or client.
 * @param filename - Optional original filename, used only for step 3.
 * @returns The normalized MIME type, or the input when nothing matched.
 */
export function normalizeMimeType(mimeType: string, filename?: string): string {
  if (SUPPORTED_TYPES.includes(mimeType)) {
    return mimeType;
  }

  // MIME type names are case-insensitive and may carry parameters after ';'.
  const [rawType = ''] = mimeType.split(';');
  const bareType = rawType.trim().toLowerCase();

  if (SUPPORTED_TYPES.includes(bareType)) {
    return bareType;
  }

  // Browsers report '' when they cannot determine a file's type, so treat it
  // like the generic octet-stream type and consult the extension.
  const isGeneric = bareType === 'application/octet-stream' || bareType === '';
  if (isGeneric && filename) {
    const ext = filename.toLowerCase().match(/\.[^.]+$/)?.[0];
    const fromExtension = ext ? EXTENSION_TO_MIME[ext] : undefined;
    if (fromExtension) {
      return fromExtension;
    }
  }

  if (bareType.startsWith('text/')) {
    return 'text/plain';
  }

  return mimeType;
}

/** Result of `processDocument`. */
export interface ProcessedDocument {
  /** `native` when the input is passed through as-is, `text` when text was extracted. */
  type: 'native' | 'text';
  /** The original buffer for `native` results, the extracted string for `text` results. */
  content: Buffer | string;
  /** MIME type describing `content`; `text/plain` for extracted text. */
  mimeType: string;
}

/** Options accepted by `processDocument`. */
export interface ProcessOptions {
  /**
   * When true, OCR-capable images are converted to text instead of being
   * passed through, and OCR is enabled for Office document extraction.
   */
  useOcr?: boolean;
}

/** Returns true when `mimeType` is listed in `SUPPORTED_TYPES` (exact match). */
export function isSupportedType(mimeType: string): boolean {
  return SUPPORTED_TYPES.includes(mimeType);
}

/** Returns true when `mimeType` is listed in `GEMINI_NATIVE_TYPES` (exact match). */
export function isGeminiNative(mimeType: string): boolean {
  return GEMINI_NATIVE_TYPES.includes(mimeType);
}

/** Returns true when `mimeType` is an image type listed in `OCR_CAPABLE_TYPES`. */
export function isOcrCapable(mimeType: string): boolean {
  return OCR_CAPABLE_TYPES.includes(mimeType);
}

/** Returns true when `mimeType` is listed in `OFFICEPARSER_TYPES`. */
export function needsOfficeParser(mimeType: string): boolean {
  return OFFICEPARSER_TYPES.includes(mimeType);
}

/** Returns true when `mimeType` is listed in `TYPES_TO_CONVERT_TO_PLAIN_TEXT`. */
export function needsPlainTextConversion(mimeType: string): boolean {
  return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType);
}

/** Returns true for the legacy `.doc` MIME type (`LEGACY_DOC_TYPE`). */
export function needsWordExtractor(mimeType: string): boolean {
  return mimeType === LEGACY_DOC_TYPE;
}

/** Returns true for the legacy `.xls` MIME type (`LEGACY_XLS_TYPE`). */
export function needsSheetJS(mimeType: string): boolean {
  return mimeType === LEGACY_XLS_TYPE;
}

/**
 * Returns true for the legacy `.ppt` MIME type (`LEGACY_PPT_TYPE`).
 *
 * Despite the name, `.ppt` files are not read with SheetJS: `processDocument`
 * sends them through `extractWithLibreOffice`. The name is kept so existing
 * callers keep working.
 */
export function needsSheetJSPPT(mimeType: string): boolean {
  return mimeType === LEGACY_PPT_TYPE;
}

/**
 * Extracts text from a document buffer with officeparser.
 *
 * @param buffer - Raw file contents.
 * @param useOcr - When true, attachments are extracted and OCR'd (English),
 *   and any OCR text found on them is appended after the main text.
 * @returns The document text, followed by one line-separated entry per
 *   attachment that produced OCR text.
 */
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
  const config = useOcr
    ? {
        extractAttachments: true,
        ocr: true,
        ocrLanguage: 'eng',
      }
    : {};

  const ast = await officeParser.parseOffice(buffer, config);
  let text = ast.toText();

  // OCR output lives on the attachments, not in the main text.
  if (useOcr && ast.attachments) {
    for (const attachment of ast.attachments) {
      if (attachment.ocrText) {
        text += '\n' + attachment.ocrText;
      }
    }
  }

  return text;
}

/**
 * Extracts the body text of a legacy `.doc` file with word-extractor.
 *
 * @param buffer - Raw `.doc` file contents.
 * @returns The text returned by the parsed document's `getBody()`.
 */
async function extractWithWordExtractor(buffer: Buffer): Promise<string> {
  const extractor = new WordExtractor();
  const doc = await extractor.extract(buffer);
  return doc.getBody();
}

/**
 * Renders every non-empty sheet of a workbook as CSV with SheetJS.
 *
 * @param buffer - Raw spreadsheet file contents.
 * @returns One `--- <sheet name> ---` header plus CSV per sheet, separated by
 *   blank lines; an empty string when no sheet has content.
 */
function extractWithSheetJS(buffer: Buffer): string {
  const workbook = XLSX.read(buffer, { type: 'buffer' });
  const textParts: string[] = [];

  for (const sheetName of workbook.SheetNames) {
    const sheet = workbook.Sheets[sheetName];
    const csv = XLSX.utils.sheet_to_csv(sheet);
    // Skip sheets whose CSV is only whitespace.
    if (csv.trim()) {
      textParts.push(`--- ${sheetName} ---\n${csv}`);
    }
  }

  return textParts.join('\n\n');
}

// Maps a legacy extension (with leading dot) to the modern format name
// (without dot) passed to LibreOffice's --convert-to option.
const LEGACY_TO_MODERN: Record<string, string> = {
  '.ppt': 'pptx',
  '.doc': 'docx',
  '.xls': 'xlsx',
};

/**
 * Extracts text from a legacy Office file by converting it to its modern
 * format with LibreOffice and parsing the result with `extractWithOfficeParser`.
 *
 * LibreOffice is started with `execFile`, so paths are passed as discrete
 * arguments without a shell, and the event loop stays free during conversion.
 *
 * @param buffer - Raw contents of the legacy file.
 * @param extension - Source extension including the leading dot (e.g. `.ppt`).
 *   It selects the target format from `LEGACY_TO_MODERN`; unknown extensions
 *   are converted to `pdf`.
 * @param useOcr - Forwarded to `extractWithOfficeParser` for the converted file.
 * @returns The text extracted from the converted document.
 * @throws Error when LibreOffice is not installed, when the conversion fails
 *   or exceeds the 60 second timeout, or when no output file is produced.
 */
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
  const tempId = randomUUID();
  const tempDir = tmpdir();
  const modernExt = LEGACY_TO_MODERN[extension] ?? 'pdf';
  const inputPath = join(tempDir, `input-${tempId}${extension}`);
  // Expected output: same base name as the input, with the target extension.
  const outputPath = join(tempDir, `input-${tempId}.${modernExt}`);

  try {
    await writeFile(inputPath, buffer);

    try {
      await promisify(execFile)(
        'libreoffice',
        ['--headless', '--convert-to', modernExt, '--outdir', tempDir, inputPath],
        { timeout: 60000 }
      );
    } catch (execError: unknown) {
      const message = execError instanceof Error ? execError.message : String(execError);
      const code = (execError as { code?: unknown } | null)?.code;
      // ENOENT means the libreoffice binary could not be spawned at all.
      if (code === 'ENOENT' || message.includes('not found')) {
        throw new Error(
          `Legacy ${extension} files require LibreOffice for text extraction. ` +
            `Please convert to .${modernExt} format or ensure LibreOffice is installed.`
        );
      }
      throw new Error(`LibreOffice conversion failed: ${message}`);
    }

    if (!existsSync(outputPath)) {
      throw new Error('LibreOffice conversion produced no output');
    }

    const convertedBuffer = await readFile(outputPath);
    return await extractWithOfficeParser(convertedBuffer, useOcr);
  } finally {
    // Best-effort cleanup: either file may be missing, and a cleanup failure
    // must not hide the real result or error.
    await Promise.allSettled([unlink(inputPath), unlink(outputPath)]);
  }
}

/**
 * Turns an uploaded file into either pass-through native content or plain text.
 *
 * Handlers are tried in a fixed order, and the first match wins:
 * 1. OCR-capable image with `useOcr` set → text via `extractWithOfficeParser`.
 * 2. Native type → original buffer and MIME type returned unchanged.
 * 3. Modern Office / OpenDocument / RTF → text via officeparser.
 * 4. Legacy `.doc` → text via word-extractor.
 * 5. Legacy `.xls` → CSV text via SheetJS.
 * 6. Legacy `.ppt` → LibreOffice conversion, then officeparser.
 * 7. Other textual types → buffer decoded as UTF-8.
 *
 * @param buffer - Raw file contents.
 * @param mimeType - MIME type of the file; must be one of `SUPPORTED_TYPES`.
 * @param options - Processing options; see `ProcessOptions`.
 * @returns The processed document.
 * @throws Error when `mimeType` is unsupported, when a supported type has no
 *   handler, or when the selected extractor fails.
 */
export async function processDocument(
  buffer: Buffer,
  mimeType: string,
  options: ProcessOptions = {}
): Promise<ProcessedDocument> {
  if (!isSupportedType(mimeType)) {
    throw new Error(`Unsupported file type: ${mimeType}`);
  }

  // Every extraction path produces the same result shape.
  const asPlainText = (content: string): ProcessedDocument => ({
    type: 'text',
    content,
    mimeType: 'text/plain',
  });

  // Images with OCR requested - extract text. This must run before the native
  // check, because these image types are also native types.
  if (isOcrCapable(mimeType) && options.useOcr) {
    return asPlainText(await extractWithOfficeParser(buffer, true));
  }

  // Native types (including images without OCR) - pass through.
  if (isGeminiNative(mimeType)) {
    return {
      type: 'native',
      content: buffer,
      mimeType,
    };
  }

  // Office documents - extract text; OCR also covers embedded images.
  if (needsOfficeParser(mimeType)) {
    return asPlainText(await extractWithOfficeParser(buffer, options.useOcr));
  }

  // Legacy .doc files (Word 97-2003).
  if (needsWordExtractor(mimeType)) {
    return asPlainText(await extractWithWordExtractor(buffer));
  }

  // Legacy .xls files (Excel 97-2003).
  if (needsSheetJS(mimeType)) {
    return asPlainText(extractWithSheetJS(buffer));
  }

  // Legacy .ppt files (PowerPoint 97-2003) - converted with LibreOffice.
  if (needsSheetJSPPT(mimeType)) {
    return asPlainText(await extractWithLibreOffice(buffer, '.ppt', options.useOcr));
  }

  // Remaining textual formats are decoded as UTF-8.
  if (needsPlainTextConversion(mimeType)) {
    return asPlainText(buffer.toString('utf-8'));
  }

  throw new Error(`No extraction handler for: ${mimeType}`);
}