// kaboot/server/src/services/documentParser.ts
//
// 200 lines
// 5.3 KiB
// TypeScript

import officeParser from 'officeparser';
/**
 * MIME types that `processDocument` passes through unchanged as a `native`
 * payload (the original buffer plus its MIME type) instead of extracting text.
 */
export const GEMINI_NATIVE_TYPES: string[] = [
  // Documents and text-like formats
  'application/pdf',
  'text/plain',
  'text/markdown',
  'text/csv',
  'text/html',
  'text/css',
  'text/javascript',

  // Images
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
  'image/heic',
  'image/heif',
];
/**
 * Textual MIME types that are not passed through natively: `processDocument`
 * decodes the buffer as UTF-8 and returns it as `text/plain`.
 */
export const TYPES_TO_CONVERT_TO_PLAIN_TEXT: string[] = [
  // Structured data / markup
  'text/xml',
  'application/xml',
  'application/json',

  // Source code
  'application/javascript',
  'text/x-python',
  'text/x-java-source',
  'text/x-c',
  'text/x-c++',
  'text/x-typescript',

  // YAML
  'text/yaml',
  'application/x-yaml',
];
/**
 * MIME types that `processDocument` routes through officeparser for text
 * extraction. The file extension each entry corresponds to is noted alongside.
 */
export const OFFICEPARSER_TYPES: string[] = [
  // Office Open XML
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
  'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx

  // Legacy Microsoft Office
  'application/msword', // .doc
  'application/vnd.ms-powerpoint', // .ppt
  'application/vnd.ms-excel', // .xls

  // OpenDocument
  'application/vnd.oasis.opendocument.text', // .odt
  'application/vnd.oasis.opendocument.presentation', // .odp
  'application/vnd.oasis.opendocument.spreadsheet', // .ods

  // Rich Text Format
  'application/rtf', // .rtf
];
/**
 * Image MIME types for which `processDocument` runs OCR-based text extraction
 * when the caller sets `useOcr`; without that flag they are passed through
 * natively.
 */
export const OCR_CAPABLE_TYPES: string[] = [
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
];
/**
 * Every MIME type `processDocument` accepts: the native pass-through types,
 * then the officeparser types, then the plain-text conversion types.
 */
export const SUPPORTED_TYPES: string[] = GEMINI_NATIVE_TYPES.concat(
  OFFICEPARSER_TYPES,
  TYPES_TO_CONVERT_TO_PLAIN_TEXT,
);
/**
 * Fallback lookup from file extension to MIME type, consulted by
 * `normalizeMimeType` when an upload is reported as `application/octet-stream`.
 * Keys are lower-case and include the leading dot.
 */
const EXTENSION_TO_MIME: Record<string, string> = {
  // Plain text, markup and data
  '.txt': 'text/plain',
  '.md': 'text/markdown',
  '.markdown': 'text/markdown',
  '.csv': 'text/csv',
  '.html': 'text/html',
  '.htm': 'text/html',
  '.xml': 'application/xml',
  '.json': 'application/json',

  // JavaScript / TypeScript
  '.js': 'text/javascript',
  '.mjs': 'text/javascript',
  '.ts': 'text/x-typescript',
  '.tsx': 'text/x-typescript',
  '.jsx': 'text/javascript',

  // Other source code
  '.py': 'text/x-python',
  '.java': 'text/x-java-source',
  '.c': 'text/x-c',
  '.h': 'text/x-c',
  '.cpp': 'text/x-c++',
  '.hpp': 'text/x-c++',
  '.cc': 'text/x-c++',

  // Stylesheets and configuration
  '.css': 'text/css',
  '.yaml': 'text/yaml',
  '.yml': 'text/yaml',
};
/**
 * Maps the MIME type reported for an upload onto one this module can handle.
 *
 * Matching is done on the bare media type: parameters such as
 * `; charset=utf-8` are stripped and the value is lower-cased, because media
 * types are case-insensitive and may carry parameters.
 *
 * Resolution order:
 *  1. A supported type is returned in its bare, lower-cased form.
 *  2. `application/octet-stream` with a filename resolves through the
 *     file-extension table, when the extension is known.
 *  3. Any other `text/*` subtype is treated as `text/plain`.
 *  4. Anything else is returned exactly as received, so callers can report
 *     the value the client actually sent.
 *
 * @param mimeType - MIME type as reported by the client or upload layer.
 * @param filename - Optional original filename, used only for the
 *   `application/octet-stream` fallback.
 * @returns A supported MIME type when one can be determined, otherwise the
 *   unmodified `mimeType`.
 */
export function normalizeMimeType(mimeType: string, filename?: string): string {
  // "Text/CSV; charset=utf-8" -> "text/csv"
  const essence = (mimeType.split(';')[0] ?? '').trim().toLowerCase();

  if (SUPPORTED_TYPES.includes(essence)) {
    return essence;
  }

  if (essence === 'application/octet-stream' && filename) {
    // Last dot-delimited segment, including the dot (e.g. ".md").
    const ext = filename.toLowerCase().match(/\.[^.]+$/)?.[0];
    const mapped = ext === undefined ? undefined : EXTENSION_TO_MIME[ext];
    if (mapped) {
      return mapped;
    }
  }

  // Supported text types were returned above, so any text/* reaching this
  // point is an unrecognised subtype.
  if (essence.startsWith('text/')) {
    return 'text/plain';
  }

  return mimeType;
}
/** Result of `processDocument`. */
export interface ProcessedDocument {
  /**
   * `'native'` when `content` is the original buffer passed through untouched;
   * `'text'` when `content` is text extracted or decoded from the document.
   */
  type: 'native' | 'text';

  /** The original bytes for `'native'` results, or the text for `'text'` results. */
  content: string | Buffer;

  /** MIME type describing `content`; `processDocument` sets `text/plain` on every `'text'` result. */
  mimeType: string;
}
/** Optional switches accepted by `processDocument`. */
export interface ProcessOptions {
  /**
   * When true, OCR-capable images are converted to text instead of being
   * passed through, and office documents are parsed with OCR enabled.
   */
  useOcr?: boolean;
}
/**
 * Reports whether `mimeType` exactly matches an entry of `SUPPORTED_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is, no normalisation).
 */
export function isSupportedType(mimeType: string): boolean {
  return SUPPORTED_TYPES.some((known) => known === mimeType);
}
/**
 * Reports whether `mimeType` exactly matches an entry of `GEMINI_NATIVE_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is, no normalisation).
 */
export function isGeminiNative(mimeType: string): boolean {
  return GEMINI_NATIVE_TYPES.some((known) => known === mimeType);
}
/**
 * Reports whether `mimeType` exactly matches an entry of `OCR_CAPABLE_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is, no normalisation).
 */
export function isOcrCapable(mimeType: string): boolean {
  return OCR_CAPABLE_TYPES.some((known) => known === mimeType);
}
/**
 * Reports whether `mimeType` exactly matches an entry of `OFFICEPARSER_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is, no normalisation).
 */
export function needsOfficeParser(mimeType: string): boolean {
  return OFFICEPARSER_TYPES.some((known) => known === mimeType);
}
/**
 * Reports whether `mimeType` exactly matches an entry of
 * `TYPES_TO_CONVERT_TO_PLAIN_TEXT`.
 *
 * @param mimeType - MIME type to look up (compared as-is, no normalisation).
 */
export function needsPlainTextConversion(mimeType: string): boolean {
  return TYPES_TO_CONVERT_TO_PLAIN_TEXT.some((known) => known === mimeType);
}
/**
 * Parses `buffer` with officeparser and returns the extracted text.
 *
 * When `useOcr` is set, the parser is asked to extract attachments and OCR
 * them (language `eng`); every attachment that produced OCR text has that text
 * appended after the main text, one newline-separated entry per attachment.
 *
 * @param buffer - Raw document bytes.
 * @param useOcr - Enables attachment extraction and OCR. Defaults to `false`.
 * @returns The document text, followed by any attachment OCR text.
 */
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
  const ocrConfig = {
    extractAttachments: true,
    ocr: true,
    ocrLanguage: 'eng'
  };
  const ast = await officeParser.parseOffice(buffer, useOcr ? ocrConfig : {});

  const pieces = [ast.toText()];
  if (useOcr && ast.attachments) {
    for (const attachment of ast.attachments) {
      // Skip attachments for which OCR yielded nothing.
      if (attachment.ocrText) {
        pieces.push(attachment.ocrText);
      }
    }
  }
  return pieces.join('\n');
}
/**
 * Converts an uploaded document into either a native pass-through payload or
 * extracted plain text, depending on its MIME type.
 *
 * Handling, in order:
 *  1. OCR-capable image with `options.useOcr` set: text is extracted.
 *  2. Gemini-native type (including images without OCR): the buffer is
 *     returned unchanged with its MIME type.
 *  3. Office document: text is extracted via officeparser, with OCR of
 *     embedded images when `options.useOcr` is set.
 *  4. Plain-text-convertible type: the buffer is decoded as UTF-8.
 *
 * @param buffer - Raw file bytes.
 * @param mimeType - MIME type of the file; compared as-is against the
 *   supported lists.
 * @param options - Optional processing switches.
 * @returns The processed document.
 * @throws Error if `mimeType` is not supported, or if a supported type has no
 *   extraction handler.
 */
export async function processDocument(
  buffer: Buffer,
  mimeType: string,
  options: ProcessOptions = {}
): Promise<ProcessedDocument> {
  if (!isSupportedType(mimeType)) {
    throw new Error(`Unsupported file type: ${mimeType}`);
  }

  // Every text-producing branch returns the same result shape.
  const asPlainText = (content: string): ProcessedDocument => ({
    type: 'text',
    content,
    mimeType: 'text/plain'
  });

  // Images with OCR requested - extract text.
  // NOTE(review): this hands raw image bytes to officeparser; confirm the
  // installed version accepts standalone images.
  if (isOcrCapable(mimeType) && options.useOcr) {
    return asPlainText(await extractWithOfficeParser(buffer, true));
  }

  // Gemini-native types (including images without OCR) - pass through.
  if (isGeminiNative(mimeType)) {
    return { type: 'native', content: buffer, mimeType };
  }

  // Office documents - extract text; OCR also covers embedded images.
  if (needsOfficeParser(mimeType)) {
    return asPlainText(await extractWithOfficeParser(buffer, options.useOcr));
  }

  // Remaining textual formats are simply decoded.
  if (needsPlainTextConversion(mimeType)) {
    return asPlainText(buffer.toString('utf-8'));
  }

  // Defensive: every list that makes up SUPPORTED_TYPES is handled above.
  throw new Error(`No extraction handler for: ${mimeType}`);
}