// kaboot/server/src/services/documentParser.ts

import officeParser from 'officeparser';

/**
 * MIME types treated as Gemini-native. `isGeminiNative` tests membership in
 * this list, and `processDocument` returns such files as their original bytes
 * (unless OCR was requested for an OCR-capable image).
 */
export const GEMINI_NATIVE_TYPES: string[] = [
  // Documents
  'application/pdf',
  // Text-based formats
  'text/plain', 'text/markdown', 'text/csv',
  'text/html', 'text/css', 'text/javascript',
  // Images
  'image/jpeg', 'image/png', 'image/gif',
  'image/webp', 'image/heic', 'image/heif',
];

/**
 * Text-based MIME types that `processDocument` decodes as UTF-8 and returns
 * relabelled as `text/plain` (`needsPlainTextConversion` tests membership here).
 */
export const TYPES_TO_CONVERT_TO_PLAIN_TEXT: string[] = [
  // Structured data
  'text/xml', 'application/xml', 'application/json',
  // Source code
  'application/javascript', 'text/x-python', 'text/x-java-source',
  'text/x-c', 'text/x-c++', 'text/x-typescript',
  // YAML
  'text/yaml', 'application/x-yaml',
];

/**
 * MIME types that officeparser can extract text from
 * (`needsOfficeParser` tests membership in this list).
 */
export const OFFICEPARSER_TYPES: string[] = [
  // Office Open XML: .docx, .pptx, .xlsx
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  // Legacy Microsoft Office: .doc, .ppt, .xls
  'application/msword',
  'application/vnd.ms-powerpoint',
  'application/vnd.ms-excel',
  // OpenDocument: .odt, .odp, .ods
  'application/vnd.oasis.opendocument.text',
  'application/vnd.oasis.opendocument.presentation',
  'application/vnd.oasis.opendocument.spreadsheet',
  // Rich Text Format: .rtf
  'application/rtf',
];

/**
 * Image MIME types for which `processDocument` extracts text via OCR when the
 * `useOcr` option is set (`isOcrCapable` tests membership in this list).
 */
export const OCR_CAPABLE_TYPES: string[] = ['image/jpeg', 'image/png', 'image/gif', 'image/webp'];

/**
 * Every MIME type `processDocument` accepts: the Gemini-native types, then the
 * officeparser types, then the types converted to plain text.
 */
export const SUPPORTED_TYPES: string[] = GEMINI_NATIVE_TYPES.concat(
  OFFICEPARSER_TYPES,
  TYPES_TO_CONVERT_TO_PLAIN_TEXT,
);

/**
 * Fallback table consulted by `normalizeMimeType` to infer a MIME type from a
 * file extension when the reported type is uninformative. Keys are lower-case
 * and include the leading dot.
 */
const EXTENSION_TO_MIME: Record<string, string> = {
  // Plain text and markup
  '.txt': 'text/plain',
  '.md': 'text/markdown', '.markdown': 'text/markdown',
  '.csv': 'text/csv',
  '.html': 'text/html', '.htm': 'text/html',
  '.css': 'text/css',

  // Structured data
  '.xml': 'application/xml',
  '.json': 'application/json',
  '.yaml': 'text/yaml', '.yml': 'text/yaml',

  // JavaScript / TypeScript
  '.js': 'text/javascript', '.mjs': 'text/javascript', '.jsx': 'text/javascript',
  '.ts': 'text/x-typescript', '.tsx': 'text/x-typescript',

  // Other source code
  '.py': 'text/x-python',
  '.java': 'text/x-java-source',
  '.c': 'text/x-c', '.h': 'text/x-c',
  '.cpp': 'text/x-c++', '.hpp': 'text/x-c++', '.cc': 'text/x-c++',
};

/**
 * Maps a reported MIME type onto one of the types this module can process.
 *
 * Matching is done on the bare `type/subtype`: surrounding whitespace, letter
 * case and any parameters (e.g. `; charset=utf-8`) are ignored, because media
 * types are case-insensitive and parameters do not change which handler applies.
 *
 * Resolution order:
 *  1. A supported type is returned in its canonical lower-case, parameter-free form.
 *  2. If the type is missing or `application/octet-stream` and a filename is
 *     given, the type is inferred from the filename's extension when known.
 *  3. Any other `text/*` type falls back to `text/plain`.
 *  4. Otherwise the input is returned unchanged.
 *
 * @param mimeType - MIME type as reported for the file.
 * @param filename - Optional original filename, used only for extension-based inference.
 * @returns The normalized MIME type, or `mimeType` itself when nothing matched.
 */
export function normalizeMimeType(mimeType: string, filename?: string): string {
  // Keep only the part before the first ';' so parameters never defeat a match.
  const bareType = (mimeType.split(';', 1)[0] ?? '').trim().toLowerCase();

  if (SUPPORTED_TYPES.includes(bareType)) {
    return bareType;
  }

  // An empty type carries as little information as the generic binary type,
  // so both are allowed to fall back to the file extension.
  const isUninformative = bareType === '' || bareType === 'application/octet-stream';
  if (isUninformative && filename) {
    const extension = filename.toLowerCase().match(/\.[^.]+$/)?.[0];
    const inferred = extension ? EXTENSION_TO_MIME[extension] : undefined;
    if (inferred) {
      return inferred;
    }
  }

  // Supported text types were already returned above, so anything left under
  // text/* is an unrecognized flavour that can be handled as plain text.
  if (bareType.startsWith('text/')) {
    return 'text/plain';
  }

  return mimeType;
}

/** Result returned by `processDocument`. */
export interface ProcessedDocument {
  /** `'native'` when the original bytes are passed through; `'text'` when text was extracted or decoded. */
  type: 'native' | 'text';
  /** The original `Buffer` for `'native'` results, or the extracted/decoded string for `'text'` results. */
  content: Buffer | string;
  /** MIME type describing `content`: the input type for `'native'` results, `'text/plain'` for `'text'` results. */
  mimeType: string;
}

/** Optional settings accepted by `processDocument`. */
export interface ProcessOptions {
  /**
   * When true, OCR-capable images are converted to text instead of being
   * passed through, and office documents are parsed with OCR enabled.
   * Treated as disabled when omitted.
   */
  useOcr?: boolean;
}

/**
 * Tells whether `mimeType` exactly matches an entry of `SUPPORTED_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is).
 * @returns `true` if the type is listed, `false` otherwise.
 */
export function isSupportedType(mimeType: string): boolean {
  return SUPPORTED_TYPES.some((candidate) => candidate === mimeType);
}

/**
 * Tells whether `mimeType` exactly matches an entry of `GEMINI_NATIVE_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is).
 * @returns `true` if the type is listed, `false` otherwise.
 */
export function isGeminiNative(mimeType: string): boolean {
  return GEMINI_NATIVE_TYPES.some((candidate) => candidate === mimeType);
}

/**
 * Tells whether `mimeType` exactly matches an entry of `OCR_CAPABLE_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is).
 * @returns `true` if the type is listed, `false` otherwise.
 */
export function isOcrCapable(mimeType: string): boolean {
  return OCR_CAPABLE_TYPES.some((candidate) => candidate === mimeType);
}

/**
 * Tells whether `mimeType` exactly matches an entry of `OFFICEPARSER_TYPES`.
 *
 * @param mimeType - MIME type to look up (compared as-is).
 * @returns `true` if the type is listed, `false` otherwise.
 */
export function needsOfficeParser(mimeType: string): boolean {
  return OFFICEPARSER_TYPES.some((candidate) => candidate === mimeType);
}

/**
 * Tells whether `mimeType` exactly matches an entry of
 * `TYPES_TO_CONVERT_TO_PLAIN_TEXT`.
 *
 * @param mimeType - MIME type to look up (compared as-is).
 * @returns `true` if the type is listed, `false` otherwise.
 */
export function needsPlainTextConversion(mimeType: string): boolean {
  return TYPES_TO_CONVERT_TO_PLAIN_TEXT.some((candidate) => candidate === mimeType);
}

/**
 * Parses `buffer` with officeparser and returns the extracted text.
 *
 * With `useOcr` enabled the parser is configured to extract attachments and
 * OCR them (language `eng`); each attachment that reports OCR text then has
 * that text appended to the document text on its own line.
 *
 * NOTE(review): whether `toText()` already contains the attachments' OCR text
 * is not visible from this file — confirm against officeparser to rule out
 * duplicated output.
 *
 * @param buffer - Raw file contents.
 * @param useOcr - Enables attachment extraction and OCR; defaults to `false`.
 * @returns The document text, followed by any attachment OCR text.
 */
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
  const parserConfig = useOcr
    ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' }
    : {};
  const parsed = await officeParser.parseOffice(buffer, parserConfig);

  const documentText = parsed.toText();
  if (!useOcr || !parsed.attachments) {
    return documentText;
  }

  // Document text first, then one line per attachment that produced OCR output.
  const segments = [documentText];
  for (const { ocrText } of parsed.attachments) {
    if (ocrText) {
      segments.push(ocrText);
    }
  }
  return segments.join('\n');
}

/**
 * Turns an uploaded file into either a native pass-through payload or plain text.
 *
 * Handlers are tried in this order:
 *  1. OCR-capable image with `options.useOcr` set: text extracted with OCR.
 *  2. Gemini-native type: original bytes returned untouched.
 *  3. Office document: text extracted via officeparser (OCR on if requested).
 *  4. Text-convertible type: bytes decoded as UTF-8.
 *
 * @param buffer - Raw file contents.
 * @param mimeType - MIME type of the file; must be one of `SUPPORTED_TYPES`.
 * @param options - Optional processing flags.
 * @returns The processed document, tagged `'native'` or `'text'`.
 * @throws Error if `mimeType` is not supported or no handler applies to it.
 */
export async function processDocument(
  buffer: Buffer,
  mimeType: string,
  options: ProcessOptions = {}
): Promise<ProcessedDocument> {
  if (!isSupportedType(mimeType)) {
    throw new Error(`Unsupported file type: ${mimeType}`);
  }

  // Every text-producing branch reports its result in the same shape.
  const asPlainText = (content: string): ProcessedDocument => ({
    type: 'text',
    content,
    mimeType: 'text/plain',
  });

  // The OCR check must precede the native check: OCR-capable images are also
  // Gemini-native and would otherwise always be passed through.
  // NOTE(review): this hands raw image bytes to officeparser — confirm the
  // library accepts standalone images.
  if (isOcrCapable(mimeType) && options.useOcr) {
    return asPlainText(await extractWithOfficeParser(buffer, true));
  }

  if (isGeminiNative(mimeType)) {
    return { type: 'native', content: buffer, mimeType };
  }

  // For office documents, OCR (when requested) covers embedded images.
  if (needsOfficeParser(mimeType)) {
    return asPlainText(await extractWithOfficeParser(buffer, options.useOcr));
  }

  if (needsPlainTextConversion(mimeType)) {
    return asPlainText(buffer.toString('utf-8'));
  }

  throw new Error(`No extraction handler for: ${mimeType}`);
}