// kaboot/server/src/services/documentParser.ts

import officeParser from 'officeparser';
import WordExtractor from 'word-extractor';
import * as XLSX from 'xlsx';
import { execFileSync, execSync } from 'child_process';
import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';
import { randomUUID } from 'crypto';

/**
 * MIME types that `processDocument` hands back untouched as a `'native'`
 * result (original bytes, original MIME type).
 *
 * The image entries that also appear in `OCR_CAPABLE_TYPES` are only passed
 * through when OCR was not requested; with OCR they are converted to text.
 */
export const GEMINI_NATIVE_TYPES = [
  'application/pdf',
  'text/plain',
  'text/markdown',
  'text/csv',
  'text/html',
  'text/css',
  'text/javascript',
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp',
  'image/heic',
  'image/heif',
];

/**
 * Text-based MIME types (markup, data and source code) that are not passed
 * through natively. `processDocument` decodes their bytes as UTF-8 and returns
 * the result labelled as `text/plain`.
 */
export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [
  'text/xml',
  'application/xml',
  'application/json',
  'application/javascript',
  'text/x-python',
  'text/x-java-source',
  'text/x-c',
  'text/x-c++',
  'text/x-typescript',
  'text/yaml',
  'application/x-yaml',
];

// MIME types whose text is extracted with officeparser: Office Open XML
// (.docx/.pptx/.xlsx), OpenDocument (.odt/.odp/.ods) and RTF.
// `processDocument` routes these to `extractWithOfficeParser`.
export const OFFICEPARSER_TYPES = [
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',   // .docx
  'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',         // .xlsx
  'application/vnd.oasis.opendocument.text',                                   // .odt
  'application/vnd.oasis.opendocument.presentation',                           // .odp
  'application/vnd.oasis.opendocument.spreadsheet',                            // .ods
  'application/rtf',                                                            // .rtf
];

// Legacy (binary, Office 97-2003) formats. Each one needs its own extraction
// path because officeparser is not used on them directly.
export const LEGACY_DOC_TYPE = 'application/msword';                            // .doc (word-extractor)
export const LEGACY_XLS_TYPE = 'application/vnd.ms-excel';                      // .xls (xlsx/SheetJS)
export const LEGACY_PPT_TYPE = 'application/vnd.ms-powerpoint';                 // .ppt (converted with LibreOffice, then officeparser)

// Image types for which `processDocument` extracts text via OCR instead of
// passing the image through, provided the caller set `useOcr`.
// Every entry is also listed in GEMINI_NATIVE_TYPES.
export const OCR_CAPABLE_TYPES = [
  'image/jpeg',
  'image/png',
  'image/gif',
  'image/webp'
];

/**
 * Every MIME type this module accepts: the union of the native, officeparser,
 * plain-text-conversion and legacy Office lists. `processDocument` rejects
 * anything that is not in this list.
 */
export const SUPPORTED_TYPES = [
  ...GEMINI_NATIVE_TYPES,
  ...OFFICEPARSER_TYPES,
  ...TYPES_TO_CONVERT_TO_PLAIN_TEXT,
  LEGACY_DOC_TYPE,
  LEGACY_XLS_TYPE,
  LEGACY_PPT_TYPE,
];

/**
 * Lower-case file extension (including the leading dot) to MIME type.
 *
 * Used as a fallback when an upload arrives without a meaningful MIME type.
 * Every value is a type this module supports, so the table covers not only
 * text and source files but also PDF, Office (modern, OpenDocument, legacy)
 * and image uploads, which would otherwise be rejected when the client labels
 * them `application/octet-stream`.
 */
const EXTENSION_TO_MIME: Record<string, string> = {
  // Plain text, markup and data
  '.txt': 'text/plain',
  '.md': 'text/markdown',
  '.markdown': 'text/markdown',
  '.csv': 'text/csv',
  '.html': 'text/html',
  '.htm': 'text/html',
  '.xml': 'application/xml',
  '.json': 'application/json',
  '.yaml': 'text/yaml',
  '.yml': 'text/yaml',

  // Source code
  '.js': 'text/javascript',
  '.mjs': 'text/javascript',
  '.ts': 'text/x-typescript',
  '.tsx': 'text/x-typescript',
  '.jsx': 'text/javascript',
  '.py': 'text/x-python',
  '.java': 'text/x-java-source',
  '.c': 'text/x-c',
  '.h': 'text/x-c',
  '.cpp': 'text/x-c++',
  '.hpp': 'text/x-c++',
  '.cc': 'text/x-c++',
  '.css': 'text/css',

  // PDF
  '.pdf': 'application/pdf',

  // Office Open XML, OpenDocument and RTF
  '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  '.odt': 'application/vnd.oasis.opendocument.text',
  '.odp': 'application/vnd.oasis.opendocument.presentation',
  '.ods': 'application/vnd.oasis.opendocument.spreadsheet',
  '.rtf': 'application/rtf',

  // Legacy binary Office formats
  '.doc': 'application/msword',
  '.xls': 'application/vnd.ms-excel',
  '.ppt': 'application/vnd.ms-powerpoint',

  // Images
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.png': 'image/png',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
  '.heic': 'image/heic',
  '.heif': 'image/heif',
};

/**
 * Maps the MIME type reported for an upload onto one this module supports,
 * where that is possible.
 *
 * Resolution order:
 * 1. The type itself, compared without parameters (`; charset=...`) and
 *    case-insensitively, if it is a supported type.
 * 2. For `application/octet-stream` or an empty type, the type registered for
 *    the file name's extension, if any.
 * 3. `text/plain` for any other `text/*` type.
 * 4. Otherwise the input, unchanged, so callers can report it as unsupported.
 *
 * @param mimeType - MIME type as reported by the client; may include parameters.
 * @param filename - Original file name, used only for the extension fallback.
 * @returns The normalized MIME type, or `mimeType` unchanged if nothing matched.
 */
export function normalizeMimeType(mimeType: string, filename?: string): string {
  // Media types are case-insensitive and may carry parameters; compare on the
  // bare "type/subtype" part only.
  const essence = (mimeType.split(';', 1)[0] ?? '').trim().toLowerCase();

  if (SUPPORTED_TYPES.includes(essence)) {
    return essence;
  }

  // Generic or missing type: the extension is the only hint available.
  const isGeneric = essence === 'application/octet-stream' || essence === '';
  if (isGeneric && filename) {
    const ext = filename.toLowerCase().match(/\.[^.]+$/)?.[0];
    const byExtension = ext ? EXTENSION_TO_MIME[ext] : undefined;
    if (byExtension) {
      return byExtension;
    }
  }

  // Supported text types were returned above, so this is an unlisted text/* type.
  if (essence.startsWith('text/')) {
    return 'text/plain';
  }

  return mimeType;
}

/** Result returned by `processDocument`. */
export interface ProcessedDocument {
  /** `'native'` when the input bytes are passed through as-is, `'text'` when text was extracted from them. */
  type: 'native' | 'text';
  /** The original Buffer for `'native'` results, the extracted string for `'text'` results. */
  content: Buffer | string;
  /** MIME type describing `content`; `processDocument` sets `text/plain` on every `'text'` result. */
  mimeType: string;
}

/** Optional switches for `processDocument`. */
export interface ProcessOptions {
  /**
   * When true, OCR-capable images are converted to text instead of being
   * passed through, and OCR is enabled for the officeparser and .ppt paths.
   */
  useOcr?: boolean;
}

/** Reports whether `mimeType` is one of the entries of `SUPPORTED_TYPES`. */
export function isSupportedType(mimeType: string): boolean {
  return SUPPORTED_TYPES.some((supported: string) => supported === mimeType);
}

/** Reports whether `mimeType` is one of the entries of `GEMINI_NATIVE_TYPES`. */
export function isGeminiNative(mimeType: string): boolean {
  return GEMINI_NATIVE_TYPES.some((native: string) => native === mimeType);
}

/** Reports whether `mimeType` is one of the image types in `OCR_CAPABLE_TYPES`. */
export function isOcrCapable(mimeType: string): boolean {
  return OCR_CAPABLE_TYPES.some((imageType: string) => imageType === mimeType);
}

/** Reports whether `mimeType` is one of the entries of `OFFICEPARSER_TYPES`. */
export function needsOfficeParser(mimeType: string): boolean {
  return OFFICEPARSER_TYPES.some((officeType: string) => officeType === mimeType);
}

/** Reports whether `mimeType` is one of the entries of `TYPES_TO_CONVERT_TO_PLAIN_TEXT`. */
export function needsPlainTextConversion(mimeType: string): boolean {
  return TYPES_TO_CONVERT_TO_PLAIN_TEXT.some((textType: string) => textType === mimeType);
}

/** Reports whether `mimeType` is the legacy Word type (`LEGACY_DOC_TYPE`). */
export function needsWordExtractor(mimeType: string): boolean {
  const isLegacyDoc = LEGACY_DOC_TYPE === mimeType;
  return isLegacyDoc;
}

/** Reports whether `mimeType` is the legacy Excel type (`LEGACY_XLS_TYPE`). */
export function needsSheetJS(mimeType: string): boolean {
  const isLegacyXls = LEGACY_XLS_TYPE === mimeType;
  return isLegacyXls;
}

/**
 * Reports whether `mimeType` is the legacy PowerPoint type (`LEGACY_PPT_TYPE`).
 *
 * Despite the name, `processDocument` does not use SheetJS for these files;
 * it converts them with LibreOffice.
 */
export function needsSheetJSPPT(mimeType: string): boolean {
  const isLegacyPpt = LEGACY_PPT_TYPE === mimeType;
  return isLegacyPpt;
}

/**
 * Parses an in-memory document with officeparser and returns its text.
 *
 * @param buffer - Raw bytes of the document.
 * @param useOcr - When true, the parser is asked to extract attachments and
 *   OCR them (English), and every non-empty `ocrText` found on the parsed
 *   attachments is appended to the result on its own line.
 * @returns The parser's text, followed by any attachment OCR text.
 */
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
  const parserConfig = !useOcr
    ? {}
    : {
        extractAttachments: true,
        ocr: true,
        ocrLanguage: 'eng'
      };

  const parsed = await officeParser.parseOffice(buffer, parserConfig);
  let combined = parsed.toText();

  // Nothing to append unless OCR ran and the parser reported attachments.
  if (!useOcr || !parsed.attachments) {
    return combined;
  }

  for (const { ocrText } of parsed.attachments) {
    if (ocrText) {
      combined = `${combined}\n${ocrText}`;
    }
  }

  return combined;
}

/**
 * Extracts the body text of a legacy Word document with word-extractor.
 *
 * @param buffer - Raw bytes of the .doc file.
 * @returns The value of the parsed document's `getBody()`.
 */
async function extractWithWordExtractor(buffer: Buffer): Promise<string> {
  const parsedDoc = await new WordExtractor().extract(buffer);
  return parsedDoc.getBody();
}

/**
 * Renders every non-empty sheet of a workbook as CSV text.
 *
 * Each sheet becomes a section headed by `--- <sheet name> ---`; sections are
 * separated by a blank line. Sheets whose CSV is empty or whitespace-only are
 * left out.
 *
 * @param buffer - Raw bytes of the workbook.
 * @returns The concatenated sections, or an empty string if no sheet has content.
 */
function extractWithSheetJS(buffer: Buffer): string {
  const { SheetNames: names, Sheets: sheets } = XLSX.read(buffer, { type: 'buffer' });
  const sections: string[] = [];

  names.forEach((name: string) => {
    const rows = XLSX.utils.sheet_to_csv(sheets[name]);
    if (rows.trim() === '') {
      return;
    }
    sections.push(`--- ${name} ---\n${rows}`);
  });

  return sections.join('\n\n');
}

// Map legacy extensions to their modern equivalents for LibreOffice conversion.
// Keys include the leading dot (as passed to extractWithLibreOffice); values do
// not, because they are used both as the --convert-to target and as the
// extension of the expected output file.
const LEGACY_TO_MODERN: Record<string, string> = {
  '.ppt': 'pptx',
  '.doc': 'docx',
  '.xls': 'xlsx',
};

/**
 * Extracts text from a legacy Office file by converting it with a headless
 * LibreOffice run and parsing the converted file with officeparser.
 *
 * The input is written to a uniquely named file in the OS temp directory and
 * converted next to it. Both files are removed again before the function
 * settles, whether extraction succeeded or not.
 *
 * NOTE: the conversion runs synchronously and blocks the event loop until
 * LibreOffice exits or the 60 s timeout fires.
 *
 * @param buffer - Raw bytes of the legacy document.
 * @param extension - Source extension including the leading dot (e.g. `'.ppt'`).
 *   Extensions missing from `LEGACY_TO_MODERN` are converted to PDF.
 * @param useOcr - When true, officeparser also OCRs extracted attachments and
 *   their text is appended to the result.
 * @returns The text extracted from the converted document.
 * @throws Error if LibreOffice is not installed, the conversion fails or times
 *   out, or no converted file is produced.
 */
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
  const tempId = randomUUID();
  const tempDir = tmpdir();
  const inputPath = join(tempDir, `input-${tempId}${extension}`);
  const modernExt = LEGACY_TO_MODERN[extension] || 'pdf';
  // LibreOffice names the output after the input's base name plus the target extension.
  const outputPath = join(tempDir, `input-${tempId}.${modernExt}`);

  try {
    writeFileSync(inputPath, buffer);

    try {
      // Arguments are passed as an array and no shell is involved, so paths
      // need no quoting and cannot be interpreted as shell syntax.
      execFileSync(
        'libreoffice',
        ['--headless', '--convert-to', modernExt, '--outdir', tempDir, inputPath],
        { timeout: 60000, stdio: 'pipe' }
      );
    } catch (execError) {
      const error = execError as Error & { code?: string };
      // Without a shell in between, a missing binary surfaces as a spawn error
      // with code ENOENT, so there is no need to guess from the error text.
      if (error.code === 'ENOENT') {
        throw new Error(
          `Legacy ${extension} files require LibreOffice for text extraction. ` +
          `Please convert to .${modernExt} format or ensure LibreOffice is installed.`
        );
      }
      throw new Error(`LibreOffice conversion failed: ${error.message}`);
    }

    // LibreOffice can exit successfully without writing anything.
    if (!existsSync(outputPath)) {
      throw new Error('LibreOffice conversion produced no output');
    }

    // Extract text from the converted file with officeparser (with OCR if enabled).
    const convertedBuffer = readFileSync(outputPath);
    const config = useOcr ? {
      extractAttachments: true,
      ocr: true,
      ocrLanguage: 'eng'
    } : {};
    const ast = await officeParser.parseOffice(convertedBuffer, config);
    let text = ast.toText();

    // Include OCR text from attachments if available.
    if (useOcr && ast.attachments) {
      for (const attachment of ast.attachments) {
        if (attachment.ocrText) {
          text += '\n' + attachment.ocrText;
        }
      }
    }
    return text;
  } finally {
    // Best-effort cleanup: either file may not exist if an earlier step failed.
    try { unlinkSync(inputPath); } catch { /* ignore */ }
    try { unlinkSync(outputPath); } catch { /* ignore */ }
  }
}

/**
 * Turns an uploaded file into either pass-through bytes or extracted text.
 *
 * The first matching rule wins:
 * 1. OCR-capable image with `useOcr` set: text extracted via officeparser with OCR.
 * 2. Gemini-native type: the buffer is returned untouched as `'native'`.
 * 3. officeparser type: text extracted with officeparser (OCR optional).
 * 4. Legacy .doc: text extracted with word-extractor.
 * 5. Legacy .xls: sheets rendered as CSV with SheetJS.
 * 6. Legacy .ppt: converted with LibreOffice, then parsed (OCR optional).
 * 7. Plain-text-convertible type: bytes decoded as UTF-8.
 *
 * @param buffer - Raw bytes of the file.
 * @param mimeType - MIME type of the file; must be in `SUPPORTED_TYPES`.
 * @param options - Optional processing switches.
 * @returns The processed document; every `'text'` result is labelled `text/plain`.
 * @throws Error if `mimeType` is unsupported or no rule handles it; errors
 *   raised by the individual extractors propagate unchanged.
 */
export async function processDocument(
  buffer: Buffer,
  mimeType: string,
  options: ProcessOptions = {}
): Promise<ProcessedDocument> {
  if (!isSupportedType(mimeType)) {
    throw new Error(`Unsupported file type: ${mimeType}`);
  }

  // Every extraction path produces the same result shape.
  const asPlainText = (content: string): ProcessedDocument => ({
    type: 'text',
    content,
    mimeType: 'text/plain'
  });

  // Must be checked before the native pass-through, which also lists these images.
  // NOTE(review): this hands raw image bytes to the office-document parser —
  // confirm officeparser accepts standalone images.
  if (isOcrCapable(mimeType) && options.useOcr) {
    return asPlainText(await extractWithOfficeParser(buffer, true));
  }

  // Gemini-native types (including images without OCR) are passed through.
  if (isGeminiNative(mimeType)) {
    return { type: 'native', content: buffer, mimeType };
  }

  // Office documents; with OCR, text in embedded images is included as well.
  if (needsOfficeParser(mimeType)) {
    return asPlainText(await extractWithOfficeParser(buffer, options.useOcr));
  }

  // Legacy .doc files (Word 97-2003)
  if (needsWordExtractor(mimeType)) {
    return asPlainText(await extractWithWordExtractor(buffer));
  }

  // Legacy .xls files (Excel 97-2003)
  if (needsSheetJS(mimeType)) {
    return asPlainText(extractWithSheetJS(buffer));
  }

  // Legacy .ppt files (PowerPoint 97-2003) go through a LibreOffice conversion.
  if (needsSheetJSPPT(mimeType)) {
    return asPlainText(await extractWithLibreOffice(buffer, '.ppt', options.useOcr));
  }

  // Remaining text-based formats only need decoding.
  if (needsPlainTextConversion(mimeType)) {
    return asPlainText(buffer.toString('utf-8'));
  }

  throw new Error(`No extraction handler for: ${mimeType}`);
}