Security audit #1

2026-02-03 06:59:07 -07:00 · 2026-02-03 06:59:07 -07:00 · cd04d34b23
commit cd04d34b23
parent 68bc591150
8 changed files with 131 additions and 56 deletions
--- a/server/src/services/documentParser.ts
+++ b/server/src/services/documentParser.ts
@@ -2,11 +2,13 @@ import officeParser from 'officeparser';
 import WordExtractor from 'word-extractor';
 import * as XLSX from 'xlsx';
 import { execSync } from 'child_process';
-import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
+import { writeFileSync, readFileSync, unlinkSync, existsSync, mkdirSync, rmdirSync } from 'fs';
 import { tmpdir } from 'os';
 import { join } from 'path';
 import { randomUUID } from 'crypto';

+const PROCESSING_TIMEOUT_MS = 30000;
+
 export const GEMINI_NATIVE_TYPES = [
  'application/pdf',
  'text/plain',
@@ -207,62 +209,60 @@ const LEGACY_TO_MODERN: Record<string, string> = {
 };

 async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
-  // Use LibreOffice to convert legacy Office files to modern format, then parse
  const tempId = randomUUID();
-  const tempDir = tmpdir();
-  const inputPath = join(tempDir, `input-${tempId}${extension}`);
+  const privateTempDir = join(tmpdir(), `kaboot-${tempId}`);
+  const inputPath = join(privateTempDir, `input${extension}`);
  const modernExt = LEGACY_TO_MODERN[extension] || 'pdf';
-  const outputPath = join(tempDir, `input-${tempId}.${modernExt}`);
+  const outputPath = join(privateTempDir, `input.${modernExt}`);
  
+  const cleanup = () => {
+    try { unlinkSync(inputPath); } catch { /* ignore */ }
+    try { unlinkSync(outputPath); } catch { /* ignore */ }
+    try { rmdirSync(privateTempDir); } catch { /* ignore */ }
+  };
+
  try {
-    // Write input file
-    writeFileSync(inputPath, buffer);
+    mkdirSync(privateTempDir, { mode: 0o700 });
+    writeFileSync(inputPath, buffer, { mode: 0o600 });
    
-    // Convert to modern format using LibreOffice
    try {
      execSync(
-        `libreoffice --headless --convert-to ${modernExt} --outdir "${tempDir}" "${inputPath}"`,
-        { timeout: 60000, stdio: 'pipe' }
+        `libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
+        { timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
      );
    } catch (execError) {
-      // LibreOffice not available or conversion failed
-      const error = execError as Error & { code?: string };
+      const error = execError as Error & { code?: string; killed?: boolean };
+      if (error.killed) {
+        throw new Error('Document conversion timed out. Try a smaller file.');
+      }
      if (error.code === 'ENOENT' || error.message?.includes('not found')) {
        throw new Error(
          `Legacy ${extension} files require LibreOffice for text extraction. ` +
          `Please convert to .${modernExt} format or ensure LibreOffice is installed.`
        );
      }
-      throw new Error(`LibreOffice conversion failed: ${error.message}`);
+      throw new Error('Document conversion failed. The file may be corrupted.');
    }
    
-    // Read the converted file and extract text using officeparser (with OCR if enabled)
-    if (existsSync(outputPath)) {
-      const convertedBuffer = readFileSync(outputPath);
-      const config = useOcr ? {
-        extractAttachments: true,
-        ocr: true,
-        ocrLanguage: 'eng'
-      } : {};
-      const ast = await officeParser.parseOffice(convertedBuffer, config);
-      let text = ast.toText();
-      
-      // Include OCR text from attachments if available
-      if (useOcr && ast.attachments) {
-        for (const attachment of ast.attachments) {
-          if (attachment.ocrText) {
-            text += '\n' + attachment.ocrText;
-          }
+    if (!existsSync(outputPath)) {
+      throw new Error('Document conversion produced no output.');
+    }
+
+    const convertedBuffer = readFileSync(outputPath);
+    const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
+    const ast = await officeParser.parseOffice(convertedBuffer, config);
+    let text = ast.toText();
+    
+    if (useOcr && ast.attachments) {
+      for (const attachment of ast.attachments) {
+        if (attachment.ocrText) {
+          text += '\n' + attachment.ocrText;
        }
      }
-      return text;
    }
-    
-    throw new Error('LibreOffice conversion produced no output');
+    return text;
  } finally {
-    // Cleanup temp files
-    try { unlinkSync(inputPath); } catch { /* ignore */ }
-    try { unlinkSync(outputPath); } catch { /* ignore */ }
+    cleanup();
  }
 }