Security audit #1

This commit is contained in:
Joey Yakimowich-Payne 2026-02-03 06:59:07 -07:00
commit cd04d34b23
8 changed files with 131 additions and 56 deletions

View file

@@ -2,11 +2,13 @@ import officeParser from 'officeparser';
import WordExtractor from 'word-extractor';
import * as XLSX from 'xlsx';
import { execSync } from 'child_process';
import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
import { writeFileSync, readFileSync, unlinkSync, existsSync, mkdirSync, rmdirSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';
import { randomUUID } from 'crypto';
// Timeout, in milliseconds, applied to the external LibreOffice conversion subprocess.
const PROCESSING_TIMEOUT_MS = 30000;
export const GEMINI_NATIVE_TYPES = [
'application/pdf',
'text/plain',
@@ -207,62 +209,60 @@ const LEGACY_TO_MODERN: Record<string, string> = {
};
/**
 * Converts a legacy Office document to a modern format with LibreOffice and
 * extracts the text of the converted file with officeparser.
 *
 * The input is staged inside a per-call directory created with mode 0o700
 * under the OS temp dir; the staged input, the converted output and the
 * directory itself are removed (best effort) before the function returns.
 *
 * @param buffer - Raw bytes of the legacy document.
 * @param extension - Extension of the legacy file. It is used to name the
 *   staged input file and to look up the target format in `LEGACY_TO_MODERN`
 *   (falling back to `pdf` when there is no entry).
 * @param useOcr - When true, officeparser is asked to extract attachments and
 *   OCR them; any OCR text found is appended to the result.
 * @returns The extracted text.
 * @throws Error when the extension contains unexpected characters, when the
 *   conversion times out, when LibreOffice is unavailable, when the conversion
 *   fails, or when it produces no output file.
 */
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
  // The extension ends up inside a shell command line (via inputPath), so
  // refuse anything that is not a plain alphanumeric extension.
  if (!/^\.?[A-Za-z0-9]+$/.test(extension)) {
    throw new Error('Unsupported file extension.');
  }

  const tempId = randomUUID();
  const privateTempDir = join(tmpdir(), `kaboot-${tempId}`);
  const inputPath = join(privateTempDir, `input${extension}`);
  const modernExt = LEGACY_TO_MODERN[extension] || 'pdf';
  const outputPath = join(privateTempDir, `input.${modernExt}`);

  // Best-effort removal of everything this call created. Each step is
  // attempted independently so one failure does not block the others.
  // NOTE: rmdirSync only removes an empty directory; if the converter leaves
  // any additional files behind, the directory will remain.
  const cleanup = () => {
    try { unlinkSync(inputPath); } catch { /* ignore */ }
    try { unlinkSync(outputPath); } catch { /* ignore */ }
    try { rmdirSync(privateTempDir); } catch { /* ignore */ }
  };

  try {
    // Owner-only directory and file so other local users cannot read the upload.
    mkdirSync(privateTempDir, { mode: 0o700 });
    writeFileSync(inputPath, buffer, { mode: 0o600 });

    // Convert to the modern format using LibreOffice.
    try {
      execSync(
        `libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
        { timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
      );
    } catch (execError) {
      const error = execError as Error & { code?: string; killed?: boolean };
      // execSync reports a timeout through code 'ETIMEDOUT'; `killed` is kept
      // as a fallback for error shapes that set it instead.
      if (error.code === 'ETIMEDOUT' || error.killed) {
        throw new Error('Document conversion timed out. Try a smaller file.');
      }
      // LibreOffice binary missing (spawn failure or the shell's "not found").
      if (error.code === 'ENOENT' || error.message?.includes('not found')) {
        throw new Error(
          `Legacy ${extension} files require LibreOffice for text extraction. ` +
          `Please convert to .${modernExt} format or ensure LibreOffice is installed.`
        );
      }
      // Deliberately generic: do not leak the command line or temp paths.
      throw new Error('Document conversion failed. The file may be corrupted.');
    }

    if (!existsSync(outputPath)) {
      throw new Error('Document conversion produced no output.');
    }

    // Read the converted file and extract text (with OCR if enabled).
    const convertedBuffer = readFileSync(outputPath);
    const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
    const ast = await officeParser.parseOffice(convertedBuffer, config);
    let text = ast.toText();

    // Append OCR text from attachments when available.
    if (useOcr && ast.attachments) {
      for (const attachment of ast.attachments) {
        if (attachment.ocrText) {
          text += '\n' + attachment.ocrText;
        }
      }
    }

    return text;
  } finally {
    cleanup();
  }
}