diff --git a/server/Dockerfile b/server/Dockerfile index 26de127..1b1dfc1 100644 --- a/server/Dockerfile +++ b/server/Dockerfile @@ -2,7 +2,8 @@ FROM node:22-alpine WORKDIR /app -RUN apk add --no-cache python3 make g++ +# Build dependencies + LibreOffice for legacy Office format conversion +RUN apk add --no-cache python3 make g++ libreoffice curl COPY package*.json ./ RUN npm install diff --git a/server/package-lock.json b/server/package-lock.json index 972c16b..0304960 100644 --- a/server/package-lock.json +++ b/server/package-lock.json @@ -18,7 +18,6 @@ "jwks-rsa": "^3.1.0", "multer": "^2.0.2", "officeparser": "^6.0.4", - "ppt": "^0.0.2", "stripe": "^20.2.0", "uuid": "^11.0.5", "word-extractor": "^1.0.4", @@ -1209,15 +1208,6 @@ "node": ">=0.8" } }, - "node_modules/commander": { - "version": "14.0.2", - "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz", - "integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==", - "license": "MIT", - "engines": { - "node": ">=20" - } - }, "node_modules/concat-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz", @@ -2542,40 +2532,6 @@ "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==", "license": "MIT" }, - "node_modules/ppt": { - "version": "0.0.2", - "resolved": "https://registry.npmjs.org/ppt/-/ppt-0.0.2.tgz", - "integrity": "sha512-qhII2wLFquh/rSJ4UBfywFHKmKy3PoZb7GvWhRAaHHKoVP7rfNZQldAHIgc5gD4s1cSBSBfedZYEY0m+vYYI5A==", - "license": "Apache-2.0", - "dependencies": { - "cfb": ">=0.10.0", - "codepage": "~1.3.4", - "commander": "" - }, - "bin": { - "ppt": "bin/ppt.njs" - }, - "engines": { - "node": ">=0.8" - } - }, - "node_modules/ppt/node_modules/codepage": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/codepage/-/codepage-1.3.8.tgz", - "integrity": "sha512-cjAoQW5L/TCKWRbzt/xGBvhwJKQFhcIVO0jWQtpKQx4gr9qvXNkpRfq6gSmjjA8dB2Is/DPOb7gNwqQXP7UgTQ==", - "license": "Apache-2.0", - "dependencies": { - "commander": "", - "concat-stream": "", - "voc": "" - }, - "bin": { - "codepage": "bin/codepage.njs" - }, - "engines": { - "node": ">=0.8" - } - }, "node_modules/prebuild-install": { "version": "7.1.3", "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", @@ -3277,18 +3233,6 @@ "node": ">= 0.8" } }, - "node_modules/voc": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/voc/-/voc-1.2.0.tgz", - "integrity": "sha512-BOuDjFFYvJdZO6e/N65AlaDItXo2TgyLjeyRYcqgAPkXpp5yTJcvkL2n+syO1r9Qc5g96tfBD2tuiMhYDmaGcA==", - "license": "Apache-2.0", - "bin": { - "voc": "voc.njs" - }, - "engines": { - "node": ">=0.8" - } - }, "node_modules/wasm-feature-detect": { "version": "1.8.0", "resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz", diff --git a/server/package.json b/server/package.json index a8a5cd2..70c53ec 100644 --- a/server/package.json +++ b/server/package.json @@ -21,7 +21,6 @@ "jwks-rsa": "^3.1.0", "multer": "^2.0.2", "officeparser": "^6.0.4", - "ppt": "^0.0.2", "stripe": "^20.2.0", "uuid": "^11.0.5", "word-extractor": "^1.0.4", diff --git a/server/src/services/documentParser.ts b/server/src/services/documentParser.ts index 5cfc628..663cb17 100644 --- a/server/src/services/documentParser.ts +++ b/server/src/services/documentParser.ts @@ -1,8 +1,8 @@ import officeParser from 'officeparser'; import WordExtractor from 'word-extractor'; import * as XLSX from 'xlsx'; -import PPT from 'ppt'; -import { writeFileSync, unlinkSync } from 'fs'; +import { execSync } from 'child_process'; +import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs'; import { tmpdir } from 'os'; import { join } from 'path'; import { randomUUID } from 'crypto'; @@ -199,20 +199,70 @@ function extractWithSheetJS(buffer: Buffer): string { return textParts.join('\n\n'); } -function extractWithSheetJSPPT(buffer: Buffer): string { - // PPT library requires file path due to CFB API compatibility issues - const tempPath = join(tmpdir(), `ppt-${randomUUID()}.ppt`); +// Map legacy extensions to their modern equivalents for LibreOffice conversion +const LEGACY_TO_MODERN: Record = { + '.ppt': 'pptx', + '.doc': 'docx', + '.xls': 'xlsx', +}; + +async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise { + // Use LibreOffice to convert legacy Office files to modern format, then parse + const tempId = randomUUID(); + const tempDir = tmpdir(); + const inputPath = join(tempDir, `input-${tempId}${extension}`); + const modernExt = LEGACY_TO_MODERN[extension] || 'pdf'; + const outputPath = join(tempDir, `input-${tempId}.${modernExt}`); + try { - writeFileSync(tempPath, buffer); - const pres = PPT.readFile(tempPath); - const textArray = PPT.utils.to_text(pres); - return textArray.join('\n\n'); - } finally { + // Write input file + writeFileSync(inputPath, buffer); + + // Convert to modern format using LibreOffice try { - unlinkSync(tempPath); - } catch { - // Ignore cleanup errors + execSync( + `libreoffice --headless --convert-to ${modernExt} --outdir "${tempDir}" "${inputPath}"`, + { timeout: 60000, stdio: 'pipe' } + ); + } catch (execError) { + // LibreOffice not available or conversion failed + const error = execError as Error & { code?: string }; + if (error.code === 'ENOENT' || error.message?.includes('not found')) { + throw new Error( + `Legacy ${extension} files require LibreOffice for text extraction. ` + + `Please convert to .${modernExt} format or ensure LibreOffice is installed.` + ); + } + throw new Error(`LibreOffice conversion failed: ${error.message}`); } + + // Read the converted file and extract text using officeparser (with OCR if enabled) + if (existsSync(outputPath)) { + const convertedBuffer = readFileSync(outputPath); + const config = useOcr ? { + extractAttachments: true, + ocr: true, + ocrLanguage: 'eng' + } : {}; + const ast = await officeParser.parseOffice(convertedBuffer, config); + let text = ast.toText(); + + // Include OCR text from attachments if available + if (useOcr && ast.attachments) { + for (const attachment of ast.attachments) { + if (attachment.ocrText) { + text += '\n' + attachment.ocrText; + } + } + } + return text; + } + + throw new Error('LibreOffice conversion produced no output'); + } finally { + // Cleanup temp files + try { unlinkSync(inputPath); } catch { /* ignore */ } + try { unlinkSync(outputPath); } catch { /* ignore */ } } } @@ -274,9 +324,9 @@ export async function processDocument( }; } - // Legacy .ppt files (PowerPoint 97-2003) + // Legacy .ppt files (PowerPoint 97-2003) - use LibreOffice for conversion if (needsSheetJSPPT(mimeType)) { - const text = extractWithSheetJSPPT(buffer); + const text = await extractWithLibreOffice(buffer, '.ppt', options.useOcr); return { type: 'text', content: text, diff --git a/server/src/types/legacy-office.d.ts b/server/src/types/legacy-office.d.ts deleted file mode 100644 index e00074f..0000000 --- a/server/src/types/legacy-office.d.ts +++ /dev/null @@ -1,31 +0,0 @@ -// Type declarations for legacy Office format parsers - -declare module 'ppt' { - interface PPTPresentation { - docs: Array<{ - slideList: string[]; - }>; - slides: Array<{ - drawing: { - groupShape: Array<{ - clientTextbox?: { - t: string; - }; - }>; - }; - }>; - } - - interface PPTUtils { - to_text(pres: PPTPresentation): string[]; - } - - interface PPTModule { - version: string; - readFile(filename: string, opts?: { WTF?: number; dump?: number }): PPTPresentation; - utils: PPTUtils; - } - - const PPT: PPTModule; - export default PPT; -}