diff --git a/server/package-lock.json b/server/package-lock.json index dd222da..972c16b 100644 --- a/server/package-lock.json +++ b/server/package-lock.json @@ -18,8 +18,11 @@ "jwks-rsa": "^3.1.0", "multer": "^2.0.2", "officeparser": "^6.0.4", + "ppt": "^0.0.2", "stripe": "^20.2.0", - "uuid": "^11.0.5" + "uuid": "^11.0.5", + "word-extractor": "^1.0.4", + "xlsx": "^0.18.5" }, "devDependencies": { "@types/better-sqlite3": "^7.6.12", @@ -31,6 +34,7 @@ "@types/multer": "^2.0.0", "@types/node": "^22.10.7", "@types/uuid": "^10.0.0", + "@types/word-extractor": "^1.0.6", "tsx": "^4.19.2", "typescript": "^5.7.3" } @@ -919,6 +923,16 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/word-extractor": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/@types/word-extractor/-/word-extractor-1.0.6.tgz", + "integrity": "sha512-NDrvZXGJi7cTKXGr8GTP08HiqiueggR1wfHZvBj1sfL8e52qecBSlvl1rBWrvOY0LLkk1DISkKVlFqMTfipLbQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@xmldom/xmldom": { "version": "0.8.11", "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.11.tgz", @@ -953,6 +967,15 @@ "node": ">= 0.6" } }, + "node_modules/adler-32": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/adler-32/-/adler-32-1.3.1.tgz", + "integrity": "sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, "node_modules/agent-base": { "version": "7.1.4", "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", @@ -1158,12 +1181,43 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/cfb": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/cfb/-/cfb-1.2.2.tgz", + "integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==", + "license": "Apache-2.0", + "dependencies": { + "adler-32": "~1.3.0", + "crc-32": "~1.2.0" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/chownr": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==", "license": "ISC" }, + "node_modules/codepage": { + "version": "1.15.0", + "resolved": "https://registry.npmjs.org/codepage/-/codepage-1.15.0.tgz", + "integrity": "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/commander": { + "version": "14.0.2", + "resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz", + "integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==", + "license": "MIT", + "engines": { + "node": ">=20" + } + }, "node_modules/concat-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz", @@ -1228,6 +1282,18 @@ "node": ">= 0.10" } }, + "node_modules/crc-32": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/crc-32/-/crc-32-1.2.2.tgz", + "integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==", + "license": "Apache-2.0", + "bin": { + "crc32": "bin/crc32.njs" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/debug": { "version": "2.6.9", "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", @@ -1455,6 +1521,7 @@ "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz", "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==", "license": "MIT", + "peer": true, "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", @@ -1520,6 +1587,15 @@ "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", "license": "MIT" }, + "node_modules/fd-slicer": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", + "integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==", + "license": "MIT", + "dependencies": { + "pend": "~1.2.0" + } + }, "node_modules/file-type": { "version": "16.5.4", "resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz", @@ -1570,6 +1646,15 @@ "node": ">= 0.6" } }, + "node_modules/frac": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/frac/-/frac-1.1.2.tgz", + "integrity": "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, "node_modules/fresh": { "version": "0.5.2", "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", @@ -2457,6 +2542,40 @@ "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==", "license": "MIT" }, + "node_modules/ppt": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/ppt/-/ppt-0.0.2.tgz", + "integrity": "sha512-qhII2wLFquh/rSJ4UBfywFHKmKy3PoZb7GvWhRAaHHKoVP7rfNZQldAHIgc5gD4s1cSBSBfedZYEY0m+vYYI5A==", + "license": "Apache-2.0", + "dependencies": { + "cfb": ">=0.10.0", + "codepage": "~1.3.4", + "commander": "" + }, + "bin": { + "ppt": "bin/ppt.njs" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/ppt/node_modules/codepage": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/codepage/-/codepage-1.3.8.tgz", + "integrity": "sha512-cjAoQW5L/TCKWRbzt/xGBvhwJKQFhcIVO0jWQtpKQx4gr9qvXNkpRfq6gSmjjA8dB2Is/DPOb7gNwqQXP7UgTQ==", + "license": "Apache-2.0", + "dependencies": { + "commander": "", + "concat-stream": "", + "voc": "" + }, + "bin": { + "codepage": "bin/codepage.njs" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/prebuild-install": { "version": "7.1.3", "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", @@ -2681,6 +2800,18 @@ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", "license": "MIT" }, + "node_modules/saxes": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz", + "integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==", + "license": "ISC", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/semver": { "version": "7.7.3", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", @@ -2861,6 +2992,18 @@ "simple-concat": "^1.0.0" } }, + "node_modules/ssf": { + "version": "0.11.2", + "resolved": "https://registry.npmjs.org/ssf/-/ssf-0.11.2.tgz", + "integrity": "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==", + "license": "Apache-2.0", + "dependencies": { + "frac": "~1.1.2" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/statuses": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", @@ -3134,6 +3277,18 @@ "node": ">= 0.8" } }, + "node_modules/voc": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/voc/-/voc-1.2.0.tgz", + "integrity": "sha512-BOuDjFFYvJdZO6e/N65AlaDItXo2TgyLjeyRYcqgAPkXpp5yTJcvkL2n+syO1r9Qc5g96tfBD2tuiMhYDmaGcA==", + "license": "Apache-2.0", + "bin": { + "voc": "voc.njs" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/wasm-feature-detect": { "version": "1.8.0", "resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz", @@ -3156,6 +3311,44 @@ "webidl-conversions": "^3.0.0" } }, + "node_modules/wmf": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wmf/-/wmf-1.0.2.tgz", + "integrity": "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/word": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/word/-/word-0.3.0.tgz", + "integrity": "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==", + "license": "Apache-2.0", + "engines": { + "node": ">=0.8" + } + }, + "node_modules/word-extractor": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/word-extractor/-/word-extractor-1.0.4.tgz", + "integrity": "sha512-PyAGZQ2gjnVA5kcZAOAxoYciCMaAvu0dbVlw/zxHphhy+3be8cDeYKHJPO8iedIM3Sx0arA/ugKTJyXhZNgo6g==", + "license": "MIT", + "dependencies": { + "saxes": "^5.0.1", + "yauzl": "^2.10.0" + } + }, + "node_modules/word-extractor/node_modules/yauzl": { + "version": "2.10.0", + "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz", + "integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==", + "license": "MIT", + "dependencies": { + "buffer-crc32": "~0.2.3", + "fd-slicer": "~1.1.0" + } + }, "node_modules/wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", @@ -3183,6 +3376,33 @@ } } }, + "node_modules/xlsx": { + "version": "0.18.5", + "resolved": "https://registry.npmjs.org/xlsx/-/xlsx-0.18.5.tgz", + "integrity": "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==", + "license": "Apache-2.0", + "dependencies": { + "adler-32": "~1.3.0", + "cfb": "~1.2.1", + "codepage": "~1.15.0", + "crc-32": "~1.2.1", + "ssf": "~0.11.2", + "wmf": "~1.0.1", + "word": "~0.3.0" + }, + "bin": { + "xlsx": "bin/xlsx.njs" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==", + "license": "MIT" + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", @@ -3225,6 +3445,7 @@ "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", "license": "MIT", + "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } diff --git a/server/package.json b/server/package.json index 297aec2..a8a5cd2 100644 --- a/server/package.json +++ b/server/package.json @@ -21,8 +21,11 @@ "jwks-rsa": "^3.1.0", "multer": "^2.0.2", "officeparser": "^6.0.4", + "ppt": "^0.0.2", "stripe": "^20.2.0", - "uuid": "^11.0.5" + "uuid": "^11.0.5", + "word-extractor": "^1.0.4", + "xlsx": "^0.18.5" }, "devDependencies": { "@types/better-sqlite3": "^7.6.12", @@ -34,6 +37,7 @@ "@types/multer": "^2.0.0", "@types/node": "^22.10.7", "@types/uuid": "^10.0.0", + "@types/word-extractor": "^1.0.6", "tsx": "^4.19.2", "typescript": "^5.7.3" } diff --git a/server/src/services/documentParser.ts b/server/src/services/documentParser.ts index 3a45908..ba54ceb 100644 --- a/server/src/services/documentParser.ts +++ b/server/src/services/documentParser.ts @@ -1,4 +1,8 @@ import officeParser from 'officeparser'; +import WordExtractor from 'word-extractor'; +import * as XLSX from 'xlsx'; +import * as PPT from 'ppt'; +import * as CFB from 'cfb'; export const GEMINI_NATIVE_TYPES = [ 'application/pdf', @@ -30,20 +34,22 @@ export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [ 'application/x-yaml', ]; -// MIME types that officeparser can extract text from +// MIME types that officeparser can extract text from (modern Office formats) export const OFFICEPARSER_TYPES = [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx - 'application/msword', // .doc - 'application/vnd.ms-powerpoint', // .ppt - 'application/vnd.ms-excel', // .xls 'application/vnd.oasis.opendocument.text', // .odt 'application/vnd.oasis.opendocument.presentation', // .odp 'application/vnd.oasis.opendocument.spreadsheet', // .ods 'application/rtf', // .rtf ]; +// Legacy Office formats requiring specialized parsers +export const LEGACY_DOC_TYPE = 'application/msword'; // .doc (word-extractor) +export const LEGACY_XLS_TYPE = 'application/vnd.ms-excel'; // .xls (xlsx/SheetJS) +export const LEGACY_PPT_TYPE = 'application/vnd.ms-powerpoint'; // .ppt (ppt/SheetJS) + // Image types that can use OCR for text extraction export const OCR_CAPABLE_TYPES = [ 'image/jpeg', @@ -52,7 +58,14 @@ export const OCR_CAPABLE_TYPES = [ 'image/webp' ]; -export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES, ...TYPES_TO_CONVERT_TO_PLAIN_TEXT]; +export const SUPPORTED_TYPES = [ + ...GEMINI_NATIVE_TYPES, + ...OFFICEPARSER_TYPES, + ...TYPES_TO_CONVERT_TO_PLAIN_TEXT, + LEGACY_DOC_TYPE, + LEGACY_XLS_TYPE, + LEGACY_PPT_TYPE, +]; const EXTENSION_TO_MIME: Record = { '.txt': 'text/plain', @@ -129,6 +142,18 @@ export function needsPlainTextConversion(mimeType: string): boolean { return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType); } +export function needsWordExtractor(mimeType: string): boolean { + return mimeType === LEGACY_DOC_TYPE; +} + +export function needsSheetJS(mimeType: string): boolean { + return mimeType === LEGACY_XLS_TYPE; +} + +export function needsSheetJSPPT(mimeType: string): boolean { + return mimeType === LEGACY_PPT_TYPE; +} + async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise { const config = useOcr ? { extractAttachments: true, @@ -150,6 +175,34 @@ async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): return text; } +async function extractWithWordExtractor(buffer: Buffer): Promise { + const extractor = new WordExtractor(); + const doc = await extractor.extract(buffer); + return doc.getBody(); +} + +function extractWithSheetJS(buffer: Buffer): string { + const workbook = XLSX.read(buffer, { type: 'buffer' }); + const textParts: string[] = []; + + for (const sheetName of workbook.SheetNames) { + const sheet = workbook.Sheets[sheetName]; + const csv = XLSX.utils.sheet_to_csv(sheet); + if (csv.trim()) { + textParts.push(`--- ${sheetName} ---\n${csv}`); + } + } + + return textParts.join('\n\n'); +} + +function extractWithSheetJSPPT(buffer: Buffer): string { + const cfb = CFB.read(buffer, { type: 'buffer' }); + const pres = PPT.parse_pptcfb(cfb); + const textArray = PPT.utils.to_text(pres); + return textArray.join('\n\n'); +} + export async function processDocument( buffer: Buffer, mimeType: string, @@ -188,6 +241,36 @@ export async function processDocument( }; } + // Legacy .doc files (Word 97-2003) + if (needsWordExtractor(mimeType)) { + const text = await extractWithWordExtractor(buffer); + return { + type: 'text', + content: text, + mimeType: 'text/plain' + }; + } + + // Legacy .xls files (Excel 97-2003) + if (needsSheetJS(mimeType)) { + const text = extractWithSheetJS(buffer); + return { + type: 'text', + content: text, + mimeType: 'text/plain' + }; + } + + // Legacy .ppt files (PowerPoint 97-2003) + if (needsSheetJSPPT(mimeType)) { + const text = extractWithSheetJSPPT(buffer); + return { + type: 'text', + content: text, + mimeType: 'text/plain' + }; + } + if (needsPlainTextConversion(mimeType)) { return { type: 'text', diff --git a/server/src/types/legacy-office.d.ts b/server/src/types/legacy-office.d.ts new file mode 100644 index 0000000..3f91c0f --- /dev/null +++ b/server/src/types/legacy-office.d.ts @@ -0,0 +1,46 @@ +// Type declarations for legacy Office format parsers + +declare module 'ppt' { + interface PPTPresentation { + docs: Array<{ + slideList: string[]; + }>; + slides: Array<{ + drawing: { + groupShape: Array<{ + clientTextbox?: { + t: string; + }; + }>; + }; + }>; + } + + interface PPTUtils { + to_text(pres: PPTPresentation): string[]; + } + + export function readFile(filename: string, opts?: { WTF?: number; dump?: number }): PPTPresentation; + export function parse_pptcfb(cfb: CFB.Container, opts?: object): PPTPresentation; + export const utils: PPTUtils; +} + +declare module 'cfb' { + interface CFBEntry { + name: string; + type: number; + content?: Buffer; + } + + interface Container { + FileIndex: CFBEntry[]; + FullPaths: string[]; + } + + interface ReadOptions { + type?: 'file' | 'buffer' | 'base64' | 'binary' | 'array'; + } + + export function read(data: Buffer | string | ArrayBuffer, opts?: ReadOptions): Container; + export function parse(data: Buffer | number[]): Container; +}