Add ppt deps

This commit is contained in:
Joey Yakimowich-Payne 2026-01-23 14:54:44 -07:00
commit fa2ac3bdfd
No known key found for this signature in database
GPG key ID: 6BFE655FA5ABD1E1
4 changed files with 361 additions and 7 deletions

223
server/package-lock.json generated
View file

@ -18,8 +18,11 @@
"jwks-rsa": "^3.1.0", "jwks-rsa": "^3.1.0",
"multer": "^2.0.2", "multer": "^2.0.2",
"officeparser": "^6.0.4", "officeparser": "^6.0.4",
"ppt": "^0.0.2",
"stripe": "^20.2.0", "stripe": "^20.2.0",
"uuid": "^11.0.5" "uuid": "^11.0.5",
"word-extractor": "^1.0.4",
"xlsx": "^0.18.5"
}, },
"devDependencies": { "devDependencies": {
"@types/better-sqlite3": "^7.6.12", "@types/better-sqlite3": "^7.6.12",
@ -31,6 +34,7 @@
"@types/multer": "^2.0.0", "@types/multer": "^2.0.0",
"@types/node": "^22.10.7", "@types/node": "^22.10.7",
"@types/uuid": "^10.0.0", "@types/uuid": "^10.0.0",
"@types/word-extractor": "^1.0.6",
"tsx": "^4.19.2", "tsx": "^4.19.2",
"typescript": "^5.7.3" "typescript": "^5.7.3"
} }
@ -919,6 +923,16 @@
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },
"node_modules/@types/word-extractor": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/@types/word-extractor/-/word-extractor-1.0.6.tgz",
"integrity": "sha512-NDrvZXGJi7cTKXGr8GTP08HiqiueggR1wfHZvBj1sfL8e52qecBSlvl1rBWrvOY0LLkk1DISkKVlFqMTfipLbQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@xmldom/xmldom": { "node_modules/@xmldom/xmldom": {
"version": "0.8.11", "version": "0.8.11",
"resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.11.tgz", "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.11.tgz",
@ -953,6 +967,15 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/adler-32": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/adler-32/-/adler-32-1.3.1.tgz",
"integrity": "sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==",
"license": "Apache-2.0",
"engines": {
"node": ">=0.8"
}
},
"node_modules/agent-base": { "node_modules/agent-base": {
"version": "7.1.4", "version": "7.1.4",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
@ -1158,12 +1181,43 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/cfb": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/cfb/-/cfb-1.2.2.tgz",
"integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==",
"license": "Apache-2.0",
"dependencies": {
"adler-32": "~1.3.0",
"crc-32": "~1.2.0"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/chownr": { "node_modules/chownr": {
"version": "1.1.4", "version": "1.1.4",
"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==", "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==",
"license": "ISC" "license": "ISC"
}, },
"node_modules/codepage": {
"version": "1.15.0",
"resolved": "https://registry.npmjs.org/codepage/-/codepage-1.15.0.tgz",
"integrity": "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==",
"license": "Apache-2.0",
"engines": {
"node": ">=0.8"
}
},
"node_modules/commander": {
"version": "14.0.2",
"resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz",
"integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==",
"license": "MIT",
"engines": {
"node": ">=20"
}
},
"node_modules/concat-stream": { "node_modules/concat-stream": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz", "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz",
@ -1228,6 +1282,18 @@
"node": ">= 0.10" "node": ">= 0.10"
} }
}, },
"node_modules/crc-32": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/crc-32/-/crc-32-1.2.2.tgz",
"integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==",
"license": "Apache-2.0",
"bin": {
"crc32": "bin/crc32.njs"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/debug": { "node_modules/debug": {
"version": "2.6.9", "version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
@ -1455,6 +1521,7 @@
"resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz", "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz",
"integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==", "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==",
"license": "MIT", "license": "MIT",
"peer": true,
"dependencies": { "dependencies": {
"accepts": "~1.3.8", "accepts": "~1.3.8",
"array-flatten": "1.1.1", "array-flatten": "1.1.1",
@ -1520,6 +1587,15 @@
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/fd-slicer": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz",
"integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==",
"license": "MIT",
"dependencies": {
"pend": "~1.2.0"
}
},
"node_modules/file-type": { "node_modules/file-type": {
"version": "16.5.4", "version": "16.5.4",
"resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz", "resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz",
@ -1570,6 +1646,15 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/frac": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/frac/-/frac-1.1.2.tgz",
"integrity": "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==",
"license": "Apache-2.0",
"engines": {
"node": ">=0.8"
}
},
"node_modules/fresh": { "node_modules/fresh": {
"version": "0.5.2", "version": "0.5.2",
"resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
@ -2457,6 +2542,40 @@
"integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==", "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/ppt": {
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/ppt/-/ppt-0.0.2.tgz",
"integrity": "sha512-qhII2wLFquh/rSJ4UBfywFHKmKy3PoZb7GvWhRAaHHKoVP7rfNZQldAHIgc5gD4s1cSBSBfedZYEY0m+vYYI5A==",
"license": "Apache-2.0",
"dependencies": {
"cfb": ">=0.10.0",
"codepage": "~1.3.4",
"commander": ""
},
"bin": {
"ppt": "bin/ppt.njs"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/ppt/node_modules/codepage": {
"version": "1.3.8",
"resolved": "https://registry.npmjs.org/codepage/-/codepage-1.3.8.tgz",
"integrity": "sha512-cjAoQW5L/TCKWRbzt/xGBvhwJKQFhcIVO0jWQtpKQx4gr9qvXNkpRfq6gSmjjA8dB2Is/DPOb7gNwqQXP7UgTQ==",
"license": "Apache-2.0",
"dependencies": {
"commander": "",
"concat-stream": "",
"voc": ""
},
"bin": {
"codepage": "bin/codepage.njs"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/prebuild-install": { "node_modules/prebuild-install": {
"version": "7.1.3", "version": "7.1.3",
"resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz",
@ -2681,6 +2800,18 @@
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/saxes": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/saxes/-/saxes-5.0.1.tgz",
"integrity": "sha512-5LBh1Tls8c9xgGjw3QrMwETmTMVk0oFgvrFSvWx62llR2hcEInrKNZ2GZCCuuy2lvWrdl5jhbpeqc5hRYKFOcw==",
"license": "ISC",
"dependencies": {
"xmlchars": "^2.2.0"
},
"engines": {
"node": ">=10"
}
},
"node_modules/semver": { "node_modules/semver": {
"version": "7.7.3", "version": "7.7.3",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
@ -2861,6 +2992,18 @@
"simple-concat": "^1.0.0" "simple-concat": "^1.0.0"
} }
}, },
"node_modules/ssf": {
"version": "0.11.2",
"resolved": "https://registry.npmjs.org/ssf/-/ssf-0.11.2.tgz",
"integrity": "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==",
"license": "Apache-2.0",
"dependencies": {
"frac": "~1.1.2"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/statuses": { "node_modules/statuses": {
"version": "2.0.2", "version": "2.0.2",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz",
@ -3134,6 +3277,18 @@
"node": ">= 0.8" "node": ">= 0.8"
} }
}, },
"node_modules/voc": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/voc/-/voc-1.2.0.tgz",
"integrity": "sha512-BOuDjFFYvJdZO6e/N65AlaDItXo2TgyLjeyRYcqgAPkXpp5yTJcvkL2n+syO1r9Qc5g96tfBD2tuiMhYDmaGcA==",
"license": "Apache-2.0",
"bin": {
"voc": "voc.njs"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/wasm-feature-detect": { "node_modules/wasm-feature-detect": {
"version": "1.8.0", "version": "1.8.0",
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz", "resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
@ -3156,6 +3311,44 @@
"webidl-conversions": "^3.0.0" "webidl-conversions": "^3.0.0"
} }
}, },
"node_modules/wmf": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wmf/-/wmf-1.0.2.tgz",
"integrity": "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==",
"license": "Apache-2.0",
"engines": {
"node": ">=0.8"
}
},
"node_modules/word": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/word/-/word-0.3.0.tgz",
"integrity": "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==",
"license": "Apache-2.0",
"engines": {
"node": ">=0.8"
}
},
"node_modules/word-extractor": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/word-extractor/-/word-extractor-1.0.4.tgz",
"integrity": "sha512-PyAGZQ2gjnVA5kcZAOAxoYciCMaAvu0dbVlw/zxHphhy+3be8cDeYKHJPO8iedIM3Sx0arA/ugKTJyXhZNgo6g==",
"license": "MIT",
"dependencies": {
"saxes": "^5.0.1",
"yauzl": "^2.10.0"
}
},
"node_modules/word-extractor/node_modules/yauzl": {
"version": "2.10.0",
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",
"integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==",
"license": "MIT",
"dependencies": {
"buffer-crc32": "~0.2.3",
"fd-slicer": "~1.1.0"
}
},
"node_modules/wrappy": { "node_modules/wrappy": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
@ -3183,6 +3376,33 @@
} }
} }
}, },
"node_modules/xlsx": {
"version": "0.18.5",
"resolved": "https://registry.npmjs.org/xlsx/-/xlsx-0.18.5.tgz",
"integrity": "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==",
"license": "Apache-2.0",
"dependencies": {
"adler-32": "~1.3.0",
"cfb": "~1.2.1",
"codepage": "~1.15.0",
"crc-32": "~1.2.1",
"ssf": "~0.11.2",
"wmf": "~1.0.1",
"word": "~0.3.0"
},
"bin": {
"xlsx": "bin/xlsx.njs"
},
"engines": {
"node": ">=0.8"
}
},
"node_modules/xmlchars": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==",
"license": "MIT"
},
"node_modules/xtend": { "node_modules/xtend": {
"version": "4.0.2", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz",
@ -3225,6 +3445,7 @@
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
"license": "MIT", "license": "MIT",
"peer": true,
"funding": { "funding": {
"url": "https://github.com/sponsors/colinhacks" "url": "https://github.com/sponsors/colinhacks"
} }

View file

@ -21,8 +21,11 @@
"jwks-rsa": "^3.1.0", "jwks-rsa": "^3.1.0",
"multer": "^2.0.2", "multer": "^2.0.2",
"officeparser": "^6.0.4", "officeparser": "^6.0.4",
"ppt": "^0.0.2",
"stripe": "^20.2.0", "stripe": "^20.2.0",
"uuid": "^11.0.5" "uuid": "^11.0.5",
"word-extractor": "^1.0.4",
"xlsx": "^0.18.5"
}, },
"devDependencies": { "devDependencies": {
"@types/better-sqlite3": "^7.6.12", "@types/better-sqlite3": "^7.6.12",
@ -34,6 +37,7 @@
"@types/multer": "^2.0.0", "@types/multer": "^2.0.0",
"@types/node": "^22.10.7", "@types/node": "^22.10.7",
"@types/uuid": "^10.0.0", "@types/uuid": "^10.0.0",
"@types/word-extractor": "^1.0.6",
"tsx": "^4.19.2", "tsx": "^4.19.2",
"typescript": "^5.7.3" "typescript": "^5.7.3"
} }

View file

@ -1,4 +1,8 @@
import officeParser from 'officeparser'; import officeParser from 'officeparser';
import WordExtractor from 'word-extractor';
import * as XLSX from 'xlsx';
import * as PPT from 'ppt';
import * as CFB from 'cfb';
export const GEMINI_NATIVE_TYPES = [ export const GEMINI_NATIVE_TYPES = [
'application/pdf', 'application/pdf',
@ -30,20 +34,22 @@ export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [
'application/x-yaml', 'application/x-yaml',
]; ];
// MIME types that officeparser can extract text from // MIME types that officeparser can extract text from (modern Office formats)
export const OFFICEPARSER_TYPES = [ export const OFFICEPARSER_TYPES = [
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx 'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx
'application/msword', // .doc
'application/vnd.ms-powerpoint', // .ppt
'application/vnd.ms-excel', // .xls
'application/vnd.oasis.opendocument.text', // .odt 'application/vnd.oasis.opendocument.text', // .odt
'application/vnd.oasis.opendocument.presentation', // .odp 'application/vnd.oasis.opendocument.presentation', // .odp
'application/vnd.oasis.opendocument.spreadsheet', // .ods 'application/vnd.oasis.opendocument.spreadsheet', // .ods
'application/rtf', // .rtf 'application/rtf', // .rtf
]; ];
// Legacy Office formats requiring specialized parsers.
// Each constant is the exact MIME type string compared by the matching
// needs*() predicate; the trailing comment names the file extension and the
// parser library used for that format.
export const LEGACY_DOC_TYPE = 'application/msword'; // .doc (word-extractor)
export const LEGACY_XLS_TYPE = 'application/vnd.ms-excel'; // .xls (xlsx/SheetJS)
export const LEGACY_PPT_TYPE = 'application/vnd.ms-powerpoint'; // .ppt (ppt/SheetJS)
// Image types that can use OCR for text extraction // Image types that can use OCR for text extraction
export const OCR_CAPABLE_TYPES = [ export const OCR_CAPABLE_TYPES = [
'image/jpeg', 'image/jpeg',
@ -52,7 +58,14 @@ export const OCR_CAPABLE_TYPES = [
'image/webp' 'image/webp'
]; ];
export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES, ...TYPES_TO_CONVERT_TO_PLAIN_TEXT]; export const SUPPORTED_TYPES = [
...GEMINI_NATIVE_TYPES,
...OFFICEPARSER_TYPES,
...TYPES_TO_CONVERT_TO_PLAIN_TEXT,
LEGACY_DOC_TYPE,
LEGACY_XLS_TYPE,
LEGACY_PPT_TYPE,
];
const EXTENSION_TO_MIME: Record<string, string> = { const EXTENSION_TO_MIME: Record<string, string> = {
'.txt': 'text/plain', '.txt': 'text/plain',
@ -129,6 +142,18 @@ export function needsPlainTextConversion(mimeType: string): boolean {
return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType); return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType);
} }
/**
 * Reports whether a document must be routed through the word-extractor parser.
 *
 * @param mimeType - MIME type of the uploaded document.
 * @returns `true` only when `mimeType` is exactly the legacy .doc MIME type.
 */
export function needsWordExtractor(mimeType: string): boolean {
  const isLegacyDoc = LEGACY_DOC_TYPE === mimeType;
  return isLegacyDoc;
}
/**
 * Reports whether a document must be routed through the SheetJS (xlsx) parser.
 *
 * @param mimeType - MIME type of the uploaded document.
 * @returns `true` only when `mimeType` is exactly the legacy .xls MIME type.
 */
export function needsSheetJS(mimeType: string): boolean {
  const isLegacyXls = LEGACY_XLS_TYPE === mimeType;
  return isLegacyXls;
}
/**
 * Reports whether a document must be routed through the SheetJS `ppt` parser.
 *
 * @param mimeType - MIME type of the uploaded document.
 * @returns `true` only when `mimeType` is exactly the legacy .ppt MIME type.
 */
export function needsSheetJSPPT(mimeType: string): boolean {
  const isLegacyPpt = LEGACY_PPT_TYPE === mimeType;
  return isLegacyPpt;
}
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> { async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
const config = useOcr ? { const config = useOcr ? {
extractAttachments: true, extractAttachments: true,
@ -150,6 +175,34 @@ async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false):
return text; return text;
} }
/**
 * Extracts the body text of a legacy .doc file using word-extractor.
 *
 * Only the value of `getBody()` is returned; nothing else from the parsed
 * document is included.
 *
 * @param buffer - Raw bytes of the document.
 * @returns The document body as a string.
 */
async function extractWithWordExtractor(buffer: Buffer): Promise<string> {
  const parsed = await new WordExtractor().extract(buffer);
  return parsed.getBody();
}
/**
 * Converts every sheet of a workbook to CSV text using SheetJS.
 *
 * Sheets are visited in `SheetNames` order. A sheet whose CSV rendering is
 * empty or whitespace-only is omitted. Each remaining sheet is emitted as a
 * `--- <sheet name> ---` header line followed by its CSV, and the sections
 * are separated by a blank line.
 *
 * @param buffer - Raw bytes of the workbook.
 * @returns The concatenated per-sheet text, or an empty string when no sheet
 *   produced any content.
 */
function extractWithSheetJS(buffer: Buffer): string {
  const book = XLSX.read(buffer, { type: 'buffer' });
  const sections = book.SheetNames.flatMap((name: string) => {
    const rendered = XLSX.utils.sheet_to_csv(book.Sheets[name]);
    // An empty array drops the sheet from the flattened result.
    return rendered.trim() ? [`--- ${name} ---\n${rendered}`] : [];
  });
  return sections.join('\n\n');
}
/**
 * Extracts text from a legacy .ppt file.
 *
 * The buffer is first read as a CFB container, which is then parsed by the
 * `ppt` package; the strings returned by `to_text` are joined with a blank
 * line between entries.
 *
 * @param buffer - Raw bytes of the presentation.
 * @returns The extracted text entries joined by `\n\n`.
 */
function extractWithSheetJSPPT(buffer: Buffer): string {
  const container = CFB.read(buffer, { type: 'buffer' });
  const textEntries = PPT.utils.to_text(PPT.parse_pptcfb(container));
  return textEntries.join('\n\n');
}
export async function processDocument( export async function processDocument(
buffer: Buffer, buffer: Buffer,
mimeType: string, mimeType: string,
@ -188,6 +241,36 @@ export async function processDocument(
}; };
} }
// Legacy .doc files (Word 97-2003)
if (needsWordExtractor(mimeType)) {
const text = await extractWithWordExtractor(buffer);
return {
type: 'text',
content: text,
mimeType: 'text/plain'
};
}
// Legacy .xls files (Excel 97-2003)
if (needsSheetJS(mimeType)) {
const text = extractWithSheetJS(buffer);
return {
type: 'text',
content: text,
mimeType: 'text/plain'
};
}
// Legacy .ppt files (PowerPoint 97-2003)
if (needsSheetJSPPT(mimeType)) {
const text = extractWithSheetJSPPT(buffer);
return {
type: 'text',
content: text,
mimeType: 'text/plain'
};
}
if (needsPlainTextConversion(mimeType)) { if (needsPlainTextConversion(mimeType)) {
return { return {
type: 'text', type: 'text',

46
server/src/types/legacy-office.d.ts vendored Normal file
View file

@ -0,0 +1,46 @@
// Type declarations for legacy Office format parsers
/**
 * Hand-written ambient typings for the `ppt` package, covering only the
 * members declared below.
 */
declare module 'ppt' {
  // No `CFB` namespace is in scope inside this ambient module, so the
  // container type is imported from the 'cfb' module explicitly. Without this
  // the parameter type of `parse_pptcfb` does not resolve.
  import type { Container } from 'cfb';

  /** Parsed presentation as returned by `readFile` / `parse_pptcfb`. */
  interface PPTPresentation {
    docs: Array<{
      slideList: string[];
    }>;
    slides: Array<{
      drawing: {
        groupShape: Array<{
          clientTextbox?: {
            t: string;
          };
        }>;
      };
    }>;
  }

  /** Helper functions exposed as `utils`. */
  interface PPTUtils {
    /** Returns the presentation's text as an array of strings. */
    to_text(pres: PPTPresentation): string[];
  }

  /** Reads and parses a presentation from a file path. */
  export function readFile(filename: string, opts?: { WTF?: number; dump?: number }): PPTPresentation;

  /** Parses a presentation from an already-read CFB container. */
  export function parse_pptcfb(cfb: Container, opts?: object): PPTPresentation;

  export const utils: PPTUtils;
}
// Ambient typings for the `cfb` package, limited to the members declared below.
// NOTE(review): the installed `cfb` package may ship its own typings; confirm
// that overriding them with this narrower declaration is intended.
declare module 'cfb' {
// A single entry in a container's file index.
interface CFBEntry {
name: string;
type: number;
content?: Buffer;
}
// Parsed container as returned by `read` / `parse`.
// presumably FileIndex and FullPaths are parallel arrays — TODO confirm against the cfb docs
interface Container {
FileIndex: CFBEntry[];
FullPaths: string[];
}
// Options for `read`; `type` tells the reader how to interpret `data`.
interface ReadOptions {
type?: 'file' | 'buffer' | 'base64' | 'binary' | 'array';
}
export function read(data: Buffer | string | ArrayBuffer, opts?: ReadOptions): Container;
export function parse(data: Buffer | number[]): Container;
}