Fix ppt
This commit is contained in:
parent
255497837b
commit
b85970e168
5 changed files with 67 additions and 104 deletions
|
|
@ -2,7 +2,8 @@ FROM node:22-alpine
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
RUN apk add --no-cache python3 make g++
|
# Build dependencies + LibreOffice for legacy Office format conversion
|
||||||
|
RUN apk add --no-cache python3 make g++ libreoffice curl
|
||||||
|
|
||||||
COPY package*.json ./
|
COPY package*.json ./
|
||||||
RUN npm install
|
RUN npm install
|
||||||
|
|
|
||||||
56
server/package-lock.json
generated
56
server/package-lock.json
generated
|
|
@ -18,7 +18,6 @@
|
||||||
"jwks-rsa": "^3.1.0",
|
"jwks-rsa": "^3.1.0",
|
||||||
"multer": "^2.0.2",
|
"multer": "^2.0.2",
|
||||||
"officeparser": "^6.0.4",
|
"officeparser": "^6.0.4",
|
||||||
"ppt": "^0.0.2",
|
|
||||||
"stripe": "^20.2.0",
|
"stripe": "^20.2.0",
|
||||||
"uuid": "^11.0.5",
|
"uuid": "^11.0.5",
|
||||||
"word-extractor": "^1.0.4",
|
"word-extractor": "^1.0.4",
|
||||||
|
|
@ -1209,15 +1208,6 @@
|
||||||
"node": ">=0.8"
|
"node": ">=0.8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/commander": {
|
|
||||||
"version": "14.0.2",
|
|
||||||
"resolved": "https://registry.npmjs.org/commander/-/commander-14.0.2.tgz",
|
|
||||||
"integrity": "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ==",
|
|
||||||
"license": "MIT",
|
|
||||||
"engines": {
|
|
||||||
"node": ">=20"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/concat-stream": {
|
"node_modules/concat-stream": {
|
||||||
"version": "2.0.0",
|
"version": "2.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz",
|
||||||
|
|
@ -2542,40 +2532,6 @@
|
||||||
"integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
|
"integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
"node_modules/ppt": {
|
|
||||||
"version": "0.0.2",
|
|
||||||
"resolved": "https://registry.npmjs.org/ppt/-/ppt-0.0.2.tgz",
|
|
||||||
"integrity": "sha512-qhII2wLFquh/rSJ4UBfywFHKmKy3PoZb7GvWhRAaHHKoVP7rfNZQldAHIgc5gD4s1cSBSBfedZYEY0m+vYYI5A==",
|
|
||||||
"license": "Apache-2.0",
|
|
||||||
"dependencies": {
|
|
||||||
"cfb": ">=0.10.0",
|
|
||||||
"codepage": "~1.3.4",
|
|
||||||
"commander": ""
|
|
||||||
},
|
|
||||||
"bin": {
|
|
||||||
"ppt": "bin/ppt.njs"
|
|
||||||
},
|
|
||||||
"engines": {
|
|
||||||
"node": ">=0.8"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/ppt/node_modules/codepage": {
|
|
||||||
"version": "1.3.8",
|
|
||||||
"resolved": "https://registry.npmjs.org/codepage/-/codepage-1.3.8.tgz",
|
|
||||||
"integrity": "sha512-cjAoQW5L/TCKWRbzt/xGBvhwJKQFhcIVO0jWQtpKQx4gr9qvXNkpRfq6gSmjjA8dB2Is/DPOb7gNwqQXP7UgTQ==",
|
|
||||||
"license": "Apache-2.0",
|
|
||||||
"dependencies": {
|
|
||||||
"commander": "",
|
|
||||||
"concat-stream": "",
|
|
||||||
"voc": ""
|
|
||||||
},
|
|
||||||
"bin": {
|
|
||||||
"codepage": "bin/codepage.njs"
|
|
||||||
},
|
|
||||||
"engines": {
|
|
||||||
"node": ">=0.8"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/prebuild-install": {
|
"node_modules/prebuild-install": {
|
||||||
"version": "7.1.3",
|
"version": "7.1.3",
|
||||||
"resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz",
|
"resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz",
|
||||||
|
|
@ -3277,18 +3233,6 @@
|
||||||
"node": ">= 0.8"
|
"node": ">= 0.8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/voc": {
|
|
||||||
"version": "1.2.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/voc/-/voc-1.2.0.tgz",
|
|
||||||
"integrity": "sha512-BOuDjFFYvJdZO6e/N65AlaDItXo2TgyLjeyRYcqgAPkXpp5yTJcvkL2n+syO1r9Qc5g96tfBD2tuiMhYDmaGcA==",
|
|
||||||
"license": "Apache-2.0",
|
|
||||||
"bin": {
|
|
||||||
"voc": "voc.njs"
|
|
||||||
},
|
|
||||||
"engines": {
|
|
||||||
"node": ">=0.8"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/wasm-feature-detect": {
|
"node_modules/wasm-feature-detect": {
|
||||||
"version": "1.8.0",
|
"version": "1.8.0",
|
||||||
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
|
"resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz",
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,6 @@
|
||||||
"jwks-rsa": "^3.1.0",
|
"jwks-rsa": "^3.1.0",
|
||||||
"multer": "^2.0.2",
|
"multer": "^2.0.2",
|
||||||
"officeparser": "^6.0.4",
|
"officeparser": "^6.0.4",
|
||||||
"ppt": "^0.0.2",
|
|
||||||
"stripe": "^20.2.0",
|
"stripe": "^20.2.0",
|
||||||
"uuid": "^11.0.5",
|
"uuid": "^11.0.5",
|
||||||
"word-extractor": "^1.0.4",
|
"word-extractor": "^1.0.4",
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
import officeParser from 'officeparser';
|
import officeParser from 'officeparser';
|
||||||
import WordExtractor from 'word-extractor';
|
import WordExtractor from 'word-extractor';
|
||||||
import * as XLSX from 'xlsx';
|
import * as XLSX from 'xlsx';
|
||||||
import PPT from 'ppt';
|
import { execSync } from 'child_process';
|
||||||
import { writeFileSync, unlinkSync } from 'fs';
|
import { writeFileSync, readFileSync, unlinkSync, existsSync } from 'fs';
|
||||||
import { tmpdir } from 'os';
|
import { tmpdir } from 'os';
|
||||||
import { join } from 'path';
|
import { join } from 'path';
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
|
|
@ -199,20 +199,70 @@ function extractWithSheetJS(buffer: Buffer): string {
|
||||||
return textParts.join('\n\n');
|
return textParts.join('\n\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
function extractWithSheetJSPPT(buffer: Buffer): string {
|
// Map legacy extensions to their modern equivalents for LibreOffice conversion
|
||||||
// PPT library requires file path due to CFB API compatibility issues
|
const LEGACY_TO_MODERN: Record<string, string> = {
|
||||||
const tempPath = join(tmpdir(), `ppt-${randomUUID()}.ppt`);
|
'.ppt': 'pptx',
|
||||||
|
'.doc': 'docx',
|
||||||
|
'.xls': 'xlsx',
|
||||||
|
};
|
||||||
|
|
||||||
|
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
|
||||||
|
// Use LibreOffice to convert legacy Office files to modern format, then parse
|
||||||
|
const tempId = randomUUID();
|
||||||
|
const tempDir = tmpdir();
|
||||||
|
const inputPath = join(tempDir, `input-${tempId}${extension}`);
|
||||||
|
const modernExt = LEGACY_TO_MODERN[extension] || 'pdf';
|
||||||
|
const outputPath = join(tempDir, `input-${tempId}.${modernExt}`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
writeFileSync(tempPath, buffer);
|
// Write input file
|
||||||
const pres = PPT.readFile(tempPath);
|
writeFileSync(inputPath, buffer);
|
||||||
const textArray = PPT.utils.to_text(pres);
|
|
||||||
return textArray.join('\n\n');
|
// Convert to modern format using LibreOffice
|
||||||
} finally {
|
|
||||||
try {
|
try {
|
||||||
unlinkSync(tempPath);
|
execSync(
|
||||||
} catch {
|
`libreoffice --headless --convert-to ${modernExt} --outdir "${tempDir}" "${inputPath}"`,
|
||||||
// Ignore cleanup errors
|
{ timeout: 60000, stdio: 'pipe' }
|
||||||
|
);
|
||||||
|
} catch (execError) {
|
||||||
|
// LibreOffice not available or conversion failed
|
||||||
|
const error = execError as Error & { code?: string };
|
||||||
|
if (error.code === 'ENOENT' || error.message?.includes('not found')) {
|
||||||
|
throw new Error(
|
||||||
|
`Legacy ${extension} files require LibreOffice for text extraction. ` +
|
||||||
|
`Please convert to .${modernExt} format or ensure LibreOffice is installed.`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
throw new Error(`LibreOffice conversion failed: ${error.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read the converted file and extract text using officeparser (with OCR if enabled)
|
||||||
|
if (existsSync(outputPath)) {
|
||||||
|
const convertedBuffer = readFileSync(outputPath);
|
||||||
|
const config = useOcr ? {
|
||||||
|
extractAttachments: true,
|
||||||
|
ocr: true,
|
||||||
|
ocrLanguage: 'eng'
|
||||||
|
} : {};
|
||||||
|
const ast = await officeParser.parseOffice(convertedBuffer, config);
|
||||||
|
let text = ast.toText();
|
||||||
|
|
||||||
|
// Include OCR text from attachments if available
|
||||||
|
if (useOcr && ast.attachments) {
|
||||||
|
for (const attachment of ast.attachments) {
|
||||||
|
if (attachment.ocrText) {
|
||||||
|
text += '\n' + attachment.ocrText;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error('LibreOffice conversion produced no output');
|
||||||
|
} finally {
|
||||||
|
// Cleanup temp files
|
||||||
|
try { unlinkSync(inputPath); } catch { /* ignore */ }
|
||||||
|
try { unlinkSync(outputPath); } catch { /* ignore */ }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -274,9 +324,9 @@ export async function processDocument(
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Legacy .ppt files (PowerPoint 97-2003)
|
// Legacy .ppt files (PowerPoint 97-2003) - use LibreOffice for conversion
|
||||||
if (needsSheetJSPPT(mimeType)) {
|
if (needsSheetJSPPT(mimeType)) {
|
||||||
const text = extractWithSheetJSPPT(buffer);
|
const text = await extractWithLibreOffice(buffer, '.ppt', options.useOcr);
|
||||||
return {
|
return {
|
||||||
type: 'text',
|
type: 'text',
|
||||||
content: text,
|
content: text,
|
||||||
|
|
|
||||||
31
server/src/types/legacy-office.d.ts
vendored
31
server/src/types/legacy-office.d.ts
vendored
|
|
@ -1,31 +0,0 @@
|
||||||
// Type declarations for legacy Office format parsers
|
|
||||||
|
|
||||||
declare module 'ppt' {
|
|
||||||
interface PPTPresentation {
|
|
||||||
docs: Array<{
|
|
||||||
slideList: string[];
|
|
||||||
}>;
|
|
||||||
slides: Array<{
|
|
||||||
drawing: {
|
|
||||||
groupShape: Array<{
|
|
||||||
clientTextbox?: {
|
|
||||||
t: string;
|
|
||||||
};
|
|
||||||
}>;
|
|
||||||
};
|
|
||||||
}>;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface PPTUtils {
|
|
||||||
to_text(pres: PPTPresentation): string[];
|
|
||||||
}
|
|
||||||
|
|
||||||
interface PPTModule {
|
|
||||||
version: string;
|
|
||||||
readFile(filename: string, opts?: { WTF?: number; dump?: number }): PPTPresentation;
|
|
||||||
utils: PPTUtils;
|
|
||||||
}
|
|
||||||
|
|
||||||
const PPT: PPTModule;
|
|
||||||
export default PPT;
|
|
||||||
}
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue