Fix .ppt text extraction: convert legacy PowerPoint files with LibreOffice instead of the ppt library
This commit is contained in:
parent
255497837b
commit
b85970e168
5 changed files with 67 additions and 104 deletions
|
|
@ -1,8 +1,8 @@
|
|||
import { execFileSync, execSync } from 'child_process';
import { randomUUID } from 'crypto';
import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';

import officeParser from 'officeparser';
import PPT from 'ppt';
import WordExtractor from 'word-extractor';
import * as XLSX from 'xlsx';
|
||||
|
|
@ -199,20 +199,70 @@ function extractWithSheetJS(buffer: Buffer): string {
|
|||
return textParts.join('\n\n');
|
||||
}
|
||||
|
||||
/**
 * Extracts text from a legacy .ppt buffer using the `ppt` library.
 *
 * The buffer is written to a uniquely named file in the OS temp directory,
 * read back through `PPT.readFile`, and the temp file is removed again
 * whether or not parsing succeeds.
 *
 * NOTE(review): this looks like the earlier implementation that
 * `extractWithLibreOffice` is meant to supersede — confirm whether any
 * caller still needs it.
 *
 * @param buffer - Raw bytes of the .ppt file.
 * @returns The entries produced by `PPT.utils.to_text`, joined by blank lines.
 */
function extractWithSheetJSPPT(buffer: Buffer): string {
  // PPT library requires file path due to CFB API compatibility issues
  const tempPath = join(tmpdir(), `ppt-${randomUUID()}.ppt`);

  try {
    writeFileSync(tempPath, buffer);
    const pres = PPT.readFile(tempPath);
    const textArray = PPT.utils.to_text(pres);
    return textArray.join('\n\n');
  } finally {
    try {
      unlinkSync(tempPath);
    } catch {
      // Ignore cleanup errors
    }
  }
}

// Map legacy extensions to their modern equivalents for LibreOffice conversion
const LEGACY_TO_MODERN: Record<string, string> = {
  '.ppt': 'pptx',
  '.doc': 'docx',
  '.xls': 'xlsx',
};

/**
 * Converts a legacy Office file to its modern format with LibreOffice and
 * extracts the text of the converted file with officeparser.
 *
 * The input buffer and the converted file both live in the OS temp directory
 * under a random id and are deleted before the function settles, on success
 * and on failure alike.
 *
 * @param buffer - Raw bytes of the legacy file.
 * @param extension - File extension including the leading dot (e.g. `'.ppt'`).
 *   Extensions missing from `LEGACY_TO_MODERN` are converted to PDF.
 * @param useOcr - When true, officeparser is asked to extract attachments and
 *   OCR them, and any attachment OCR text is appended to the result.
 * @returns The extracted text.
 * @throws Error if LibreOffice cannot be found, if the conversion fails, or
 *   if the conversion leaves no output file behind.
 */
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
  const tempId = randomUUID();
  const tempDir = tmpdir();
  const inputPath = join(tempDir, `input-${tempId}${extension}`);
  // Look the extension up case-insensitively so '.PPT' maps like '.ppt'.
  const modernExt = LEGACY_TO_MODERN[extension.toLowerCase()] ?? 'pdf';
  // LibreOffice names its output after the input's base name plus the target extension.
  const outputPath = join(tempDir, `input-${tempId}.${modernExt}`);

  try {
    // Write input file
    writeFileSync(inputPath, buffer);

    // Convert to modern format using LibreOffice. Arguments are passed as an
    // array (no shell), so paths containing quotes, spaces or `$` are safe and
    // a missing binary surfaces as a real ENOENT error code.
    try {
      execFileSync(
        'libreoffice',
        ['--headless', '--convert-to', modernExt, '--outdir', tempDir, inputPath],
        { timeout: 60000, stdio: 'pipe' }
      );
    } catch (execError: unknown) {
      // LibreOffice not available or conversion failed
      const code = (execError as { code?: unknown } | null)?.code;
      const message = execError instanceof Error ? execError.message : String(execError);
      if (code === 'ENOENT' || message.includes('not found')) {
        throw new Error(
          `Legacy ${extension} files require LibreOffice for text extraction. ` +
          `Please convert to .${modernExt} format or ensure LibreOffice is installed.`
        );
      }
      throw new Error(`LibreOffice conversion failed: ${message}`);
    }

    // LibreOffice can exit cleanly without producing a file.
    if (!existsSync(outputPath)) {
      throw new Error('LibreOffice conversion produced no output');
    }

    // Read the converted file and extract text using officeparser (with OCR if enabled)
    const convertedBuffer = readFileSync(outputPath);
    const config = useOcr ? {
      extractAttachments: true,
      ocr: true,
      ocrLanguage: 'eng'
    } : {};
    const ast = await officeParser.parseOffice(convertedBuffer, config);
    let text = ast.toText();

    // Include OCR text from attachments if available
    if (useOcr && ast.attachments) {
      for (const attachment of ast.attachments) {
        if (attachment.ocrText) {
          text += '\n' + attachment.ocrText;
        }
      }
    }
    return text;
  } finally {
    // Cleanup temp files; either may be missing if an earlier step failed.
    try { unlinkSync(inputPath); } catch { /* ignore */ }
    try { unlinkSync(outputPath); } catch { /* ignore */ }
  }
}
|
||||
|
||||
|
|
@ -274,9 +324,9 @@ export async function processDocument(
|
|||
};
|
||||
}
|
||||
|
||||
// Legacy .ppt files (PowerPoint 97-2003)
|
||||
// Legacy .ppt files (PowerPoint 97-2003) - use LibreOffice for conversion
|
||||
if (needsSheetJSPPT(mimeType)) {
|
||||
const text = extractWithSheetJSPPT(buffer);
|
||||
const text = await extractWithLibreOffice(buffer, '.ppt', options.useOcr);
|
||||
return {
|
||||
type: 'text',
|
||||
content: text,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue