Add ppt deps

This commit is contained in:
Joey Yakimowich-Payne 2026-01-23 14:54:44 -07:00
commit fa2ac3bdfd
No known key found for this signature in database
GPG key ID: 6BFE655FA5ABD1E1
4 changed files with 361 additions and 7 deletions

View file

@ -1,4 +1,8 @@
import officeParser from 'officeparser';
import WordExtractor from 'word-extractor';
import * as XLSX from 'xlsx';
import * as PPT from 'ppt';
import * as CFB from 'cfb';
export const GEMINI_NATIVE_TYPES = [
'application/pdf',
@ -30,20 +34,22 @@ export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [
'application/x-yaml',
];
// MIME types that officeparser can extract text from (modern Office formats).
// The legacy binary formats (.doc, .xls, .ppt) are intentionally absent: they
// are exposed through the LEGACY_*_TYPE constants and use dedicated parsers,
// so listing them here as well would duplicate them in SUPPORTED_TYPES.
export const OFFICEPARSER_TYPES = [
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
  'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx
  'application/vnd.oasis.opendocument.text', // .odt
  'application/vnd.oasis.opendocument.presentation', // .odp
  'application/vnd.oasis.opendocument.spreadsheet', // .ods
  'application/rtf', // .rtf
];
// Legacy Office formats requiring specialized parsers
// Each constant is compared against the incoming MIME type by its matching
// needs*() predicate; the trailing comment names the file extension and the
// library used to extract text from that format.
export const LEGACY_DOC_TYPE = 'application/msword'; // .doc (word-extractor)
export const LEGACY_XLS_TYPE = 'application/vnd.ms-excel'; // .xls (xlsx/SheetJS)
export const LEGACY_PPT_TYPE = 'application/vnd.ms-powerpoint'; // .ppt (ppt/SheetJS)
// Image types that can use OCR for text extraction
export const OCR_CAPABLE_TYPES = [
'image/jpeg',
@ -52,7 +58,14 @@ export const OCR_CAPABLE_TYPES = [
'image/webp'
];
// Combined list of the MIME type groups declared above plus the three legacy
// Office types. Declared exactly once (a second `export const` of the same
// name is a redeclaration error) and de-duplicated, so a MIME type that shows
// up in more than one source list is still reported a single time. Insertion
// order of first occurrences is preserved.
export const SUPPORTED_TYPES = Array.from(
  new Set([
    ...GEMINI_NATIVE_TYPES,
    ...OFFICEPARSER_TYPES,
    ...TYPES_TO_CONVERT_TO_PLAIN_TEXT,
    LEGACY_DOC_TYPE,
    LEGACY_XLS_TYPE,
    LEGACY_PPT_TYPE,
  ]),
);
const EXTENSION_TO_MIME: Record<string, string> = {
'.txt': 'text/plain',
@ -129,6 +142,18 @@ export function needsPlainTextConversion(mimeType: string): boolean {
return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType);
}
/**
 * Reports whether the given MIME type is the legacy Word (.doc) type.
 *
 * @param mimeType - MIME type string to test.
 * @returns `true` only when `mimeType` is exactly `LEGACY_DOC_TYPE`.
 */
export function needsWordExtractor(mimeType: string): boolean {
  const isLegacyDoc = LEGACY_DOC_TYPE === mimeType;
  return isLegacyDoc;
}
/**
 * Reports whether the given MIME type is the legacy Excel (.xls) type.
 *
 * @param mimeType - MIME type string to test.
 * @returns `true` only when `mimeType` is exactly `LEGACY_XLS_TYPE`.
 */
export function needsSheetJS(mimeType: string): boolean {
  const isLegacyXls = LEGACY_XLS_TYPE === mimeType;
  return isLegacyXls;
}
/**
 * Reports whether the given MIME type is the legacy PowerPoint (.ppt) type.
 *
 * @param mimeType - MIME type string to test.
 * @returns `true` only when `mimeType` is exactly `LEGACY_PPT_TYPE`.
 */
export function needsSheetJSPPT(mimeType: string): boolean {
  const isLegacyPpt = LEGACY_PPT_TYPE === mimeType;
  return isLegacyPpt;
}
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
const config = useOcr ? {
extractAttachments: true,
@ -150,6 +175,34 @@ async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false):
return text;
}
/**
 * Extracts the body text of a legacy Word document using word-extractor.
 *
 * Only the value of `getBody()` is returned; nothing else is read from the
 * parsed document.
 *
 * @param buffer - Raw bytes of the document.
 * @returns The body text reported by the extractor.
 */
async function extractWithWordExtractor(buffer: Buffer): Promise<string> {
  const parsedDocument = await new WordExtractor().extract(buffer);
  return parsedDocument.getBody();
}
/**
 * Converts a spreadsheet buffer to plain text using SheetJS.
 *
 * Every sheet is rendered as CSV; sheets whose CSV is empty or whitespace-only
 * are dropped. Each remaining sheet is prefixed with a `--- <sheet name> ---`
 * header line, and the sections are separated by a blank line.
 *
 * @param buffer - Raw bytes of the workbook.
 * @returns The concatenated per-sheet CSV text, or an empty string when no
 *   sheet produced any content.
 */
function extractWithSheetJS(buffer: Buffer): string {
  const { SheetNames, Sheets } = XLSX.read(buffer, { type: 'buffer' });

  return SheetNames
    .map((name) => ({ name, csv: XLSX.utils.sheet_to_csv(Sheets[name]) }))
    .filter(({ csv }) => csv.trim().length > 0)
    .map(({ name, csv }) => `--- ${name} ---\n${csv}`)
    .join('\n\n');
}
/**
 * Converts a legacy PowerPoint buffer to plain text.
 *
 * The buffer is opened as a CFB container, parsed with the `ppt` package, and
 * the strings returned by `PPT.utils.to_text` are joined with a blank line
 * between them.
 *
 * @param buffer - Raw bytes of the .ppt file.
 * @returns The extracted text segments joined by `\n\n`.
 */
function extractWithSheetJSPPT(buffer: Buffer): string {
  const container = CFB.read(buffer, { type: 'buffer' });
  const presentation = PPT.parse_pptcfb(container);
  return PPT.utils.to_text(presentation).join('\n\n');
}
export async function processDocument(
buffer: Buffer,
mimeType: string,
@ -188,6 +241,36 @@ export async function processDocument(
};
}
// Legacy .doc files (Word 97-2003)
if (needsWordExtractor(mimeType)) {
const text = await extractWithWordExtractor(buffer);
return {
type: 'text',
content: text,
mimeType: 'text/plain'
};
}
// Legacy .xls files (Excel 97-2003)
if (needsSheetJS(mimeType)) {
const text = extractWithSheetJS(buffer);
return {
type: 'text',
content: text,
mimeType: 'text/plain'
};
}
// Legacy .ppt files (PowerPoint 97-2003)
if (needsSheetJSPPT(mimeType)) {
const text = extractWithSheetJSPPT(buffer);
return {
type: 'text',
content: text,
mimeType: 'text/plain'
};
}
if (needsPlainTextConversion(mimeType)) {
return {
type: 'text',

46
server/src/types/legacy-office.d.ts vendored Normal file
View file

@ -0,0 +1,46 @@
// Type declarations for legacy Office format parsers
/**
 * Hand-written ambient typings for the `ppt` package (legacy .ppt parser).
 *
 * NOTE(review): the shapes below describe only the fields declared here;
 * they have not been checked against the library's actual output — confirm
 * before relying on anything beyond `parse_pptcfb` and `utils.to_text`.
 */
declare module 'ppt' {
  /** Parsed presentation object returned by `readFile` and `parse_pptcfb`. */
  interface PPTPresentation {
    docs: Array<{
      slideList: string[];
    }>;
    slides: Array<{
      drawing: {
        groupShape: Array<{
          clientTextbox?: {
            t: string;
          };
        }>;
      };
    }>;
  }

  /** Helper functions exposed under `utils`. */
  interface PPTUtils {
    /** Returns the presentation's text as an array of strings. */
    to_text(pres: PPTPresentation): string[];
  }

  /** Reads and parses a presentation from a file path. */
  export function readFile(filename: string, opts?: { WTF?: number; dump?: number }): PPTPresentation;

  /**
   * Parses a presentation from an already-opened CFB container.
   *
   * The container type is referenced through an import type because no `CFB`
   * namespace exists in this declaration file; a bare `CFB.Container` would
   * be an unresolved name.
   */
  export function parse_pptcfb(cfb: import('cfb').Container, opts?: object): PPTPresentation;

  export const utils: PPTUtils;
}
/**
 * Hand-written ambient typings for the `cfb` package (Compound File Binary
 * container reader), limited to the members declared below.
 */
declare module 'cfb' {
  /** Options accepted by `read`; `type` states how the `data` argument is encoded. */
  interface ReadOptions {
    type?: 'file' | 'buffer' | 'base64' | 'binary' | 'array';
  }

  /** A single entry of the container's file index. */
  interface CFBEntry {
    name: string;
    type: number;
    content?: Buffer;
  }

  /** Parsed container as returned by `read` and `parse`. */
  interface Container {
    FileIndex: Array<CFBEntry>;
    FullPaths: Array<string>;
  }

  /** Parses a container directly from a buffer or byte array. */
  export function parse(data: Buffer | number[]): Container;

  /** Reads a container from `data`, interpreted according to `opts.type`. */
  export function read(data: Buffer | string | ArrayBuffer, opts?: ReadOptions): Container;
}