Add ppt deps
This commit is contained in:
parent
cac058b643
commit
fa2ac3bdfd
4 changed files with 361 additions and 7 deletions
|
|
@ -1,4 +1,8 @@
|
|||
import officeParser from 'officeparser';
|
||||
import WordExtractor from 'word-extractor';
|
||||
import * as XLSX from 'xlsx';
|
||||
import * as PPT from 'ppt';
|
||||
import * as CFB from 'cfb';
|
||||
|
||||
export const GEMINI_NATIVE_TYPES = [
|
||||
'application/pdf',
|
||||
|
|
@ -30,20 +34,22 @@ export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [
|
|||
'application/x-yaml',
|
||||
];
|
||||
|
||||
// MIME types that officeparser can extract text from
|
||||
// MIME types that officeparser can extract text from (modern Office formats)
|
||||
export const OFFICEPARSER_TYPES = [
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', // .docx
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation', // .pptx
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', // .xlsx
|
||||
'application/msword', // .doc
|
||||
'application/vnd.ms-powerpoint', // .ppt
|
||||
'application/vnd.ms-excel', // .xls
|
||||
'application/vnd.oasis.opendocument.text', // .odt
|
||||
'application/vnd.oasis.opendocument.presentation', // .odp
|
||||
'application/vnd.oasis.opendocument.spreadsheet', // .ods
|
||||
'application/rtf', // .rtf
|
||||
];
|
||||
|
||||
// Legacy Office formats requiring specialized parsers
|
||||
export const LEGACY_DOC_TYPE = 'application/msword'; // .doc (word-extractor)
|
||||
export const LEGACY_XLS_TYPE = 'application/vnd.ms-excel'; // .xls (xlsx/SheetJS)
|
||||
export const LEGACY_PPT_TYPE = 'application/vnd.ms-powerpoint'; // .ppt (ppt/SheetJS)
|
||||
|
||||
// Image types that can use OCR for text extraction
|
||||
export const OCR_CAPABLE_TYPES = [
|
||||
'image/jpeg',
|
||||
|
|
@ -52,7 +58,14 @@ export const OCR_CAPABLE_TYPES = [
|
|||
'image/webp'
|
||||
];
|
||||
|
||||
export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES, ...TYPES_TO_CONVERT_TO_PLAIN_TEXT];
|
||||
export const SUPPORTED_TYPES = [
|
||||
...GEMINI_NATIVE_TYPES,
|
||||
...OFFICEPARSER_TYPES,
|
||||
...TYPES_TO_CONVERT_TO_PLAIN_TEXT,
|
||||
LEGACY_DOC_TYPE,
|
||||
LEGACY_XLS_TYPE,
|
||||
LEGACY_PPT_TYPE,
|
||||
];
|
||||
|
||||
const EXTENSION_TO_MIME: Record<string, string> = {
|
||||
'.txt': 'text/plain',
|
||||
|
|
@ -129,6 +142,18 @@ export function needsPlainTextConversion(mimeType: string): boolean {
|
|||
return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType);
|
||||
}
|
||||
|
||||
/**
 * Tells whether a document must be routed to the word-extractor path.
 *
 * @param mimeType MIME type of the incoming document.
 * @returns `true` only when `mimeType` is exactly `LEGACY_DOC_TYPE`.
 */
export function needsWordExtractor(mimeType: string): boolean {
  const isLegacyDoc = LEGACY_DOC_TYPE === mimeType;
  return isLegacyDoc;
}
|
||||
|
||||
/**
 * Tells whether a document must be routed to the SheetJS spreadsheet path.
 *
 * @param mimeType MIME type of the incoming document.
 * @returns `true` only when `mimeType` is exactly `LEGACY_XLS_TYPE`.
 */
export function needsSheetJS(mimeType: string): boolean {
  const isLegacyXls = LEGACY_XLS_TYPE === mimeType;
  return isLegacyXls;
}
|
||||
|
||||
/**
 * Tells whether a document must be routed to the `ppt` presentation path.
 *
 * @param mimeType MIME type of the incoming document.
 * @returns `true` only when `mimeType` is exactly `LEGACY_PPT_TYPE`.
 */
export function needsSheetJSPPT(mimeType: string): boolean {
  const isLegacyPpt = LEGACY_PPT_TYPE === mimeType;
  return isLegacyPpt;
}
|
||||
|
||||
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
|
||||
const config = useOcr ? {
|
||||
extractAttachments: true,
|
||||
|
|
@ -150,6 +175,34 @@ async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false):
|
|||
return text;
|
||||
}
|
||||
|
||||
/**
 * Extracts the body text of a document using `word-extractor`.
 *
 * @param buffer Raw bytes of the document.
 * @returns The string produced by the extracted document's `getBody()`.
 */
async function extractWithWordExtractor(buffer: Buffer): Promise<string> {
  // A fresh extractor is created per call, exactly one extraction each.
  const parsedDocument = await new WordExtractor().extract(buffer);
  return parsedDocument.getBody();
}
|
||||
|
||||
/**
 * Renders every sheet of a workbook as CSV and concatenates the results.
 *
 * Each non-blank sheet becomes a section of the form
 * `--- <sheet name> ---` followed by its CSV on the next line; sections are
 * separated by a blank line. Sheets whose CSV is empty or whitespace-only are
 * omitted.
 *
 * @param buffer Raw bytes of the workbook, read with `XLSX.read`.
 * @returns The joined sections, or an empty string when no sheet has content.
 */
function extractWithSheetJS(buffer: Buffer): string {
  const { SheetNames, Sheets } = XLSX.read(buffer, { type: 'buffer' });

  return SheetNames
    .map((name) => ({ name, csv: XLSX.utils.sheet_to_csv(Sheets[name]) }))
    // Drop sheets that contain nothing but whitespace.
    .filter(({ csv }) => csv.trim() !== '')
    .map(({ name, csv }) => `--- ${name} ---\n${csv}`)
    .join('\n\n');
}
|
||||
|
||||
/**
 * Extracts text from a presentation using the `cfb` and `ppt` packages.
 *
 * The buffer is read as a CFB container, parsed with `PPT.parse_pptcfb`, and
 * the strings returned by `PPT.utils.to_text` are joined with a blank line
 * between consecutive entries.
 *
 * @param buffer Raw bytes of the presentation file.
 * @returns The joined text entries.
 */
function extractWithSheetJSPPT(buffer: Buffer): string {
  const presentation = PPT.parse_pptcfb(CFB.read(buffer, { type: 'buffer' }));
  return PPT.utils.to_text(presentation).join('\n\n');
}
|
||||
|
||||
export async function processDocument(
|
||||
buffer: Buffer,
|
||||
mimeType: string,
|
||||
|
|
@ -188,6 +241,36 @@ export async function processDocument(
|
|||
};
|
||||
}
|
||||
|
||||
// Legacy .doc files (Word 97-2003)
|
||||
if (needsWordExtractor(mimeType)) {
|
||||
const text = await extractWithWordExtractor(buffer);
|
||||
return {
|
||||
type: 'text',
|
||||
content: text,
|
||||
mimeType: 'text/plain'
|
||||
};
|
||||
}
|
||||
|
||||
// Legacy .xls files (Excel 97-2003)
|
||||
if (needsSheetJS(mimeType)) {
|
||||
const text = extractWithSheetJS(buffer);
|
||||
return {
|
||||
type: 'text',
|
||||
content: text,
|
||||
mimeType: 'text/plain'
|
||||
};
|
||||
}
|
||||
|
||||
// Legacy .ppt files (PowerPoint 97-2003)
|
||||
if (needsSheetJSPPT(mimeType)) {
|
||||
const text = extractWithSheetJSPPT(buffer);
|
||||
return {
|
||||
type: 'text',
|
||||
content: text,
|
||||
mimeType: 'text/plain'
|
||||
};
|
||||
}
|
||||
|
||||
if (needsPlainTextConversion(mimeType)) {
|
||||
return {
|
||||
type: 'text',
|
||||
|
|
|
|||
46
server/src/types/legacy-office.d.ts
vendored
Normal file
46
server/src/types/legacy-office.d.ts
vendored
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
// Type declarations for legacy Office format parsers
|
||||
|
||||
/**
 * Local ambient typings for the `ppt` package (legacy PowerPoint parser).
 *
 * NOTE(review): these shapes are hand-written; verify them against the
 * installed package version before relying on fields beyond `utils.to_text`.
 */
declare module 'ppt' {
  // Bring the container type into scope explicitly: there is no global `CFB`
  // namespace declared in this file, so `CFB.Container` would not resolve.
  import type { Container } from 'cfb';

  /** Parsed presentation object, as returned by `readFile` / `parse_pptcfb`. */
  export interface PPTPresentation {
    docs: Array<{
      slideList: string[];
    }>;
    slides: Array<{
      drawing: {
        groupShape: Array<{
          clientTextbox?: {
            t: string;
          };
        }>;
      };
    }>;
  }

  /** Helper functions exposed under `utils`. */
  export interface PPTUtils {
    /** Converts a parsed presentation into an array of strings. */
    to_text(pres: PPTPresentation): string[];
  }

  /** Reads and parses a presentation from a file path. */
  export function readFile(filename: string, opts?: { WTF?: number; dump?: number }): PPTPresentation;

  /** Parses a presentation from an already-read CFB container. */
  export function parse_pptcfb(cfb: Container, opts?: object): PPTPresentation;

  export const utils: PPTUtils;
}
|
||||
|
||||
/**
 * Local ambient typings for the `cfb` package (Compound File Binary reader).
 *
 * NOTE(review): hand-written shapes; confirm against the installed package.
 */
declare module 'cfb' {
  /** A single entry listed in a container's `FileIndex`. */
  export interface CFBEntry {
    name: string;
    type: number;
    content?: Buffer;
  }

  /** A parsed compound file. */
  export interface Container {
    FileIndex: Array<CFBEntry>;
    FullPaths: Array<string>;
  }

  /** Options accepted by `read`; `type` names how the input data is encoded. */
  export interface ReadOptions {
    type?:
      | 'file'
      | 'buffer'
      | 'base64'
      | 'binary'
      | 'array';
  }

  /** Reads a container from the given data, interpreted according to `opts.type`. */
  export function read(data: Buffer | string | ArrayBuffer, opts?: ReadOptions): Container;

  /** Parses a container from a buffer or an array of byte values. */
  export function parse(data: Buffer | number[]): Container;
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue