From b80254418b007904d3215fce38ea0032202bb1a9 Mon Sep 17 00:00:00 2001 From: Joey Yakimowich-Payne Date: Mon, 19 Jan 2026 15:00:25 -0700 Subject: [PATCH 1/2] Add more file formats --- components/Landing.tsx | 4 +- server/package-lock.json | 230 ++++++++++++++++++++++++++ server/package.json | 1 + server/src/routes/upload.ts | 10 +- server/src/services/documentParser.ts | 58 +++++++ 5 files changed, 297 insertions(+), 6 deletions(-) diff --git a/components/Landing.tsx b/components/Landing.tsx index f932bb3..1f49321 100644 --- a/components/Landing.tsx +++ b/components/Landing.tsx @@ -433,7 +433,7 @@ export const Landing: React.FC = ({ onGenerate, onCreateManual, on multiple className="hidden" onChange={handleFileSelect} - accept=".pdf,.txt,.md,.docx,.pptx,.xlsx,.odt,.odp,.ods,.rtf,.jpg,.jpeg,.png,.gif,.webp" + accept=".pdf,.txt,.md,.csv,.html,.htm,.xml,.json,.js,.ts,.jsx,.tsx,.py,.java,.c,.cpp,.h,.hpp,.css,.yaml,.yml,.docx,.pptx,.xlsx,.odt,.odp,.ods,.rtf,.jpg,.jpeg,.png,.gif,.webp" />
@@ -441,7 +441,7 @@ export const Landing: React.FC = ({ onGenerate, onCreateManual, on

{isDragging ? 'Drop files here' : 'Drop files or click to browse'}

-

PDF, DOCX, PPTX, XLSX, TXT, Images

+

PDF, Office, Text, Code, CSV, HTML, JSON, Images

diff --git a/server/package-lock.json b/server/package-lock.json index aeddd72..3c922d7 100644 --- a/server/package-lock.json +++ b/server/package-lock.json @@ -8,6 +8,7 @@ "name": "kaboot-backend", "version": "1.0.0", "dependencies": { + "@google/genai": "^0.14.1", "better-sqlite3": "^11.7.0", "cors": "^2.8.5", "express": "^4.21.2", @@ -24,6 +25,7 @@ "@types/cors": "^2.8.17", "@types/express": "^5.0.0", "@types/express-rate-limit": "^5.1.3", + "@types/helmet": "^0.0.48", "@types/jsonwebtoken": "^9.0.7", "@types/multer": "^2.0.0", "@types/node": "^22.10.7", @@ -474,6 +476,21 @@ "node": ">=18" } }, + "node_modules/@google/genai": { + "version": "0.14.1", + "resolved": "https://registry.npmjs.org/@google/genai/-/genai-0.14.1.tgz", + "integrity": "sha512-BZ93j4XcvsLEX5RkYE1RqrXLpuzEuH5VGY0geRrHjfpLP3ijDepGePg/iJ7kMSPOTXFYNMeTruNyoTB6TXXgnA==", + "license": "Apache-2.0", + "dependencies": { + "google-auth-library": "^9.14.2", + "ws": "^8.18.0", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.4" + }, + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/@napi-rs/canvas": { "version": "0.1.88", "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.88.tgz", @@ -804,6 +821,16 @@ "@types/send": "*" } }, + "node_modules/@types/helmet": { + "version": "0.0.48", + "resolved": "https://registry.npmjs.org/@types/helmet/-/helmet-0.0.48.tgz", + "integrity": "sha512-C7MpnvSDrunS1q2Oy1VWCY7CDWHozqSnM8P4tFeRTuzwqni+PYOjEredwcqWG+kLpYcgLsgcY3orHB54gbx2Jw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/express": "*" + } + }, "node_modules/@types/http-errors": { "version": "2.0.5", "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.5.tgz", @@ -925,6 +952,15 @@ "node": ">= 0.6" } }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, "node_modules/append-field": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/append-field/-/append-field-1.0.0.tgz", @@ -968,6 +1004,15 @@ "prebuild-install": "^7.1.1" } }, + "node_modules/bignumber.js": { + "version": "9.3.1", + "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz", + "integrity": "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==", + "license": "MIT", + "engines": { + "node": "*" + } + }, "node_modules/bindings": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz", @@ -1469,6 +1514,12 @@ "express": ">= 4.11" } }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "license": "MIT" + }, "node_modules/file-type": { "version": "16.5.4", "resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz", @@ -1558,6 +1609,49 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gaxios": { + "version": "6.7.1", + "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-6.7.1.tgz", + "integrity": "sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==", + "license": "Apache-2.0", + "dependencies": { + "extend": "^3.0.2", + "https-proxy-agent": "^7.0.1", + "is-stream": "^2.0.0", + "node-fetch": "^2.6.9", + "uuid": "^9.0.1" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/gaxios/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/gcp-metadata": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.1.tgz", + "integrity": "sha512-a4tiq7E0/5fTjxPAaH4jpjkSv/uCaU2p5KC6HVGrvl0cDjA8iBZv4vv1gyzlmK0ZUKqwpOyQMKzZQe3lTit77A==", + "license": "Apache-2.0", + "dependencies": { + "gaxios": "^6.1.1", + "google-logging-utils": "^0.0.2", + "json-bigint": "^1.0.0" + }, + "engines": { + "node": ">=14" + } + }, "node_modules/get-intrinsic": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", @@ -1614,6 +1708,32 @@ "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==", "license": "MIT" }, + "node_modules/google-auth-library": { + "version": "9.15.1", + "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-9.15.1.tgz", + "integrity": "sha512-Jb6Z0+nvECVz+2lzSMt9u98UsoakXxA2HGHMCxh+so3n90XgYWkq5dur19JAJV7ONiJY22yBTyJB1TSkvPq9Ng==", + "license": "Apache-2.0", + "dependencies": { + "base64-js": "^1.3.0", + "ecdsa-sig-formatter": "^1.0.11", + "gaxios": "^6.1.1", + "gcp-metadata": "^6.1.0", + "gtoken": "^7.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/google-logging-utils": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/google-logging-utils/-/google-logging-utils-0.0.2.tgz", + "integrity": "sha512-NEgUnEcBiP5HrPzufUkBzJOD/Sxsco3rLNo1F1TNf7ieU8ryUzBhqba8r756CjLX7rn3fHl6iLEwPYuqpoKgQQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=14" + } + }, "node_modules/gopd": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", @@ -1626,6 +1746,19 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gtoken": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/gtoken/-/gtoken-7.1.0.tgz", + "integrity": "sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==", + "license": "MIT", + "dependencies": { + "gaxios": "^6.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/has-symbols": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", @@ -1679,6 +1812,42 @@ "url": "https://opencollective.com/express" } }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent/node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/https-proxy-agent/node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, "node_modules/iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -1747,6 +1916,18 @@ "node": ">= 0.10" } }, + "node_modules/is-stream": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", + "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/is-url": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz", @@ -1762,6 +1943,15 @@ "url": "https://github.com/sponsors/panva" } }, + "node_modules/json-bigint": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", + "integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==", + "license": "MIT", + "dependencies": { + "bignumber.js": "^9.0.0" + } + }, "node_modules/jsonwebtoken": { "version": "9.0.3", "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", @@ -2952,6 +3142,27 @@ "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", "license": "ISC" }, + "node_modules/ws": { + "version": "8.19.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.19.0.tgz", + "integrity": "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg==", + "license": "MIT", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", @@ -2988,6 +3199,25 @@ "engines": { "node": "*" } + }, + "node_modules/zod": { + "version": "3.25.76", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", + "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", + "license": "MIT", + "peer": true, + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } + }, + "node_modules/zod-to-json-schema": { + "version": "3.25.1", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.1.tgz", + "integrity": "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA==", + "license": "ISC", + "peerDependencies": { + "zod": "^3.25 || ^4" + } } } } diff --git a/server/package.json b/server/package.json index e8c603e..3331a10 100644 --- a/server/package.json +++ b/server/package.json @@ -28,6 +28,7 @@ "@types/cors": "^2.8.17", "@types/express": "^5.0.0", "@types/express-rate-limit": "^5.1.3", + "@types/helmet": "^0.0.48", "@types/jsonwebtoken": "^9.0.7", "@types/multer": "^2.0.0", "@types/node": "^22.10.7", diff --git a/server/src/routes/upload.ts b/server/src/routes/upload.ts index 7181623..0ba4ff0 100644 --- a/server/src/routes/upload.ts +++ b/server/src/routes/upload.ts @@ -1,6 +1,6 @@ import { Router } from 'express'; import multer from 'multer'; -import { processDocument, SUPPORTED_TYPES } from '../services/documentParser.js'; +import { processDocument, SUPPORTED_TYPES, normalizeMimeType } from '../services/documentParser.js'; import { requireAuth } from '../middleware/auth.js'; const router = Router(); @@ -15,10 +15,11 @@ const upload = multer({ fileSize: 50 * 1024 * 1024, // 50MB limit }, fileFilter: (_req, file, cb) => { - if (SUPPORTED_TYPES.includes(file.mimetype)) { + const normalizedMime = normalizeMimeType(file.mimetype, file.originalname); + if (SUPPORTED_TYPES.includes(normalizedMime)) { cb(null, true); } else { - cb(new Error(`Unsupported file type: ${file.mimetype}. Supported types: ${SUPPORTED_TYPES.join(', ')}`)); + cb(new Error(`Unsupported file type: ${file.mimetype}. Supported types: PDF, Office docs, text files, code files, and images.`)); } } }); @@ -30,7 +31,8 @@ router.post('/', upload.single('document'), async (req, res) => { } const useOcr = req.body?.useOcr === 'true' || req.body?.useOcr === true; - const processed = await processDocument(req.file.buffer, req.file.mimetype, { useOcr }); + const normalizedMime = normalizeMimeType(req.file.mimetype, req.file.originalname); + const processed = await processDocument(req.file.buffer, normalizedMime, { useOcr }); if (processed.type === 'native') { res.json({ diff --git a/server/src/services/documentParser.ts b/server/src/services/documentParser.ts index ad72b6f..6616866 100644 --- a/server/src/services/documentParser.ts +++ b/server/src/services/documentParser.ts @@ -7,6 +7,19 @@ export const GEMINI_NATIVE_TYPES = [ 'text/markdown', 'text/csv', 'text/html', + 'text/xml', + 'application/xml', + 'application/json', + 'text/javascript', + 'application/javascript', + 'text/x-python', + 'text/x-java-source', + 'text/x-c', + 'text/x-c++', + 'text/x-typescript', + 'text/css', + 'text/yaml', + 'application/x-yaml', 'image/jpeg', 'image/png', 'image/gif', @@ -34,6 +47,51 @@ export const OCR_CAPABLE_TYPES = [ export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES]; +const EXTENSION_TO_MIME: Record = { + '.txt': 'text/plain', + '.md': 'text/markdown', + '.markdown': 'text/markdown', + '.csv': 'text/csv', + '.html': 'text/html', + '.htm': 'text/html', + '.xml': 'application/xml', + '.json': 'application/json', + '.js': 'text/javascript', + '.mjs': 'text/javascript', + '.ts': 'text/x-typescript', + '.tsx': 'text/x-typescript', + '.jsx': 'text/javascript', + '.py': 'text/x-python', + '.java': 'text/x-java-source', + '.c': 'text/x-c', + '.h': 'text/x-c', + '.cpp': 'text/x-c++', + '.hpp': 'text/x-c++', + '.cc': 'text/x-c++', + '.css': 'text/css', + '.yaml': 'text/yaml', + '.yml': 'text/yaml', +}; + +export function normalizeMimeType(mimeType: string, filename?: string): string { + if (SUPPORTED_TYPES.includes(mimeType)) { + return mimeType; + } + + if (mimeType === 'application/octet-stream' && filename) { + const ext = filename.toLowerCase().match(/\.[^.]+$/)?.[0]; + if (ext && EXTENSION_TO_MIME[ext]) { + return EXTENSION_TO_MIME[ext]; + } + } + + if (mimeType.startsWith('text/') && !SUPPORTED_TYPES.includes(mimeType)) { + return 'text/plain'; + } + + return mimeType; +} + export interface ProcessedDocument { type: 'native' | 'text'; content: Buffer | string; From 1d234d6c0ea09456dc25db19ff24587e7fb20708 Mon Sep 17 00:00:00 2001 From: Joey Yakimowich-Payne Date: Mon, 19 Jan 2026 15:02:28 -0700 Subject: [PATCH 2/2] Make gemini compatible --- server/src/services/documentParser.ts | 32 ++++++++++++++++++++------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/server/src/services/documentParser.ts b/server/src/services/documentParser.ts index 6616866..76d148f 100644 --- a/server/src/services/documentParser.ts +++ b/server/src/services/documentParser.ts @@ -1,29 +1,33 @@ import officeParser from 'officeparser'; -// MIME types that Gemini can handle natively (send as-is) export const GEMINI_NATIVE_TYPES = [ 'application/pdf', 'text/plain', 'text/markdown', 'text/csv', 'text/html', + 'text/css', + 'text/javascript', + 'image/jpeg', + 'image/png', + 'image/gif', + 'image/webp', + 'image/heic', + 'image/heif', +]; + +export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [ 'text/xml', 'application/xml', 'application/json', - 'text/javascript', 'application/javascript', 'text/x-python', 'text/x-java-source', 'text/x-c', 'text/x-c++', 'text/x-typescript', - 'text/css', 'text/yaml', 'application/x-yaml', - 'image/jpeg', - 'image/png', - 'image/gif', - 'image/webp' ]; // MIME types that officeparser can extract text from @@ -45,7 +49,7 @@ export const OCR_CAPABLE_TYPES = [ 'image/webp' ]; -export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES]; +export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES, ...TYPES_TO_CONVERT_TO_PLAIN_TEXT]; const EXTENSION_TO_MIME: Record = { '.txt': 'text/plain', @@ -118,6 +122,10 @@ export function needsOfficeParser(mimeType: string): boolean { return OFFICEPARSER_TYPES.includes(mimeType); } +export function needsPlainTextConversion(mimeType: string): boolean { + return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType); +} + async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise { const config = useOcr ? { extractAttachments: true, @@ -177,5 +185,13 @@ export async function processDocument( }; } + if (needsPlainTextConversion(mimeType)) { + return { + type: 'text', + content: buffer.toString('utf-8'), + mimeType: 'text/plain' + }; + } + throw new Error(`No extraction handler for: ${mimeType}`); }