Add more file formats

This commit is contained in:
Joey Yakimowich-Payne 2026-01-19 15:00:25 -07:00
commit b80254418b
No known key found for this signature in database
GPG key ID: DDF6AF5B21B407D4
5 changed files with 297 additions and 6 deletions

View file

@ -433,7 +433,7 @@ export const Landing: React.FC<LandingProps> = ({ onGenerate, onCreateManual, on
multiple multiple
className="hidden" className="hidden"
onChange={handleFileSelect} onChange={handleFileSelect}
accept=".pdf,.txt,.md,.docx,.pptx,.xlsx,.odt,.odp,.ods,.rtf,.jpg,.jpeg,.png,.gif,.webp" accept=".pdf,.txt,.md,.csv,.html,.htm,.xml,.json,.js,.ts,.jsx,.tsx,.py,.java,.c,.cpp,.h,.hpp,.css,.yaml,.yml,.docx,.pptx,.xlsx,.odt,.odp,.ods,.rtf,.jpg,.jpeg,.png,.gif,.webp"
/> />
<div className="space-y-2"> <div className="space-y-2">
@ -441,7 +441,7 @@ export const Landing: React.FC<LandingProps> = ({ onGenerate, onCreateManual, on
<p className="text-sm font-bold text-gray-600"> <p className="text-sm font-bold text-gray-600">
{isDragging ? 'Drop files here' : 'Drop files or click to browse'} {isDragging ? 'Drop files here' : 'Drop files or click to browse'}
</p> </p>
<p className="text-xs text-gray-400">PDF, DOCX, PPTX, XLSX, TXT, Images</p> <p className="text-xs text-gray-400">PDF, Office, Text, Code, CSV, HTML, JSON, Images</p>
</div> </div>
</div> </div>

230
server/package-lock.json generated
View file

@ -8,6 +8,7 @@
"name": "kaboot-backend", "name": "kaboot-backend",
"version": "1.0.0", "version": "1.0.0",
"dependencies": { "dependencies": {
"@google/genai": "^0.14.1",
"better-sqlite3": "^11.7.0", "better-sqlite3": "^11.7.0",
"cors": "^2.8.5", "cors": "^2.8.5",
"express": "^4.21.2", "express": "^4.21.2",
@ -24,6 +25,7 @@
"@types/cors": "^2.8.17", "@types/cors": "^2.8.17",
"@types/express": "^5.0.0", "@types/express": "^5.0.0",
"@types/express-rate-limit": "^5.1.3", "@types/express-rate-limit": "^5.1.3",
"@types/helmet": "^0.0.48",
"@types/jsonwebtoken": "^9.0.7", "@types/jsonwebtoken": "^9.0.7",
"@types/multer": "^2.0.0", "@types/multer": "^2.0.0",
"@types/node": "^22.10.7", "@types/node": "^22.10.7",
@ -474,6 +476,21 @@
"node": ">=18" "node": ">=18"
} }
}, },
"node_modules/@google/genai": {
"version": "0.14.1",
"resolved": "https://registry.npmjs.org/@google/genai/-/genai-0.14.1.tgz",
"integrity": "sha512-BZ93j4XcvsLEX5RkYE1RqrXLpuzEuH5VGY0geRrHjfpLP3ijDepGePg/iJ7kMSPOTXFYNMeTruNyoTB6TXXgnA==",
"license": "Apache-2.0",
"dependencies": {
"google-auth-library": "^9.14.2",
"ws": "^8.18.0",
"zod": "^3.22.4",
"zod-to-json-schema": "^3.22.4"
},
"engines": {
"node": ">=18.0.0"
}
},
"node_modules/@napi-rs/canvas": { "node_modules/@napi-rs/canvas": {
"version": "0.1.88", "version": "0.1.88",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.88.tgz", "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.88.tgz",
@ -804,6 +821,16 @@
"@types/send": "*" "@types/send": "*"
} }
}, },
"node_modules/@types/helmet": {
"version": "0.0.48",
"resolved": "https://registry.npmjs.org/@types/helmet/-/helmet-0.0.48.tgz",
"integrity": "sha512-C7MpnvSDrunS1q2Oy1VWCY7CDWHozqSnM8P4tFeRTuzwqni+PYOjEredwcqWG+kLpYcgLsgcY3orHB54gbx2Jw==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/express": "*"
}
},
"node_modules/@types/http-errors": { "node_modules/@types/http-errors": {
"version": "2.0.5", "version": "2.0.5",
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.5.tgz", "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.5.tgz",
@ -925,6 +952,15 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/agent-base": {
"version": "7.1.4",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
"license": "MIT",
"engines": {
"node": ">= 14"
}
},
"node_modules/append-field": { "node_modules/append-field": {
"version": "1.0.0", "version": "1.0.0",
"resolved": "https://registry.npmjs.org/append-field/-/append-field-1.0.0.tgz", "resolved": "https://registry.npmjs.org/append-field/-/append-field-1.0.0.tgz",
@ -968,6 +1004,15 @@
"prebuild-install": "^7.1.1" "prebuild-install": "^7.1.1"
} }
}, },
"node_modules/bignumber.js": {
"version": "9.3.1",
"resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz",
"integrity": "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==",
"license": "MIT",
"engines": {
"node": "*"
}
},
"node_modules/bindings": { "node_modules/bindings": {
"version": "1.5.0", "version": "1.5.0",
"resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz", "resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz",
@ -1469,6 +1514,12 @@
"express": ">= 4.11" "express": ">= 4.11"
} }
}, },
"node_modules/extend": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
"license": "MIT"
},
"node_modules/file-type": { "node_modules/file-type": {
"version": "16.5.4", "version": "16.5.4",
"resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz", "resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz",
@ -1558,6 +1609,49 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/gaxios": {
"version": "6.7.1",
"resolved": "https://registry.npmjs.org/gaxios/-/gaxios-6.7.1.tgz",
"integrity": "sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==",
"license": "Apache-2.0",
"dependencies": {
"extend": "^3.0.2",
"https-proxy-agent": "^7.0.1",
"is-stream": "^2.0.0",
"node-fetch": "^2.6.9",
"uuid": "^9.0.1"
},
"engines": {
"node": ">=14"
}
},
"node_modules/gaxios/node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"license": "MIT",
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/gcp-metadata": {
"version": "6.1.1",
"resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.1.tgz",
"integrity": "sha512-a4tiq7E0/5fTjxPAaH4jpjkSv/uCaU2p5KC6HVGrvl0cDjA8iBZv4vv1gyzlmK0ZUKqwpOyQMKzZQe3lTit77A==",
"license": "Apache-2.0",
"dependencies": {
"gaxios": "^6.1.1",
"google-logging-utils": "^0.0.2",
"json-bigint": "^1.0.0"
},
"engines": {
"node": ">=14"
}
},
"node_modules/get-intrinsic": { "node_modules/get-intrinsic": {
"version": "1.3.0", "version": "1.3.0",
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
@ -1614,6 +1708,32 @@
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==", "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/google-auth-library": {
"version": "9.15.1",
"resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-9.15.1.tgz",
"integrity": "sha512-Jb6Z0+nvECVz+2lzSMt9u98UsoakXxA2HGHMCxh+so3n90XgYWkq5dur19JAJV7ONiJY22yBTyJB1TSkvPq9Ng==",
"license": "Apache-2.0",
"dependencies": {
"base64-js": "^1.3.0",
"ecdsa-sig-formatter": "^1.0.11",
"gaxios": "^6.1.1",
"gcp-metadata": "^6.1.0",
"gtoken": "^7.0.0",
"jws": "^4.0.0"
},
"engines": {
"node": ">=14"
}
},
"node_modules/google-logging-utils": {
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/google-logging-utils/-/google-logging-utils-0.0.2.tgz",
"integrity": "sha512-NEgUnEcBiP5HrPzufUkBzJOD/Sxsco3rLNo1F1TNf7ieU8ryUzBhqba8r756CjLX7rn3fHl6iLEwPYuqpoKgQQ==",
"license": "Apache-2.0",
"engines": {
"node": ">=14"
}
},
"node_modules/gopd": { "node_modules/gopd": {
"version": "1.2.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
@ -1626,6 +1746,19 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/gtoken": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/gtoken/-/gtoken-7.1.0.tgz",
"integrity": "sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==",
"license": "MIT",
"dependencies": {
"gaxios": "^6.0.0",
"jws": "^4.0.0"
},
"engines": {
"node": ">=14.0.0"
}
},
"node_modules/has-symbols": { "node_modules/has-symbols": {
"version": "1.1.0", "version": "1.1.0",
"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
@ -1679,6 +1812,42 @@
"url": "https://opencollective.com/express" "url": "https://opencollective.com/express"
} }
}, },
"node_modules/https-proxy-agent": {
"version": "7.0.6",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
"integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.2",
"debug": "4"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/https-proxy-agent/node_modules/debug": {
"version": "4.4.3",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
"license": "MIT",
"dependencies": {
"ms": "^2.1.3"
},
"engines": {
"node": ">=6.0"
},
"peerDependenciesMeta": {
"supports-color": {
"optional": true
}
}
},
"node_modules/https-proxy-agent/node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"license": "MIT"
},
"node_modules/iconv-lite": { "node_modules/iconv-lite": {
"version": "0.4.24", "version": "0.4.24",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
@ -1747,6 +1916,18 @@
"node": ">= 0.10" "node": ">= 0.10"
} }
}, },
"node_modules/is-stream": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
"integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
"license": "MIT",
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/is-url": { "node_modules/is-url": {
"version": "1.2.4", "version": "1.2.4",
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz", "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
@ -1762,6 +1943,15 @@
"url": "https://github.com/sponsors/panva" "url": "https://github.com/sponsors/panva"
} }
}, },
"node_modules/json-bigint": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz",
"integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==",
"license": "MIT",
"dependencies": {
"bignumber.js": "^9.0.0"
}
},
"node_modules/jsonwebtoken": { "node_modules/jsonwebtoken": {
"version": "9.0.3", "version": "9.0.3",
"resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz",
@ -2952,6 +3142,27 @@
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
"license": "ISC" "license": "ISC"
}, },
"node_modules/ws": {
"version": "8.19.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.19.0.tgz",
"integrity": "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg==",
"license": "MIT",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/xtend": { "node_modules/xtend": {
"version": "4.0.2", "version": "4.0.2",
"resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz",
@ -2988,6 +3199,25 @@
"engines": { "engines": {
"node": "*" "node": "*"
} }
},
"node_modules/zod": {
"version": "3.25.76",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
"license": "MIT",
"peer": true,
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
},
"node_modules/zod-to-json-schema": {
"version": "3.25.1",
"resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.1.tgz",
"integrity": "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA==",
"license": "ISC",
"peerDependencies": {
"zod": "^3.25 || ^4"
}
} }
} }
} }

View file

@ -28,6 +28,7 @@
"@types/cors": "^2.8.17", "@types/cors": "^2.8.17",
"@types/express": "^5.0.0", "@types/express": "^5.0.0",
"@types/express-rate-limit": "^5.1.3", "@types/express-rate-limit": "^5.1.3",
"@types/helmet": "^0.0.48",
"@types/jsonwebtoken": "^9.0.7", "@types/jsonwebtoken": "^9.0.7",
"@types/multer": "^2.0.0", "@types/multer": "^2.0.0",
"@types/node": "^22.10.7", "@types/node": "^22.10.7",

View file

@ -1,6 +1,6 @@
import { Router } from 'express'; import { Router } from 'express';
import multer from 'multer'; import multer from 'multer';
import { processDocument, SUPPORTED_TYPES } from '../services/documentParser.js'; import { processDocument, SUPPORTED_TYPES, normalizeMimeType } from '../services/documentParser.js';
import { requireAuth } from '../middleware/auth.js'; import { requireAuth } from '../middleware/auth.js';
const router = Router(); const router = Router();
@ -15,10 +15,11 @@ const upload = multer({
fileSize: 50 * 1024 * 1024, // 50MB limit fileSize: 50 * 1024 * 1024, // 50MB limit
}, },
fileFilter: (_req, file, cb) => { fileFilter: (_req, file, cb) => {
if (SUPPORTED_TYPES.includes(file.mimetype)) { const normalizedMime = normalizeMimeType(file.mimetype, file.originalname);
if (SUPPORTED_TYPES.includes(normalizedMime)) {
cb(null, true); cb(null, true);
} else { } else {
cb(new Error(`Unsupported file type: ${file.mimetype}. Supported types: ${SUPPORTED_TYPES.join(', ')}`)); cb(new Error(`Unsupported file type: ${file.mimetype}. Supported types: PDF, Office docs, text files, code files, and images.`));
} }
} }
}); });
@ -30,7 +31,8 @@ router.post('/', upload.single('document'), async (req, res) => {
} }
const useOcr = req.body?.useOcr === 'true' || req.body?.useOcr === true; const useOcr = req.body?.useOcr === 'true' || req.body?.useOcr === true;
const processed = await processDocument(req.file.buffer, req.file.mimetype, { useOcr }); const normalizedMime = normalizeMimeType(req.file.mimetype, req.file.originalname);
const processed = await processDocument(req.file.buffer, normalizedMime, { useOcr });
if (processed.type === 'native') { if (processed.type === 'native') {
res.json({ res.json({

View file

@ -7,6 +7,19 @@ export const GEMINI_NATIVE_TYPES = [
'text/markdown', 'text/markdown',
'text/csv', 'text/csv',
'text/html', 'text/html',
'text/xml',
'application/xml',
'application/json',
'text/javascript',
'application/javascript',
'text/x-python',
'text/x-java-source',
'text/x-c',
'text/x-c++',
'text/x-typescript',
'text/css',
'text/yaml',
'application/x-yaml',
'image/jpeg', 'image/jpeg',
'image/png', 'image/png',
'image/gif', 'image/gif',
@ -34,6 +47,51 @@ export const OCR_CAPABLE_TYPES = [
export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES]; export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES];
/**
 * Lookup table from a lower-case file extension (leading dot included) to the
 * MIME type associated with that extension.
 */
const EXTENSION_TO_MIME: Record<string, string> = (() => {
  // Each row pairs one MIME type with the extensions that resolve to it. Rows
  // are listed in the order their keys should appear in the finished table,
  // which is why 'text/javascript' shows up twice.
  const rows: Array<[string, string[]]> = [
    ['text/plain', ['.txt']],
    ['text/markdown', ['.md', '.markdown']],
    ['text/csv', ['.csv']],
    ['text/html', ['.html', '.htm']],
    ['application/xml', ['.xml']],
    ['application/json', ['.json']],
    ['text/javascript', ['.js', '.mjs']],
    ['text/x-typescript', ['.ts', '.tsx']],
    ['text/javascript', ['.jsx']],
    ['text/x-python', ['.py']],
    ['text/x-java-source', ['.java']],
    ['text/x-c', ['.c', '.h']],
    ['text/x-c++', ['.cpp', '.hpp', '.cc']],
    ['text/css', ['.css']],
    ['text/yaml', ['.yaml', '.yml']],
  ];

  const table: Record<string, string> = {};
  for (const [mime, extensions] of rows) {
    for (const extension of extensions) {
      table[extension] = mime;
    }
  }
  return table;
})();
/**
 * Maps the MIME type declared for an uploaded file onto one of SUPPORTED_TYPES
 * where possible.
 *
 * Resolution order:
 *  1. The declared type is already supported: returned unchanged.
 *  2. The declared type is supported once parameters (e.g. `; charset=utf-8`)
 *     are stripped and case is folded: the bare `type/subtype` is returned.
 *  3. Any other `text/*` type: treated as `text/plain`.
 *  4. Anything else (including an empty type and `application/octet-stream`):
 *     the filename's extension is looked up in EXTENSION_TO_MIME.
 *  5. No match: the declared type is returned as-is so the caller can reject
 *     it and report the original value.
 *
 * @param mimeType - MIME type as reported by the client.
 * @param filename - Original file name; only its last extension is inspected.
 * @returns A supported MIME type when one can be determined, otherwise `mimeType`.
 */
export function normalizeMimeType(mimeType: string, filename?: string): string {
  // Exact match first, so supported entries are never altered by the
  // normalisation below.
  if (SUPPORTED_TYPES.includes(mimeType)) {
    return mimeType;
  }

  // Compare on the bare type/subtype: drop any parameters and fold case.
  const essence = (mimeType.split(';')[0] ?? '').trim().toLowerCase();
  if (SUPPORTED_TYPES.includes(essence)) {
    return essence;
  }

  if (essence.startsWith('text/')) {
    return 'text/plain';
  }

  // The declared type is missing, generic, or unrecognised. Clients commonly
  // mislabel source files (for example `.ts` is widely registered as
  // `video/mp2t`), so fall back to the file extension when it is a known one.
  const extension = filename?.toLowerCase().match(/\.[^.]+$/)?.[0];
  const extensionMime = extension === undefined ? undefined : EXTENSION_TO_MIME[extension];
  if (extensionMime) {
    return extensionMime;
  }

  return mimeType;
}
export interface ProcessedDocument { export interface ProcessedDocument {
type: 'native' | 'text'; type: 'native' | 'text';
content: Buffer | string; content: Buffer | string;