Compare commits
2 commits
79820f5298
...
1d234d6c0e
| Author | SHA1 | Date | |
|---|---|---|---|
|
1d234d6c0e |
|||
|
b80254418b |
5 changed files with 316 additions and 9 deletions
|
|
@ -433,7 +433,7 @@ export const Landing: React.FC<LandingProps> = ({ onGenerate, onCreateManual, on
|
|||
multiple
|
||||
className="hidden"
|
||||
onChange={handleFileSelect}
|
||||
accept=".pdf,.txt,.md,.docx,.pptx,.xlsx,.odt,.odp,.ods,.rtf,.jpg,.jpeg,.png,.gif,.webp"
|
||||
accept=".pdf,.txt,.md,.csv,.html,.htm,.xml,.json,.js,.ts,.jsx,.tsx,.py,.java,.c,.cpp,.h,.hpp,.css,.yaml,.yml,.docx,.pptx,.xlsx,.odt,.odp,.ods,.rtf,.jpg,.jpeg,.png,.gif,.webp"
|
||||
/>
|
||||
|
||||
<div className="space-y-2">
|
||||
|
|
@ -441,7 +441,7 @@ export const Landing: React.FC<LandingProps> = ({ onGenerate, onCreateManual, on
|
|||
<p className="text-sm font-bold text-gray-600">
|
||||
{isDragging ? 'Drop files here' : 'Drop files or click to browse'}
|
||||
</p>
|
||||
<p className="text-xs text-gray-400">PDF, DOCX, PPTX, XLSX, TXT, Images</p>
|
||||
<p className="text-xs text-gray-400">PDF, Office, Text, Code, CSV, HTML, JSON, Images</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
|||
230
server/package-lock.json
generated
230
server/package-lock.json
generated
|
|
@ -8,6 +8,7 @@
|
|||
"name": "kaboot-backend",
|
||||
"version": "1.0.0",
|
||||
"dependencies": {
|
||||
"@google/genai": "^0.14.1",
|
||||
"better-sqlite3": "^11.7.0",
|
||||
"cors": "^2.8.5",
|
||||
"express": "^4.21.2",
|
||||
|
|
@ -24,6 +25,7 @@
|
|||
"@types/cors": "^2.8.17",
|
||||
"@types/express": "^5.0.0",
|
||||
"@types/express-rate-limit": "^5.1.3",
|
||||
"@types/helmet": "^0.0.48",
|
||||
"@types/jsonwebtoken": "^9.0.7",
|
||||
"@types/multer": "^2.0.0",
|
||||
"@types/node": "^22.10.7",
|
||||
|
|
@ -474,6 +476,21 @@
|
|||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@google/genai": {
|
||||
"version": "0.14.1",
|
||||
"resolved": "https://registry.npmjs.org/@google/genai/-/genai-0.14.1.tgz",
|
||||
"integrity": "sha512-BZ93j4XcvsLEX5RkYE1RqrXLpuzEuH5VGY0geRrHjfpLP3ijDepGePg/iJ7kMSPOTXFYNMeTruNyoTB6TXXgnA==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"google-auth-library": "^9.14.2",
|
||||
"ws": "^8.18.0",
|
||||
"zod": "^3.22.4",
|
||||
"zod-to-json-schema": "^3.22.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas": {
|
||||
"version": "0.1.88",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.88.tgz",
|
||||
|
|
@ -804,6 +821,16 @@
|
|||
"@types/send": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/helmet": {
|
||||
"version": "0.0.48",
|
||||
"resolved": "https://registry.npmjs.org/@types/helmet/-/helmet-0.0.48.tgz",
|
||||
"integrity": "sha512-C7MpnvSDrunS1q2Oy1VWCY7CDWHozqSnM8P4tFeRTuzwqni+PYOjEredwcqWG+kLpYcgLsgcY3orHB54gbx2Jw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/express": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/http-errors": {
|
||||
"version": "2.0.5",
|
||||
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.5.tgz",
|
||||
|
|
@ -925,6 +952,15 @@
|
|||
"node": ">= 0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/agent-base": {
|
||||
"version": "7.1.4",
|
||||
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
|
||||
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/append-field": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/append-field/-/append-field-1.0.0.tgz",
|
||||
|
|
@ -968,6 +1004,15 @@
|
|||
"prebuild-install": "^7.1.1"
|
||||
}
|
||||
},
|
||||
"node_modules/bignumber.js": {
|
||||
"version": "9.3.1",
|
||||
"resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz",
|
||||
"integrity": "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/bindings": {
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz",
|
||||
|
|
@ -1469,6 +1514,12 @@
|
|||
"express": ">= 4.11"
|
||||
}
|
||||
},
|
||||
"node_modules/extend": {
|
||||
"version": "3.0.2",
|
||||
"resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
|
||||
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/file-type": {
|
||||
"version": "16.5.4",
|
||||
"resolved": "https://registry.npmjs.org/file-type/-/file-type-16.5.4.tgz",
|
||||
|
|
@ -1558,6 +1609,49 @@
|
|||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/gaxios": {
|
||||
"version": "6.7.1",
|
||||
"resolved": "https://registry.npmjs.org/gaxios/-/gaxios-6.7.1.tgz",
|
||||
"integrity": "sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"extend": "^3.0.2",
|
||||
"https-proxy-agent": "^7.0.1",
|
||||
"is-stream": "^2.0.0",
|
||||
"node-fetch": "^2.6.9",
|
||||
"uuid": "^9.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14"
|
||||
}
|
||||
},
|
||||
"node_modules/gaxios/node_modules/uuid": {
|
||||
"version": "9.0.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
|
||||
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
"https://github.com/sponsors/ctavan"
|
||||
],
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/gcp-metadata": {
|
||||
"version": "6.1.1",
|
||||
"resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.1.tgz",
|
||||
"integrity": "sha512-a4tiq7E0/5fTjxPAaH4jpjkSv/uCaU2p5KC6HVGrvl0cDjA8iBZv4vv1gyzlmK0ZUKqwpOyQMKzZQe3lTit77A==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"gaxios": "^6.1.1",
|
||||
"google-logging-utils": "^0.0.2",
|
||||
"json-bigint": "^1.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14"
|
||||
}
|
||||
},
|
||||
"node_modules/get-intrinsic": {
|
||||
"version": "1.3.0",
|
||||
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
|
||||
|
|
@ -1614,6 +1708,32 @@
|
|||
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/google-auth-library": {
|
||||
"version": "9.15.1",
|
||||
"resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-9.15.1.tgz",
|
||||
"integrity": "sha512-Jb6Z0+nvECVz+2lzSMt9u98UsoakXxA2HGHMCxh+so3n90XgYWkq5dur19JAJV7ONiJY22yBTyJB1TSkvPq9Ng==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"base64-js": "^1.3.0",
|
||||
"ecdsa-sig-formatter": "^1.0.11",
|
||||
"gaxios": "^6.1.1",
|
||||
"gcp-metadata": "^6.1.0",
|
||||
"gtoken": "^7.0.0",
|
||||
"jws": "^4.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14"
|
||||
}
|
||||
},
|
||||
"node_modules/google-logging-utils": {
|
||||
"version": "0.0.2",
|
||||
"resolved": "https://registry.npmjs.org/google-logging-utils/-/google-logging-utils-0.0.2.tgz",
|
||||
"integrity": "sha512-NEgUnEcBiP5HrPzufUkBzJOD/Sxsco3rLNo1F1TNf7ieU8ryUzBhqba8r756CjLX7rn3fHl6iLEwPYuqpoKgQQ==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=14"
|
||||
}
|
||||
},
|
||||
"node_modules/gopd": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
|
||||
|
|
@ -1626,6 +1746,19 @@
|
|||
"url": "https://github.com/sponsors/ljharb"
|
||||
}
|
||||
},
|
||||
"node_modules/gtoken": {
|
||||
"version": "7.1.0",
|
||||
"resolved": "https://registry.npmjs.org/gtoken/-/gtoken-7.1.0.tgz",
|
||||
"integrity": "sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"gaxios": "^6.0.0",
|
||||
"jws": "^4.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/has-symbols": {
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
|
||||
|
|
@ -1679,6 +1812,42 @@
|
|||
"url": "https://opencollective.com/express"
|
||||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent": {
|
||||
"version": "7.0.6",
|
||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
|
||||
"integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"agent-base": "^7.1.2",
|
||||
"debug": "4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent/node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||
"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"ms": "^2.1.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=6.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"supports-color": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/https-proxy-agent/node_modules/ms": {
|
||||
"version": "2.1.3",
|
||||
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
|
||||
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/iconv-lite": {
|
||||
"version": "0.4.24",
|
||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz",
|
||||
|
|
@ -1747,6 +1916,18 @@
|
|||
"node": ">= 0.10"
|
||||
}
|
||||
},
|
||||
"node_modules/is-stream": {
|
||||
"version": "2.0.1",
|
||||
"resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
|
||||
"integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=8"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/is-url": {
|
||||
"version": "1.2.4",
|
||||
"resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz",
|
||||
|
|
@ -1762,6 +1943,15 @@
|
|||
"url": "https://github.com/sponsors/panva"
|
||||
}
|
||||
},
|
||||
"node_modules/json-bigint": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz",
|
||||
"integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"bignumber.js": "^9.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/jsonwebtoken": {
|
||||
"version": "9.0.3",
|
||||
"resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz",
|
||||
|
|
@ -2952,6 +3142,27 @@
|
|||
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/ws": {
|
||||
"version": "8.19.0",
|
||||
"resolved": "https://registry.npmjs.org/ws/-/ws-8.19.0.tgz",
|
||||
"integrity": "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=10.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"bufferutil": "^4.0.1",
|
||||
"utf-8-validate": ">=5.0.2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"bufferutil": {
|
||||
"optional": true
|
||||
},
|
||||
"utf-8-validate": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/xtend": {
|
||||
"version": "4.0.2",
|
||||
"resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz",
|
||||
|
|
@ -2988,6 +3199,25 @@
|
|||
"engines": {
|
||||
"node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/zod": {
|
||||
"version": "3.25.76",
|
||||
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
|
||||
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/colinhacks"
|
||||
}
|
||||
},
|
||||
"node_modules/zod-to-json-schema": {
|
||||
"version": "3.25.1",
|
||||
"resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.1.tgz",
|
||||
"integrity": "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA==",
|
||||
"license": "ISC",
|
||||
"peerDependencies": {
|
||||
"zod": "^3.25 || ^4"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@
|
|||
"@types/cors": "^2.8.17",
|
||||
"@types/express": "^5.0.0",
|
||||
"@types/express-rate-limit": "^5.1.3",
|
||||
"@types/helmet": "^0.0.48",
|
||||
"@types/jsonwebtoken": "^9.0.7",
|
||||
"@types/multer": "^2.0.0",
|
||||
"@types/node": "^22.10.7",
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import { Router } from 'express';
|
||||
import multer from 'multer';
|
||||
import { processDocument, SUPPORTED_TYPES } from '../services/documentParser.js';
|
||||
import { processDocument, SUPPORTED_TYPES, normalizeMimeType } from '../services/documentParser.js';
|
||||
import { requireAuth } from '../middleware/auth.js';
|
||||
|
||||
const router = Router();
|
||||
|
|
@ -15,10 +15,11 @@ const upload = multer({
|
|||
fileSize: 50 * 1024 * 1024, // 50MB limit
|
||||
},
|
||||
fileFilter: (_req, file, cb) => {
|
||||
if (SUPPORTED_TYPES.includes(file.mimetype)) {
|
||||
const normalizedMime = normalizeMimeType(file.mimetype, file.originalname);
|
||||
if (SUPPORTED_TYPES.includes(normalizedMime)) {
|
||||
cb(null, true);
|
||||
} else {
|
||||
cb(new Error(`Unsupported file type: ${file.mimetype}. Supported types: ${SUPPORTED_TYPES.join(', ')}`));
|
||||
cb(new Error(`Unsupported file type: ${file.mimetype}. Supported types: PDF, Office docs, text files, code files, and images.`));
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
@ -30,7 +31,8 @@ router.post('/', upload.single('document'), async (req, res) => {
|
|||
}
|
||||
|
||||
const useOcr = req.body?.useOcr === 'true' || req.body?.useOcr === true;
|
||||
const processed = await processDocument(req.file.buffer, req.file.mimetype, { useOcr });
|
||||
const normalizedMime = normalizeMimeType(req.file.mimetype, req.file.originalname);
|
||||
const processed = await processDocument(req.file.buffer, normalizedMime, { useOcr });
|
||||
|
||||
if (processed.type === 'native') {
|
||||
res.json({
|
||||
|
|
|
|||
|
|
@ -1,16 +1,33 @@
|
|||
import officeParser from 'officeparser';
|
||||
|
||||
// MIME types that Gemini can handle natively (send as-is)
|
||||
export const GEMINI_NATIVE_TYPES = [
|
||||
'application/pdf',
|
||||
'text/plain',
|
||||
'text/markdown',
|
||||
'text/csv',
|
||||
'text/html',
|
||||
'text/css',
|
||||
'text/javascript',
|
||||
'image/jpeg',
|
||||
'image/png',
|
||||
'image/gif',
|
||||
'image/webp'
|
||||
'image/webp',
|
||||
'image/heic',
|
||||
'image/heif',
|
||||
];
|
||||
|
||||
export const TYPES_TO_CONVERT_TO_PLAIN_TEXT = [
|
||||
'text/xml',
|
||||
'application/xml',
|
||||
'application/json',
|
||||
'application/javascript',
|
||||
'text/x-python',
|
||||
'text/x-java-source',
|
||||
'text/x-c',
|
||||
'text/x-c++',
|
||||
'text/x-typescript',
|
||||
'text/yaml',
|
||||
'application/x-yaml',
|
||||
];
|
||||
|
||||
// MIME types that officeparser can extract text from
|
||||
|
|
@ -32,7 +49,52 @@ export const OCR_CAPABLE_TYPES = [
|
|||
'image/webp'
|
||||
];
|
||||
|
||||
export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES];
|
||||
export const SUPPORTED_TYPES = [...GEMINI_NATIVE_TYPES, ...OFFICEPARSER_TYPES, ...TYPES_TO_CONVERT_TO_PLAIN_TEXT];
|
||||
|
||||
const EXTENSION_TO_MIME: Record<string, string> = {
|
||||
'.txt': 'text/plain',
|
||||
'.md': 'text/markdown',
|
||||
'.markdown': 'text/markdown',
|
||||
'.csv': 'text/csv',
|
||||
'.html': 'text/html',
|
||||
'.htm': 'text/html',
|
||||
'.xml': 'application/xml',
|
||||
'.json': 'application/json',
|
||||
'.js': 'text/javascript',
|
||||
'.mjs': 'text/javascript',
|
||||
'.ts': 'text/x-typescript',
|
||||
'.tsx': 'text/x-typescript',
|
||||
'.jsx': 'text/javascript',
|
||||
'.py': 'text/x-python',
|
||||
'.java': 'text/x-java-source',
|
||||
'.c': 'text/x-c',
|
||||
'.h': 'text/x-c',
|
||||
'.cpp': 'text/x-c++',
|
||||
'.hpp': 'text/x-c++',
|
||||
'.cc': 'text/x-c++',
|
||||
'.css': 'text/css',
|
||||
'.yaml': 'text/yaml',
|
||||
'.yml': 'text/yaml',
|
||||
};
|
||||
|
||||
export function normalizeMimeType(mimeType: string, filename?: string): string {
|
||||
if (SUPPORTED_TYPES.includes(mimeType)) {
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
if (mimeType === 'application/octet-stream' && filename) {
|
||||
const ext = filename.toLowerCase().match(/\.[^.]+$/)?.[0];
|
||||
if (ext && EXTENSION_TO_MIME[ext]) {
|
||||
return EXTENSION_TO_MIME[ext];
|
||||
}
|
||||
}
|
||||
|
||||
if (mimeType.startsWith('text/') && !SUPPORTED_TYPES.includes(mimeType)) {
|
||||
return 'text/plain';
|
||||
}
|
||||
|
||||
return mimeType;
|
||||
}
|
||||
|
||||
export interface ProcessedDocument {
|
||||
type: 'native' | 'text';
|
||||
|
|
@ -60,6 +122,10 @@ export function needsOfficeParser(mimeType: string): boolean {
|
|||
return OFFICEPARSER_TYPES.includes(mimeType);
|
||||
}
|
||||
|
||||
export function needsPlainTextConversion(mimeType: string): boolean {
|
||||
return TYPES_TO_CONVERT_TO_PLAIN_TEXT.includes(mimeType);
|
||||
}
|
||||
|
||||
async function extractWithOfficeParser(buffer: Buffer, useOcr: boolean = false): Promise<string> {
|
||||
const config = useOcr ? {
|
||||
extractAttachments: true,
|
||||
|
|
@ -119,5 +185,13 @@ export async function processDocument(
|
|||
};
|
||||
}
|
||||
|
||||
if (needsPlainTextConversion(mimeType)) {
|
||||
return {
|
||||
type: 'text',
|
||||
content: buffer.toString('utf-8'),
|
||||
mimeType: 'text/plain'
|
||||
};
|
||||
}
|
||||
|
||||
throw new Error(`No extraction handler for: ${mimeType}`);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue