diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml
index 6fb48aa..3275186 100644
--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -85,6 +85,29 @@ services:
     networks:
       - kaboot-network
 
+  kaboot-sandbox:
+    build:
+      context: ./server/sandbox
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    read_only: true
+    security_opt:
+      - no-new-privileges:true
+    cap_drop:
+      - ALL
+    environment:
+      MAX_CONCURRENT: "2"
+    tmpfs:
+      - /tmp:size=200M,mode=1777
+    deploy:
+      replicas: ${SANDBOX_REPLICAS:-4}
+      resources:
+        limits:
+          cpus: '1'
+          memory: 512M
+    networks:
+      - kaboot-network
+
   kaboot-backend:
     build:
       context: ./server
@@ -105,10 +128,14 @@ services:
       STRIPE_WEBHOOK_SECRET: ${STRIPE_WEBHOOK_SECRET:-}
       STRIPE_PRICE_ID_MONTHLY: ${STRIPE_PRICE_ID_MONTHLY:-}
       STRIPE_PRICE_ID_YEARLY: ${STRIPE_PRICE_ID_YEARLY:-}
+      SANDBOX_URL: http://kaboot-sandbox:3002
+      USE_SANDBOX: "true"
     volumes:
       - kaboot-data:/data
     tmpfs:
       - /tmp:size=100M
+    depends_on:
+      - kaboot-sandbox
     networks:
       - kaboot-network
 
diff --git a/docker-compose.yml b/docker-compose.yml
index 1a9e809..43f7abb 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -97,6 +97,33 @@ services:
     networks:
       - kaboot-network
 
+  # ═══════════════════════════════════════════════════════════════════════════
+  # KABOOT - Document Conversion Sandbox (isolated LibreOffice)
+  # ═══════════════════════════════════════════════════════════════════════════
+
+  kaboot-sandbox:
+    build:
+      context: ./server/sandbox
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    read_only: true
+    security_opt:
+      - no-new-privileges:true
+    cap_drop:
+      - ALL
+    environment:
+      MAX_CONCURRENT: "2"
+    tmpfs:
+      - /tmp:size=200M,mode=1777
+    deploy:
+      replicas: ${SANDBOX_REPLICAS:-2}
+      resources:
+        limits:
+          cpus: '1'
+          memory: 512M
+    networks:
+      - kaboot-network
+
   # ═══════════════════════════════════════════════════════════════════════════
   # KABOOT - Application Backend
   # ═══════════════════════════════════════════════════════════════════════════
@@ -118,6 +145,8 @@ services:
       CORS_ORIGIN: http://localhost:${KABOOT_FRONTEND_PORT:-5173},http://${KABOOT_HOST:-localhost}:${KABOOT_FRONTEND_PORT:-5173}
       LOG_REQUESTS: ${LOG_REQUESTS:-true}
       GEMINI_API_KEY: ${GEMINI_API_KEY:-}
+      SANDBOX_URL: http://kaboot-sandbox:3002
+      USE_SANDBOX: "true"
     volumes:
       - ./data:/data
     tmpfs:
@@ -126,6 +155,7 @@ services:
       - "${KABOOT_BACKEND_PORT:-3001}:3001"
     depends_on:
       - authentik-server
+      - kaboot-sandbox
     networks:
       - kaboot-network
 
diff --git a/server/Dockerfile b/server/Dockerfile
index 1b1dfc1..9439812 100644
--- a/server/Dockerfile
+++ b/server/Dockerfile
@@ -2,8 +2,7 @@ FROM node:22-alpine
 
 WORKDIR /app
 
-# Build dependencies + LibreOffice for legacy Office format conversion
-RUN apk add --no-cache python3 make g++ libreoffice curl
+RUN apk add --no-cache python3 make g++ curl
 
 COPY package*.json ./
 RUN npm install
diff --git a/server/sandbox/Dockerfile b/server/sandbox/Dockerfile
new file mode 100644
index 0000000..1b28fc9
--- /dev/null
+++ b/server/sandbox/Dockerfile
@@ -0,0 +1,23 @@
+FROM node:22-alpine
+
+RUN apk add --no-cache libreoffice curl \
+    && addgroup -g 1000 sandbox \
+    && adduser -u 1000 -G sandbox -s /bin/sh -D sandbox \
+    && mkdir -p /app /tmp/convert \
+    && chown -R sandbox:sandbox /app /tmp/convert
+
+WORKDIR /app
+
+COPY --chown=sandbox:sandbox package.json ./
+RUN npm install --omit=dev
+
+COPY --chown=sandbox:sandbox convert.js ./
+
+USER sandbox
+
+EXPOSE 3002
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:3002/health || exit 1
+
+CMD ["node", "convert.js"]
diff --git a/server/sandbox/convert.js b/server/sandbox/convert.js
new file mode 100644
index 0000000..10eaf09
--- /dev/null
+++ b/server/sandbox/convert.js
@@ -0,0 +1,168 @@
+import { createServer } from 'http';
+import { exec } from 'child_process';
+import { writeFile, readFile, unlink, rm, mkdir, readdir } from 'fs/promises';
+import { existsSync } from 'fs';
+import { join } from 'path';
+import { randomUUID } from 'crypto';
+
+const PORT = process.env.PORT || 3002;
+const TEMP_DIR = '/tmp/convert';
+const TIMEOUT_MS = 30000;
+const MAX_FILE_SIZE = 50 * 1024 * 1024;
+const MAX_CONCURRENT = parseInt(process.env.MAX_CONCURRENT || '3', 10);
+
+const EXTENSION_TO_OUTPUT = {
+  '.ppt': 'pptx',
+  '.doc': 'docx',
+  '.xls': 'xlsx',
+};
+
+let activeJobs = 0;
+let queuedJobs = 0;
+
+async function cleanup(dir) {
+  if (!existsSync(dir)) return;
+  try {
+    for (const file of await readdir(dir)) {
+      await unlink(join(dir, file)).catch(() => {});
+    }
+    await rm(dir, { recursive: true, force: true }).catch(() => {});
+  } catch {}
+}
+
+function runLibreOffice(inputPath, outputDir, outputExt) {
+  return new Promise((resolve, reject) => {
+    const timer = setTimeout(() => {
+      reject(new Error('timeout'));
+    }, TIMEOUT_MS);
+
+    exec(
+      `libreoffice --headless --convert-to ${outputExt} --outdir "${outputDir}" "${inputPath}"`,
+      { timeout: TIMEOUT_MS, killSignal: 'SIGKILL', maxBuffer: 10 * 1024 * 1024 },
+      (error) => {
+        clearTimeout(timer);
+        if (error) reject(error.killed ? new Error('timeout') : error);
+        else resolve();
+      }
+    );
+  });
+}
+
+async function processConversion(buffer, ext) {
+  const jobId = randomUUID();
+  const jobDir = join(TEMP_DIR, jobId);
+  const inputPath = join(jobDir, `input${ext}`);
+  const outputExt = EXTENSION_TO_OUTPUT[ext];
+  const outputPath = join(jobDir, `input.${outputExt}`);
+
+  try {
+    await mkdir(jobDir, { mode: 0o700 });
+    await writeFile(inputPath, buffer, { mode: 0o600 });
+
+    await runLibreOffice(inputPath, jobDir, outputExt);
+
+    if (!existsSync(outputPath)) {
+      throw new Error('no_output');
+    }
+
+    return await readFile(outputPath);
+  } finally {
+    await cleanup(jobDir);
+  }
+}
+
+async function handleConvert(req, res) {
+  if (activeJobs >= MAX_CONCURRENT) {
+    queuedJobs++;
+    if (queuedJobs > MAX_CONCURRENT * 2) {
+      queuedJobs--;
+      res.writeHead(503, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ error: 'Server busy, try again later' }));
+      return;
+    }
+
+    await new Promise((resolve) => {
+      const check = setInterval(() => {
+        if (activeJobs < MAX_CONCURRENT) {
+          clearInterval(check);
+          queuedJobs--;
+          resolve();
+        }
+      }, 100);
+    });
+  }
+
+  activeJobs++;
+
+  const chunks = [];
+  let size = 0;
+
+  req.on('data', (chunk) => {
+    size += chunk.length;
+    if (size <= MAX_FILE_SIZE) {
+      chunks.push(chunk);
+    }
+  });
+
+  req.on('end', async () => {
+    if (size > MAX_FILE_SIZE) {
+      activeJobs--;
+      res.writeHead(413, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ error: 'File too large' }));
+      return;
+    }
+
+    const url = new URL(req.url, `http://localhost:${PORT}`);
+    const ext = url.searchParams.get('ext');
+
+    if (!ext || !EXTENSION_TO_OUTPUT[ext]) {
+      activeJobs--;
+      res.writeHead(400, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ error: 'Invalid or missing extension parameter' }));
+      return;
+    }
+
+    try {
+      const converted = await processConversion(Buffer.concat(chunks), ext);
+      res.writeHead(200, {
+        'Content-Type': 'application/octet-stream',
+        'Content-Length': converted.length,
+      });
+      res.end(converted);
+    } catch (err) {
+      const message = err.message === 'timeout'
+        ? 'Conversion timed out'
+        : err.message === 'no_output'
+          ? 'Conversion produced no output'
+          : 'Conversion failed';
+      res.writeHead(500, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ error: message }));
+    } finally {
+      activeJobs--;
+    }
+  });
+}
+
+const server = createServer((req, res) => {
+  if (req.method === 'GET' && req.url === '/health') {
+    res.writeHead(200, { 'Content-Type': 'application/json' });
+    res.end(JSON.stringify({ status: 'ok', active: activeJobs, queued: queuedJobs }));
+    return;
+  }
+
+  if (req.method === 'POST' && req.url?.startsWith('/convert')) {
+    handleConvert(req, res);
+    return;
+  }
+
+  res.writeHead(404, { 'Content-Type': 'application/json' });
+  res.end(JSON.stringify({ error: 'Not found' }));
+});
+
+if (!existsSync(TEMP_DIR)) {
+  await mkdir(TEMP_DIR, { recursive: true, mode: 0o700 });
+}
+
+server.listen(PORT, '0.0.0.0', () => {
+  console.log(`Sandbox running on port ${PORT} (max ${MAX_CONCURRENT} concurrent)`);
+});
diff --git a/server/sandbox/package.json b/server/sandbox/package.json
new file mode 100644
index 0000000..685582d
--- /dev/null
+++ b/server/sandbox/package.json
@@ -0,0 +1,9 @@
+{
+  "name": "kaboot-sandbox",
+  "version": "1.0.0",
+  "type": "module",
+  "private": true,
+  "scripts": {
+    "start": "node convert.js"
+  }
+}
diff --git a/server/src/services/documentParser.ts b/server/src/services/documentParser.ts
index bf7f88c..c2d4643 100644
--- a/server/src/services/documentParser.ts
+++ b/server/src/services/documentParser.ts
@@ -8,6 +8,8 @@ import { join } from 'path';
 import { randomUUID } from 'crypto';
 
 const PROCESSING_TIMEOUT_MS = 30000;
+const SANDBOX_URL = process.env.SANDBOX_URL || 'http://localhost:3002';
+const USE_SANDBOX = process.env.USE_SANDBOX !== 'false';
 
 export const GEMINI_NATIVE_TYPES = [
   'application/pdf',
@@ -201,14 +203,41 @@ function extractWithSheetJS(buffer: Buffer): string {
   return textParts.join('\n\n');
 }
 
-// Map legacy extensions to their modern equivalents for LibreOffice conversion
 const LEGACY_TO_MODERN: Record<string, string> = {
   '.ppt': 'pptx',
   '.doc': 'docx',
   '.xls': 'xlsx',
 };
 
-async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
+async function convertViaSandbox(buffer: Buffer, extension: string): Promise<Buffer> {
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), PROCESSING_TIMEOUT_MS);
+
+  try {
+    const response = await fetch(`${SANDBOX_URL}/convert?ext=${extension}`, {
+      method: 'POST',
+      body: buffer,
+      signal: controller.signal,
+      headers: { 'Content-Type': 'application/octet-stream' },
+    });
+
+    if (!response.ok) {
+      const error = await response.json().catch(() => ({ error: 'Conversion failed' }));
+      throw new Error(error.error || 'Document conversion failed');
+    }
+
+    return Buffer.from(await response.arrayBuffer());
+  } catch (err) {
+    if (err instanceof Error && err.name === 'AbortError') {
+      throw new Error('Document conversion timed out. Try a smaller file.');
+    }
+    throw err;
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+async function convertLocally(buffer: Buffer, extension: string): Promise<Buffer> {
   const tempId = randomUUID();
   const privateTempDir = join(tmpdir(), `kaboot-${tempId}`);
   const inputPath = join(privateTempDir, `input${extension}`);
@@ -225,47 +254,52 @@ async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr:
     mkdirSync(privateTempDir, { mode: 0o700 });
     writeFileSync(inputPath, buffer, { mode: 0o600 });
 
-    try {
-      execSync(
-        `libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
-        { timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
-      );
-    } catch (execError) {
-      const error = execError as Error & { code?: string; killed?: boolean };
-      if (error.killed) {
-        throw new Error('Document conversion timed out. Try a smaller file.');
-      }
-      if (error.code === 'ENOENT' || error.message?.includes('not found')) {
-        throw new Error(
-          `Legacy ${extension} files require LibreOffice for text extraction. ` +
-          `Please convert to .${modernExt} format or ensure LibreOffice is installed.`
-        );
-      }
-      throw new Error('Document conversion failed. The file may be corrupted.');
-    }
+    execSync(
+      `libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
+      { timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
+    );
 
     if (!existsSync(outputPath)) {
       throw new Error('Document conversion produced no output.');
     }
 
-    const convertedBuffer = readFileSync(outputPath);
-    const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
-    const ast = await officeParser.parseOffice(convertedBuffer, config);
-    let text = ast.toText();
-
-    if (useOcr && ast.attachments) {
-      for (const attachment of ast.attachments) {
-        if (attachment.ocrText) {
-          text += '\n' + attachment.ocrText;
-        }
-      }
+    return readFileSync(outputPath);
+  } catch (execError) {
+    const error = execError as Error & { code?: string; killed?: boolean };
+    if (error.killed) {
+      throw new Error('Document conversion timed out. Try a smaller file.');
     }
-    return text;
+    if (error.code === 'ENOENT' || error.message?.includes('not found')) {
+      throw new Error(
+        `Legacy ${extension} files require LibreOffice for text extraction. ` +
+        `Please convert to .${LEGACY_TO_MODERN[extension]} format or ensure LibreOffice is installed.`
+      );
+    }
+    throw new Error('Document conversion failed. The file may be corrupted.');
   } finally {
     cleanup();
   }
 }
 
+async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
+  const convertedBuffer = USE_SANDBOX
+    ? await convertViaSandbox(buffer, extension)
+    : await convertLocally(buffer, extension);
+
+  const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
+  const ast = await officeParser.parseOffice(convertedBuffer, config);
+  let text = ast.toText();
+
+  if (useOcr && ast.attachments) {
+    for (const attachment of ast.attachments) {
+      if (attachment.ocrText) {
+        text += '\n' + attachment.ocrText;
+      }
+    }
+  }
+  return text;
+}
+
 export async function processDocument(
   buffer: Buffer,
   mimeType: string,