Sandboxing
This commit is contained in:
parent
cd04d34b23
commit
70df689701
7 changed files with 324 additions and 34 deletions
|
|
@ -85,6 +85,29 @@ services:
|
|||
networks:
|
||||
- kaboot-network
|
||||
|
||||
kaboot-sandbox:
|
||||
build:
|
||||
context: ./server/sandbox
|
||||
dockerfile: Dockerfile
|
||||
restart: unless-stopped
|
||||
read_only: true
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
cap_drop:
|
||||
- ALL
|
||||
environment:
|
||||
MAX_CONCURRENT: "2"
|
||||
tmpfs:
|
||||
- /tmp:size=200M,mode=1777
|
||||
deploy:
|
||||
replicas: ${SANDBOX_REPLICAS:-4}
|
||||
resources:
|
||||
limits:
|
||||
cpus: '1'
|
||||
memory: 512M
|
||||
networks:
|
||||
- kaboot-network
|
||||
|
||||
kaboot-backend:
|
||||
build:
|
||||
context: ./server
|
||||
|
|
@ -105,10 +128,14 @@ services:
|
|||
STRIPE_WEBHOOK_SECRET: ${STRIPE_WEBHOOK_SECRET:-}
|
||||
STRIPE_PRICE_ID_MONTHLY: ${STRIPE_PRICE_ID_MONTHLY:-}
|
||||
STRIPE_PRICE_ID_YEARLY: ${STRIPE_PRICE_ID_YEARLY:-}
|
||||
SANDBOX_URL: http://kaboot-sandbox:3002
|
||||
USE_SANDBOX: "true"
|
||||
volumes:
|
||||
- kaboot-data:/data
|
||||
tmpfs:
|
||||
- /tmp:size=100M
|
||||
depends_on:
|
||||
- kaboot-sandbox
|
||||
networks:
|
||||
- kaboot-network
|
||||
|
||||
|
|
|
|||
|
|
@ -97,6 +97,33 @@ services:
|
|||
networks:
|
||||
- kaboot-network
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# KABOOT - Document Conversion Sandbox (isolated LibreOffice)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
kaboot-sandbox:
|
||||
build:
|
||||
context: ./server/sandbox
|
||||
dockerfile: Dockerfile
|
||||
restart: unless-stopped
|
||||
read_only: true
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
cap_drop:
|
||||
- ALL
|
||||
environment:
|
||||
MAX_CONCURRENT: "2"
|
||||
tmpfs:
|
||||
- /tmp:size=200M,mode=1777
|
||||
deploy:
|
||||
replicas: ${SANDBOX_REPLICAS:-2}
|
||||
resources:
|
||||
limits:
|
||||
cpus: '1'
|
||||
memory: 512M
|
||||
networks:
|
||||
- kaboot-network
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# KABOOT - Application Backend
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
|
@ -118,6 +145,8 @@ services:
|
|||
CORS_ORIGIN: http://localhost:${KABOOT_FRONTEND_PORT:-5173},http://${KABOOT_HOST:-localhost}:${KABOOT_FRONTEND_PORT:-5173}
|
||||
LOG_REQUESTS: ${LOG_REQUESTS:-true}
|
||||
GEMINI_API_KEY: ${GEMINI_API_KEY:-}
|
||||
SANDBOX_URL: http://kaboot-sandbox:3002
|
||||
USE_SANDBOX: "true"
|
||||
volumes:
|
||||
- ./data:/data
|
||||
tmpfs:
|
||||
|
|
@ -126,6 +155,7 @@ services:
|
|||
- "${KABOOT_BACKEND_PORT:-3001}:3001"
|
||||
depends_on:
|
||||
- authentik-server
|
||||
- kaboot-sandbox
|
||||
networks:
|
||||
- kaboot-network
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,7 @@ FROM node:22-alpine
|
|||
|
||||
WORKDIR /app
|
||||
|
||||
# Build dependencies + LibreOffice for legacy Office format conversion
|
||||
RUN apk add --no-cache python3 make g++ libreoffice curl
|
||||
RUN apk add --no-cache python3 make g++ curl
|
||||
|
||||
COPY package*.json ./
|
||||
RUN npm install
|
||||
|
|
|
|||
23
server/sandbox/Dockerfile
Normal file
23
server/sandbox/Dockerfile
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
FROM node:22-alpine
|
||||
|
||||
RUN apk add --no-cache libreoffice curl \
|
||||
&& addgroup -g 1000 sandbox \
|
||||
&& adduser -u 1000 -G sandbox -s /bin/sh -D sandbox \
|
||||
&& mkdir -p /app /tmp/convert \
|
||||
&& chown -R sandbox:sandbox /app /tmp/convert
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY --chown=sandbox:sandbox package.json ./
|
||||
RUN npm install --omit=dev
|
||||
|
||||
COPY --chown=sandbox:sandbox convert.js ./
|
||||
|
||||
USER sandbox
|
||||
|
||||
EXPOSE 3002
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:3002/health || exit 1
|
||||
|
||||
CMD ["node", "convert.js"]
|
||||
168
server/sandbox/convert.js
Normal file
168
server/sandbox/convert.js
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
import { createServer } from 'http';
|
||||
import { exec } from 'child_process';
|
||||
import { writeFile, readFile, unlink, mkdir, readdir, rm } from 'fs/promises';
|
||||
import { existsSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import { randomUUID } from 'crypto';
|
||||
|
||||
/**
 * Parse a strictly positive integer setting.
 *
 * @param {string | undefined} raw - Raw value, typically from an environment variable.
 * @param {number} fallback - Value returned when `raw` is missing, non-numeric, or < 1.
 * @returns {number} The parsed integer, or `fallback`.
 */
function parsePositiveInt(raw, fallback) {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Port the HTTP server listens on.
const PORT = process.env.PORT || 3002;
// Parent directory for per-job working directories.
const TEMP_DIR = '/tmp/convert';
// Upper bound for a single LibreOffice run, in milliseconds.
const TIMEOUT_MS = 30000;
// Largest accepted upload, in bytes (50 MiB).
const MAX_FILE_SIZE = 50 * 1024 * 1024;
// Number of conversions allowed to run at once. An invalid or non-positive
// value falls back to 3: NaN would make every `activeJobs >= MAX_CONCURRENT`
// comparison false (no limit at all) and 0 would reject every request.
const MAX_CONCURRENT = parsePositiveInt(process.env.MAX_CONCURRENT, 3);

// Accepted legacy input extensions mapped to the LibreOffice output format.
// Built on a null prototype so inherited names such as "constructor" or
// "__proto__" never look like supported extensions.
const EXTENSION_TO_OUTPUT = Object.freeze(
  Object.assign(Object.create(null), {
    '.ppt': 'pptx',
    '.doc': 'docx',
    '.xls': 'xlsx',
  })
);

// Conversions currently holding a slot.
let activeJobs = 0;
// Requests currently waiting for a slot.
let queuedJobs = 0;
|
||||
|
||||
/**
 * Remove a job's working directory together with everything inside it.
 *
 * Removal is best-effort: a failure is logged and never thrown, so callers
 * can invoke this from a `finally` block without masking their own result.
 * A directory that does not exist is not an error.
 *
 * @param {string} dir - Directory to delete.
 * @returns {Promise<void>}
 */
async function cleanup(dir) {
  try {
    // unlink() cannot delete a directory, so a recursive rm is required to
    // actually reclaim the job directory (and any nested output).
    await rm(dir, { recursive: true, force: true });
  } catch (err) {
    console.error(`Failed to clean up ${dir}:`, err);
  }
}
|
||||
|
||||
/**
 * Run a headless LibreOffice conversion and wait for it to finish.
 *
 * @param {string} inputPath - File to convert.
 * @param {string} outputDir - Directory LibreOffice writes the result into.
 * @param {string} outputExt - Target format passed to `--convert-to`.
 * @returns {Promise<void>} Resolves when the command exits successfully.
 *   Rejects with the exec error on failure, or with `Error('timeout')` when
 *   the command runs longer than TIMEOUT_MS.
 */
function runLibreOffice(inputPath, outputDir, outputExt) {
  return new Promise((resolve, reject) => {
    let timedOut = false;

    const child = exec(
      `libreoffice --headless --convert-to ${outputExt} --outdir "${outputDir}" "${inputPath}"`,
      { maxBuffer: 10 * 1024 * 1024 },
      (error) => {
        clearTimeout(timer);
        // The timeout path has already rejected; ignore the kill-induced error.
        if (timedOut) return;
        if (error) reject(error);
        else resolve();
      }
    );

    const timer = setTimeout(() => {
      timedOut = true;
      // Stop the command instead of leaving it running after the caller has
      // given up and removed the job directory.
      // NOTE(review): exec() spawns a shell, so this signals the shell; confirm
      // the LibreOffice process itself does not outlive it in the container.
      child.kill('SIGKILL');
      reject(new Error('timeout'));
    }, TIMEOUT_MS);
  });
}
|
||||
|
||||
/**
 * Convert one uploaded document inside its own private working directory.
 *
 * @param {Buffer} buffer - Raw bytes of the uploaded file.
 * @param {string} ext - Source extension including the dot; used as a key
 *   into EXTENSION_TO_OUTPUT.
 * @returns {Promise<Buffer>} Contents of the converted file.
 * @throws {Error} `no_output` when LibreOffice finished without producing the
 *   expected file; any error from directory setup or the conversion itself.
 */
async function processConversion(buffer, ext) {
  const targetFormat = EXTENSION_TO_OUTPUT[ext];
  const workDir = join(TEMP_DIR, randomUUID());
  const sourceFile = join(workDir, `input${ext}`);
  // LibreOffice keeps the input's base name and swaps the extension.
  const convertedFile = join(workDir, `input.${targetFormat}`);

  try {
    await mkdir(workDir, { mode: 0o700 });
    await writeFile(sourceFile, buffer, { mode: 0o600 });
    await runLibreOffice(sourceFile, workDir, targetFormat);

    if (existsSync(convertedFile)) {
      return await readFile(convertedFile);
    }
    throw new Error('no_output');
  } finally {
    // Runs on success and failure alike so no job data is left behind.
    await cleanup(workDir);
  }
}
|
||||
|
||||
/**
 * Handle `POST /convert?ext=<.ppt|.doc|.xls>`.
 *
 * Waits for a free conversion slot (polling every 100 ms, with at most
 * MAX_CONCURRENT * 2 waiters), buffers the request body up to MAX_FILE_SIZE,
 * converts it, and streams the converted bytes back.
 *
 * Responses: 200 with the converted file, 400 for a missing/unsupported
 * extension, 413 for an oversized body, 500 when the conversion fails, and
 * 503 when the wait queue is full.
 *
 * @param {import('http').IncomingMessage} req
 * @param {import('http').ServerResponse} res
 * @returns {Promise<void>}
 */
async function handleConvert(req, res) {
  let bodyEnded = false;
  let clientGone = false;
  let slotHeld = false;

  // Idempotent so the abort path and the normal path can never both decrement.
  const releaseSlot = () => {
    if (slotHeld) {
      slotHeld = false;
      activeJobs--;
    }
  };

  // If the connection closes before the body has been fully read, 'end' never
  // fires. Without this the slot taken below would be held forever.
  req.on('close', () => {
    if (bodyEnded) return;
    clientGone = true;
    releaseSlot();
  });

  if (activeJobs >= MAX_CONCURRENT) {
    queuedJobs++;
    if (queuedJobs > MAX_CONCURRENT * 2) {
      queuedJobs--;
      res.writeHead(503, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ error: 'Server busy, try again later' }));
      return;
    }

    await new Promise((resolve) => {
      const check = setInterval(() => {
        // Stop waiting as soon as a slot frees up or the client has left.
        if (clientGone || activeJobs < MAX_CONCURRENT) {
          clearInterval(check);
          queuedJobs--;
          resolve();
        }
      }, 100);
    });
  }

  // The client disconnected while queued: nothing to read, nobody to answer.
  if (clientGone) return;

  activeJobs++;
  slotHeld = true;

  const chunks = [];
  let size = 0;

  req.on('data', (chunk) => {
    size += chunk.length;
    // Keep counting past the limit, but stop retaining the data.
    if (size <= MAX_FILE_SIZE) {
      chunks.push(chunk);
    }
  });

  req.on('end', async () => {
    bodyEnded = true;

    if (size > MAX_FILE_SIZE) {
      releaseSlot();
      res.writeHead(413, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ error: 'File too large' }));
      return;
    }

    const url = new URL(req.url, `http://localhost:${PORT}`);
    const ext = url.searchParams.get('ext');

    // Own-property check: a plain lookup would also accept inherited names
    // such as "constructor".
    const supported =
      typeof ext === 'string' &&
      Object.prototype.hasOwnProperty.call(EXTENSION_TO_OUTPUT, ext);

    if (!supported) {
      releaseSlot();
      res.writeHead(400, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ error: 'Invalid or missing extension parameter' }));
      return;
    }

    try {
      const converted = await processConversion(Buffer.concat(chunks), ext);
      res.writeHead(200, {
        'Content-Type': 'application/octet-stream',
        'Content-Length': converted.length,
      });
      res.end(converted);
    } catch (err) {
      // Only the two known sentinel messages are surfaced; everything else is
      // reported generically so internal details are not leaked.
      const message = err.message === 'timeout'
        ? 'Conversion timed out'
        : err.message === 'no_output'
          ? 'Conversion produced no output'
          : 'Conversion failed';
      res.writeHead(500, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ error: message }));
    } finally {
      releaseSlot();
    }
  });
}
|
||||
|
||||
/**
 * HTTP entry point.
 *
 *   GET  /health    -> 200 with current active/queued job counters
 *   POST /convert*  -> document conversion (see handleConvert)
 *   anything else   -> 404
 */
const server = createServer((req, res) => {
  const { method, url } = req;

  if (method === 'GET' && url === '/health') {
    res.writeHead(200, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify({ status: 'ok', active: activeJobs, queued: queuedJobs }));
    return;
  }

  if (method === 'POST' && url?.startsWith('/convert')) {
    // handleConvert is async; never leave its promise floating, otherwise a
    // rejection becomes an unhandled rejection at process level.
    Promise.resolve(handleConvert(req, res)).catch((err) => {
      console.error('Unexpected error while handling /convert:', err);
      if (res.headersSent) {
        res.destroy();
        return;
      }
      res.writeHead(500, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ error: 'Conversion failed' }));
    });
    return;
  }

  res.writeHead(404, { 'Content-Type': 'application/json' });
  res.end(JSON.stringify({ error: 'Not found' }));
});
|
||||
|
||||
// Create the scratch directory (owner-only access) if it is not there yet;
// this is awaited, so it completes before the server starts listening.
const tempDirMissing = !existsSync(TEMP_DIR);
if (tempDirMissing) {
  await mkdir(TEMP_DIR, { mode: 0o700, recursive: true });
}

// Logged once the listening socket is bound.
const announceReady = () => {
  console.log(`Sandbox running on port ${PORT} (max ${MAX_CONCURRENT} concurrent)`);
};

server.listen(PORT, '0.0.0.0', announceReady);
|
||||
9
server/sandbox/package.json
Normal file
9
server/sandbox/package.json
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"name": "kaboot-sandbox",
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"start": "node convert.js"
|
||||
}
|
||||
}
|
||||
|
|
@ -8,6 +8,8 @@ import { join } from 'path';
|
|||
import { randomUUID } from 'crypto';
|
||||
|
||||
const PROCESSING_TIMEOUT_MS = 30000;
|
||||
const SANDBOX_URL = process.env.SANDBOX_URL || 'http://localhost:3002';
|
||||
const USE_SANDBOX = process.env.USE_SANDBOX !== 'false';
|
||||
|
||||
export const GEMINI_NATIVE_TYPES = [
|
||||
'application/pdf',
|
||||
|
|
@ -201,14 +203,41 @@ function extractWithSheetJS(buffer: Buffer): string {
|
|||
return textParts.join('\n\n');
|
||||
}
|
||||
|
||||
// Map legacy extensions to their modern equivalents for LibreOffice conversion
|
||||
const LEGACY_TO_MODERN: Record<string, string> = {
|
||||
'.ppt': 'pptx',
|
||||
'.doc': 'docx',
|
||||
'.xls': 'xlsx',
|
||||
};
|
||||
|
||||
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
|
||||
/**
 * Convert a legacy Office document by POSTing it to the sandbox service.
 *
 * The request is aborted after PROCESSING_TIMEOUT_MS.
 *
 * @param buffer - Raw bytes of the legacy document.
 * @param extension - Source extension, sent as the `ext` query parameter.
 * @returns The converted document bytes returned by the sandbox.
 * @throws Error with the sandbox's error message on a non-2xx response, or a
 *   timeout message when the request is aborted; network errors are rethrown.
 */
async function convertViaSandbox(buffer: Buffer, extension: string): Promise<Buffer> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), PROCESSING_TIMEOUT_MS);

  try {
    // Encode the value so it can never alter the query string structure.
    const endpoint = `${SANDBOX_URL}/convert?ext=${encodeURIComponent(extension)}`;
    const response = await fetch(endpoint, {
      method: 'POST',
      body: buffer,
      signal: controller.signal,
      headers: { 'Content-Type': 'application/octet-stream' },
    });

    if (!response.ok) {
      // The body is untrusted JSON: it may be unparsable, null, or lack a
      // string `error` field, so narrow it before use.
      const payload: unknown = await response
        .json()
        .catch(() => ({ error: 'Conversion failed' }));
      const reported =
        typeof payload === 'object' && payload !== null
          ? (payload as Record<string, unknown>).error
          : undefined;
      const message =
        typeof reported === 'string' && reported !== ''
          ? reported
          : 'Document conversion failed';
      throw new Error(message);
    }

    return Buffer.from(await response.arrayBuffer());
  } catch (err) {
    if (err instanceof Error && err.name === 'AbortError') {
      throw new Error('Document conversion timed out. Try a smaller file.');
    }
    throw err;
  } finally {
    clearTimeout(timeout);
  }
}
|
||||
|
||||
async function convertLocally(buffer: Buffer, extension: string): Promise<Buffer> {
|
||||
const tempId = randomUUID();
|
||||
const privateTempDir = join(tmpdir(), `kaboot-${tempId}`);
|
||||
const inputPath = join(privateTempDir, `input${extension}`);
|
||||
|
|
@ -225,47 +254,52 @@ async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr:
|
|||
mkdirSync(privateTempDir, { mode: 0o700 });
|
||||
writeFileSync(inputPath, buffer, { mode: 0o600 });
|
||||
|
||||
try {
|
||||
execSync(
|
||||
`libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
|
||||
{ timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
|
||||
);
|
||||
} catch (execError) {
|
||||
const error = execError as Error & { code?: string; killed?: boolean };
|
||||
if (error.killed) {
|
||||
throw new Error('Document conversion timed out. Try a smaller file.');
|
||||
}
|
||||
if (error.code === 'ENOENT' || error.message?.includes('not found')) {
|
||||
throw new Error(
|
||||
`Legacy ${extension} files require LibreOffice for text extraction. ` +
|
||||
`Please convert to .${modernExt} format or ensure LibreOffice is installed.`
|
||||
);
|
||||
}
|
||||
throw new Error('Document conversion failed. The file may be corrupted.');
|
||||
}
|
||||
execSync(
|
||||
`libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
|
||||
{ timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
|
||||
);
|
||||
|
||||
if (!existsSync(outputPath)) {
|
||||
throw new Error('Document conversion produced no output.');
|
||||
}
|
||||
|
||||
const convertedBuffer = readFileSync(outputPath);
|
||||
const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
|
||||
const ast = await officeParser.parseOffice(convertedBuffer, config);
|
||||
let text = ast.toText();
|
||||
|
||||
if (useOcr && ast.attachments) {
|
||||
for (const attachment of ast.attachments) {
|
||||
if (attachment.ocrText) {
|
||||
text += '\n' + attachment.ocrText;
|
||||
}
|
||||
}
|
||||
return readFileSync(outputPath);
|
||||
} catch (execError) {
|
||||
const error = execError as Error & { code?: string; killed?: boolean };
|
||||
if (error.killed) {
|
||||
throw new Error('Document conversion timed out. Try a smaller file.');
|
||||
}
|
||||
return text;
|
||||
if (error.code === 'ENOENT' || error.message?.includes('not found')) {
|
||||
throw new Error(
|
||||
`Legacy ${extension} files require LibreOffice for text extraction. ` +
|
||||
`Please convert to .${LEGACY_TO_MODERN[extension]} format or ensure LibreOffice is installed.`
|
||||
);
|
||||
}
|
||||
throw new Error('Document conversion failed. The file may be corrupted.');
|
||||
} finally {
|
||||
cleanup();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extract text from a legacy Office document.
 *
 * The document is first converted (through the sandbox service when
 * USE_SANDBOX is set, otherwise locally) and the converted bytes are then
 * parsed with officeParser.
 *
 * @param buffer - Raw bytes of the legacy document.
 * @param extension - Source extension passed on to the converter.
 * @param useOcr - When true, attachments are extracted with OCR enabled and
 *   their recognised text is appended to the result, one per line.
 * @returns The extracted text.
 */
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
  let convertedBuffer: Buffer;
  if (USE_SANDBOX) {
    convertedBuffer = await convertViaSandbox(buffer, extension);
  } else {
    convertedBuffer = await convertLocally(buffer, extension);
  }

  const parseOptions = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
  const ast = await officeParser.parseOffice(convertedBuffer, parseOptions);

  let text = ast.toText();
  if (!useOcr || !ast.attachments) {
    return text;
  }

  for (const attachment of ast.attachments) {
    // Attachments without recognised text contribute nothing.
    if (!attachment.ocrText) continue;
    text += '\n' + attachment.ocrText;
  }
  return text;
}
|
||||
|
||||
export async function processDocument(
|
||||
buffer: Buffer,
|
||||
mimeType: string,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue