Sandboxing
This commit is contained in:
parent
cd04d34b23
commit
70df689701
7 changed files with 324 additions and 34 deletions
|
|
@ -85,6 +85,29 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- kaboot-network
|
- kaboot-network
|
||||||
|
|
||||||
|
kaboot-sandbox:
|
||||||
|
build:
|
||||||
|
context: ./server/sandbox
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
restart: unless-stopped
|
||||||
|
read_only: true
|
||||||
|
security_opt:
|
||||||
|
- no-new-privileges:true
|
||||||
|
cap_drop:
|
||||||
|
- ALL
|
||||||
|
environment:
|
||||||
|
MAX_CONCURRENT: "2"
|
||||||
|
tmpfs:
|
||||||
|
- /tmp:size=200M,mode=1777
|
||||||
|
deploy:
|
||||||
|
replicas: ${SANDBOX_REPLICAS:-4}
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: '1'
|
||||||
|
memory: 512M
|
||||||
|
networks:
|
||||||
|
- kaboot-network
|
||||||
|
|
||||||
kaboot-backend:
|
kaboot-backend:
|
||||||
build:
|
build:
|
||||||
context: ./server
|
context: ./server
|
||||||
|
|
@ -105,10 +128,14 @@ services:
|
||||||
STRIPE_WEBHOOK_SECRET: ${STRIPE_WEBHOOK_SECRET:-}
|
STRIPE_WEBHOOK_SECRET: ${STRIPE_WEBHOOK_SECRET:-}
|
||||||
STRIPE_PRICE_ID_MONTHLY: ${STRIPE_PRICE_ID_MONTHLY:-}
|
STRIPE_PRICE_ID_MONTHLY: ${STRIPE_PRICE_ID_MONTHLY:-}
|
||||||
STRIPE_PRICE_ID_YEARLY: ${STRIPE_PRICE_ID_YEARLY:-}
|
STRIPE_PRICE_ID_YEARLY: ${STRIPE_PRICE_ID_YEARLY:-}
|
||||||
|
SANDBOX_URL: http://kaboot-sandbox:3002
|
||||||
|
USE_SANDBOX: "true"
|
||||||
volumes:
|
volumes:
|
||||||
- kaboot-data:/data
|
- kaboot-data:/data
|
||||||
tmpfs:
|
tmpfs:
|
||||||
- /tmp:size=100M
|
- /tmp:size=100M
|
||||||
|
depends_on:
|
||||||
|
- kaboot-sandbox
|
||||||
networks:
|
networks:
|
||||||
- kaboot-network
|
- kaboot-network
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -97,6 +97,33 @@ services:
|
||||||
networks:
|
networks:
|
||||||
- kaboot-network
|
- kaboot-network
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
# KABOOT - Document Conversion Sandbox (isolated LibreOffice)
|
||||||
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
kaboot-sandbox:
|
||||||
|
build:
|
||||||
|
context: ./server/sandbox
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
restart: unless-stopped
|
||||||
|
read_only: true
|
||||||
|
security_opt:
|
||||||
|
- no-new-privileges:true
|
||||||
|
cap_drop:
|
||||||
|
- ALL
|
||||||
|
environment:
|
||||||
|
MAX_CONCURRENT: "2"
|
||||||
|
tmpfs:
|
||||||
|
- /tmp:size=200M,mode=1777
|
||||||
|
deploy:
|
||||||
|
replicas: ${SANDBOX_REPLICAS:-2}
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: '1'
|
||||||
|
memory: 512M
|
||||||
|
networks:
|
||||||
|
- kaboot-network
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
# KABOOT - Application Backend
|
# KABOOT - Application Backend
|
||||||
# ═══════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
@ -118,6 +145,8 @@ services:
|
||||||
CORS_ORIGIN: http://localhost:${KABOOT_FRONTEND_PORT:-5173},http://${KABOOT_HOST:-localhost}:${KABOOT_FRONTEND_PORT:-5173}
|
CORS_ORIGIN: http://localhost:${KABOOT_FRONTEND_PORT:-5173},http://${KABOOT_HOST:-localhost}:${KABOOT_FRONTEND_PORT:-5173}
|
||||||
LOG_REQUESTS: ${LOG_REQUESTS:-true}
|
LOG_REQUESTS: ${LOG_REQUESTS:-true}
|
||||||
GEMINI_API_KEY: ${GEMINI_API_KEY:-}
|
GEMINI_API_KEY: ${GEMINI_API_KEY:-}
|
||||||
|
SANDBOX_URL: http://kaboot-sandbox:3002
|
||||||
|
USE_SANDBOX: "true"
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/data
|
- ./data:/data
|
||||||
tmpfs:
|
tmpfs:
|
||||||
|
|
@ -126,6 +155,7 @@ services:
|
||||||
- "${KABOOT_BACKEND_PORT:-3001}:3001"
|
- "${KABOOT_BACKEND_PORT:-3001}:3001"
|
||||||
depends_on:
|
depends_on:
|
||||||
- authentik-server
|
- authentik-server
|
||||||
|
- kaboot-sandbox
|
||||||
networks:
|
networks:
|
||||||
- kaboot-network
|
- kaboot-network
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,7 @@ FROM node:22-alpine
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Build dependencies + LibreOffice for legacy Office format conversion
|
RUN apk add --no-cache python3 make g++ curl
|
||||||
RUN apk add --no-cache python3 make g++ libreoffice curl
|
|
||||||
|
|
||||||
COPY package*.json ./
|
COPY package*.json ./
|
||||||
RUN npm install
|
RUN npm install
|
||||||
|
|
|
||||||
23
server/sandbox/Dockerfile
Normal file
23
server/sandbox/Dockerfile
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
FROM node:22-alpine
|
||||||
|
|
||||||
|
RUN apk add --no-cache libreoffice curl \
|
||||||
|
&& addgroup -g 1000 sandbox \
|
||||||
|
&& adduser -u 1000 -G sandbox -s /bin/sh -D sandbox \
|
||||||
|
&& mkdir -p /app /tmp/convert \
|
||||||
|
&& chown -R sandbox:sandbox /app /tmp/convert
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY --chown=sandbox:sandbox package.json ./
|
||||||
|
RUN npm install --omit=dev
|
||||||
|
|
||||||
|
COPY --chown=sandbox:sandbox convert.js ./
|
||||||
|
|
||||||
|
USER sandbox
|
||||||
|
|
||||||
|
EXPOSE 3002
|
||||||
|
|
||||||
|
HEALTHCHECK --interval=30s --timeout=5s --start-period=5s --retries=3 \
|
||||||
|
CMD curl -f http://localhost:3002/health || exit 1
|
||||||
|
|
||||||
|
CMD ["node", "convert.js"]
|
||||||
168
server/sandbox/convert.js
Normal file
168
server/sandbox/convert.js
Normal file
|
|
@ -0,0 +1,168 @@
|
||||||
|
import { createServer } from 'http';
|
||||||
|
import { exec, execFile } from 'child_process';
import { writeFile, readFile, unlink, mkdir, readdir, rm } from 'fs/promises';
|
||||||
|
import { existsSync } from 'fs';
|
||||||
|
import { join } from 'path';
|
||||||
|
import { randomUUID } from 'crypto';
|
||||||
|
|
||||||
|
/**
 * Parses a positive integer setting, falling back when the raw value is
 * missing, non-numeric, zero or negative.
 *
 * @param {string | undefined} raw - Raw value, typically from the environment.
 * @param {number} fallback - Value used when `raw` is not a positive integer.
 * @returns {number} A positive integer.
 */
function parsePositiveInt(raw, fallback) {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Port the HTTP server listens on.
const PORT = process.env.PORT || 3002;
// Parent directory for the per-job working directories.
const TEMP_DIR = '/tmp/convert';
// Upper bound for a single LibreOffice run, in milliseconds.
const TIMEOUT_MS = 30000;
// Largest accepted upload, in bytes (50 MiB).
const MAX_FILE_SIZE = 50 * 1024 * 1024;
// Number of conversions allowed to run at once. A NaN here would make every
// `activeJobs >= MAX_CONCURRENT` comparison false and silently remove the
// limit, so invalid values fall back to the default.
const MAX_CONCURRENT = parsePositiveInt(process.env.MAX_CONCURRENT, 3);

// Accepted input extensions mapped to the format LibreOffice converts them to.
// Built on a null prototype so inherited keys such as "constructor" or
// "__proto__" do not look like supported extensions to a truthiness check.
const EXTENSION_TO_OUTPUT = Object.freeze(
  Object.assign(Object.create(null), {
    '.ppt': 'pptx',
    '.doc': 'docx',
    '.xls': 'xlsx',
  })
);

// Conversions currently holding a slot.
let activeJobs = 0;
// Requests currently waiting for a slot.
let queuedJobs = 0;
|
||||||
|
|
||||||
|
/**
 * Best-effort removal of a job directory and everything inside it.
 *
 * The previous implementation called `unlink()` on the directory itself,
 * which cannot remove a directory; the failure was swallowed, so every job
 * left its directory behind. `rm` with `recursive` removes nested entries
 * too, and `force` makes a missing path a no-op.
 *
 * Never throws: callers invoke this from a `finally` block, where an error
 * would replace the conversion result. Failures are logged instead.
 *
 * @param {string} dir - Directory to remove.
 * @returns {Promise<void>}
 */
async function cleanup(dir) {
  try {
    await rm(dir, { recursive: true, force: true });
  } catch (err) {
    console.error(`Failed to remove ${dir}:`, err);
  }
}
|
||||||
|
|
||||||
|
/**
 * Runs a headless LibreOffice conversion of `inputPath` into `outputDir`.
 *
 * Changes from the earlier version:
 *  - The process is started with `execFile` and an argument array, so no
 *    shell is involved and the arguments are never re-parsed.
 *  - The time limit is enforced through the `timeout` option, which signals
 *    the child when it expires. Previously the promise was rejected on a
 *    timer while the conversion kept running.
 *
 * NOTE(review): the signal goes to the direct child only. Confirm that the
 * `libreoffice` launcher does not leave a worker process behind when killed.
 *
 * @param {string} inputPath - File to convert.
 * @param {string} outputDir - Directory LibreOffice writes the result to.
 * @param {string} outputExt - Target format passed to `--convert-to`.
 * @returns {Promise<void>} Resolves when the process exits successfully.
 *   Rejects with `Error('timeout')` when the time limit is hit, otherwise
 *   with the error reported by the child process API.
 */
function runLibreOffice(inputPath, outputDir, outputExt) {
  return new Promise((resolve, reject) => {
    execFile(
      'libreoffice',
      ['--headless', '--convert-to', outputExt, '--outdir', outputDir, inputPath],
      { timeout: TIMEOUT_MS, killSignal: 'SIGKILL', maxBuffer: 10 * 1024 * 1024 },
      (error) => {
        if (!error) {
          resolve();
          return;
        }
        // `killed` is also set when the child is terminated for exceeding
        // maxBuffer; only a kill without that code is the timeout.
        const timedOut =
          error.killed && error.code !== 'ERR_CHILD_PROCESS_STDIO_MAXBUFFER';
        reject(timedOut ? new Error('timeout') : error);
      }
    );
  });
}
|
||||||
|
|
||||||
|
/**
 * Converts one uploaded document inside a private per-job directory.
 *
 * The input is written as `input<ext>`, LibreOffice is run on it, and the
 * resulting `input.<outputExt>` is read back. The job directory is cleaned up
 * whether the conversion succeeds or fails.
 *
 * @param {Buffer} buffer - Raw bytes of the uploaded document.
 * @param {string} ext - Source extension including the dot (a key of
 *   EXTENSION_TO_OUTPUT).
 * @returns {Promise<Buffer>} The converted document.
 * @throws {Error} `unsupported_extension` when `ext` has no mapping,
 *   `no_output` when LibreOffice produced no file, or whatever
 *   `runLibreOffice` / the filesystem calls reject with.
 */
async function processConversion(buffer, ext) {
  // Own-property check: inherited keys must not count as supported formats,
  // and an unmapped extension must not reach the command line as "undefined".
  if (!Object.prototype.hasOwnProperty.call(EXTENSION_TO_OUTPUT, ext)) {
    throw new Error('unsupported_extension');
  }

  const outputExt = EXTENSION_TO_OUTPUT[ext];
  const jobDir = join(TEMP_DIR, randomUUID());
  const inputPath = join(jobDir, `input${ext}`);
  const outputPath = join(jobDir, `input.${outputExt}`);

  try {
    await mkdir(jobDir, { mode: 0o700 });
    await writeFile(inputPath, buffer, { mode: 0o600 });

    await runLibreOffice(inputPath, jobDir, outputExt);

    try {
      // `await` here (not a bare return) so the read completes before the
      // `finally` below removes the directory.
      return await readFile(outputPath);
    } catch (err) {
      // Reading directly instead of existsSync-then-read avoids the gap
      // between the check and the read.
      if (err && err.code === 'ENOENT') {
        throw new Error('no_output');
      }
      throw err;
    }
  } finally {
    await cleanup(jobDir);
  }
}
|
||||||
|
|
||||||
|
/**
 * Buffers a request body up to `limit` bytes.
 *
 * @param {import('http').IncomingMessage} req - Request to read.
 * @param {number} limit - Maximum number of bytes to keep.
 * @returns {Promise<Buffer | null>} The body, or `null` when more than
 *   `limit` bytes were sent (the remainder is read and discarded). Rejects
 *   when the stream errors or closes before `end`.
 */
function readRequestBody(req, limit) {
  return new Promise((resolve, reject) => {
    // If the stream is already gone no further events will arrive.
    if (req.destroyed) {
      reject(new Error('aborted'));
      return;
    }

    let chunks = [];
    let size = 0;

    req.on('data', (chunk) => {
      size += chunk.length;
      if (size > limit) {
        // Over the limit: release what was buffered and keep draining.
        chunks = [];
      } else {
        chunks.push(chunk);
      }
    });
    req.on('end', () => resolve(size > limit ? null : Buffer.concat(chunks)));
    req.on('error', reject);
    // After a normal `end` the promise is already settled and this is a
    // no-op; before `end` it means the client went away mid-upload.
    req.on('close', () => reject(new Error('aborted')));
  });
}

/**
 * Handles `POST /convert?ext=<.ppt|.doc|.xls>`.
 *
 * Responses:
 *  - 200 with the converted document as `application/octet-stream`
 *  - 400 when `ext` is missing or unsupported
 *  - 413 when the body exceeds MAX_FILE_SIZE
 *  - 503 when more than MAX_CONCURRENT * 2 requests are already waiting
 *  - 500 when the conversion fails or times out
 *
 * Differences from the earlier version:
 *  - `ext` is validated before a slot is taken or the body is read, using an
 *    own-property check so inherited keys such as "constructor" are rejected.
 *  - The slot is released in a single `finally`, and an upload that errors or
 *    closes early is detected. Previously such a request never reached `end`
 *    and its slot was never released.
 *  - A waiting request claims its slot inside the poll callback and stops
 *    waiting when the client has disconnected.
 *
 * @param {import('http').IncomingMessage} req
 * @param {import('http').ServerResponse} res
 * @returns {Promise<void>}
 */
async function handleConvert(req, res) {
  const sendJson = (status, payload) => {
    if (res.headersSent) return;
    res.writeHead(status, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify(payload));
  };

  let ext = null;
  try {
    ext = new URL(req.url, `http://localhost:${PORT}`).searchParams.get('ext');
  } catch {
    // An unparsable URL is treated like a missing parameter.
  }
  if (!ext || !Object.prototype.hasOwnProperty.call(EXTENSION_TO_OUTPUT, ext)) {
    sendJson(400, { error: 'Invalid or missing extension parameter' });
    return;
  }

  if (activeJobs < MAX_CONCURRENT) {
    activeJobs++;
  } else {
    if (queuedJobs >= MAX_CONCURRENT * 2) {
      sendJson(503, { error: 'Server busy, try again later' });
      return;
    }

    queuedJobs++;
    const claimed = await new Promise((resolve) => {
      const poll = setInterval(() => {
        const clientGone = req.destroyed;
        if (!clientGone && activeJobs >= MAX_CONCURRENT) return;
        clearInterval(poll);
        queuedJobs--;
        // Claim the slot here so two waiters cannot take the same one.
        if (!clientGone) activeJobs++;
        resolve(!clientGone);
      }, 100);
    });
    if (!claimed) return;
  }

  try {
    const body = await readRequestBody(req, MAX_FILE_SIZE);
    if (body === null) {
      sendJson(413, { error: 'File too large' });
      return;
    }

    const converted = await processConversion(body, ext);
    res.writeHead(200, {
      'Content-Type': 'application/octet-stream',
      'Content-Length': converted.length,
    });
    res.end(converted);
  } catch (err) {
    const reason = err instanceof Error ? err.message : '';
    let message = 'Conversion failed';
    if (reason === 'timeout') {
      message = 'Conversion timed out';
    } else if (reason === 'no_output') {
      message = 'Conversion produced no output';
    }
    sendJson(500, { error: message });
  } finally {
    activeJobs--;
  }
}
|
||||||
|
|
||||||
|
/**
 * HTTP entry point.
 *
 *  - `GET /health` reports the current active and queued job counts.
 *  - `POST /convert...` is handed to `handleConvert`.
 *  - Anything else gets a JSON 404.
 */
const server = createServer((req, res) => {
  const sendJson = (status, payload) => {
    res.writeHead(status, { 'Content-Type': 'application/json' });
    res.end(JSON.stringify(payload));
  };

  if (req.method === 'GET' && req.url === '/health') {
    sendJson(200, { status: 'ok', active: activeJobs, queued: queuedJobs });
    return;
  }

  if (req.method === 'POST' && req.url?.startsWith('/convert')) {
    // handleConvert is async; without a catch a rejection here would be an
    // unhandled promise rejection.
    handleConvert(req, res).catch((err) => {
      console.error('Unhandled error while handling /convert:', err);
      if (!res.headersSent) {
        sendJson(500, { error: 'Conversion failed' });
      }
    });
    return;
  }

  sendJson(404, { error: 'Not found' });
});

// Recursive mkdir succeeds when the directory already exists, so no separate
// existence check is needed before creating the working directory.
await mkdir(TEMP_DIR, { recursive: true, mode: 0o700 });

server.listen(PORT, '0.0.0.0', () => {
  console.log(`Sandbox running on port ${PORT} (max ${MAX_CONCURRENT} concurrent)`);
});
|
||||||
9
server/sandbox/package.json
Normal file
9
server/sandbox/package.json
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
{
|
||||||
|
"name": "kaboot-sandbox",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"type": "module",
|
||||||
|
"private": true,
|
||||||
|
"scripts": {
|
||||||
|
"start": "node convert.js"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -8,6 +8,8 @@ import { join } from 'path';
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
|
|
||||||
const PROCESSING_TIMEOUT_MS = 30000;
|
const PROCESSING_TIMEOUT_MS = 30000;
|
||||||
|
const SANDBOX_URL = process.env.SANDBOX_URL || 'http://localhost:3002';
|
||||||
|
const USE_SANDBOX = process.env.USE_SANDBOX !== 'false';
|
||||||
|
|
||||||
export const GEMINI_NATIVE_TYPES = [
|
export const GEMINI_NATIVE_TYPES = [
|
||||||
'application/pdf',
|
'application/pdf',
|
||||||
|
|
@ -201,14 +203,41 @@ function extractWithSheetJS(buffer: Buffer): string {
|
||||||
return textParts.join('\n\n');
|
return textParts.join('\n\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Map legacy extensions to their modern equivalents for LibreOffice conversion
|
|
||||||
const LEGACY_TO_MODERN: Record<string, string> = {
|
const LEGACY_TO_MODERN: Record<string, string> = {
|
||||||
'.ppt': 'pptx',
|
'.ppt': 'pptx',
|
||||||
'.doc': 'docx',
|
'.doc': 'docx',
|
||||||
'.xls': 'xlsx',
|
'.xls': 'xlsx',
|
||||||
};
|
};
|
||||||
|
|
||||||
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
|
/**
 * Sends a legacy Office document to the sandbox service for conversion.
 *
 * The request is aborted after PROCESSING_TIMEOUT_MS; the timer also covers
 * reading the response body because that read happens inside the `try`.
 *
 * @param buffer - Raw bytes of the document to convert.
 * @param extension - Source extension including the dot (e.g. `.ppt`); sent
 *   URL-encoded as the `ext` query parameter.
 * @returns The converted document bytes returned by the sandbox.
 * @throws Error with the sandbox-reported message on a non-2xx response, a
 *   timeout message when the request is aborted, or the underlying fetch
 *   error otherwise.
 */
async function convertViaSandbox(buffer: Buffer, extension: string): Promise<Buffer> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), PROCESSING_TIMEOUT_MS);

  try {
    // Encode the value so characters such as `&` or `#` in `extension`
    // cannot alter the query string.
    const endpoint = `${SANDBOX_URL}/convert?ext=${encodeURIComponent(extension)}`;
    const response = await fetch(endpoint, {
      method: 'POST',
      body: buffer,
      signal: controller.signal,
      headers: { 'Content-Type': 'application/octet-stream' },
    });

    if (!response.ok) {
      // The error body is untrusted JSON: narrow it instead of using it as
      // `any`, so a non-string `error` cannot become "[object Object]".
      const payload: unknown = await response
        .json()
        .catch(() => ({ error: 'Conversion failed' }));
      const reported =
        typeof payload === 'object' && payload !== null
          ? (payload as { error?: unknown }).error
          : undefined;
      throw new Error(
        typeof reported === 'string' && reported ? reported : 'Document conversion failed'
      );
    }

    return Buffer.from(await response.arrayBuffer());
  } catch (err) {
    if (err instanceof Error && err.name === 'AbortError') {
      throw new Error('Document conversion timed out. Try a smaller file.');
    }
    throw err;
  } finally {
    clearTimeout(timeout);
  }
}
|
||||||
|
|
||||||
|
async function convertLocally(buffer: Buffer, extension: string): Promise<Buffer> {
|
||||||
const tempId = randomUUID();
|
const tempId = randomUUID();
|
||||||
const privateTempDir = join(tmpdir(), `kaboot-${tempId}`);
|
const privateTempDir = join(tmpdir(), `kaboot-${tempId}`);
|
||||||
const inputPath = join(privateTempDir, `input${extension}`);
|
const inputPath = join(privateTempDir, `input${extension}`);
|
||||||
|
|
@ -225,11 +254,16 @@ async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr:
|
||||||
mkdirSync(privateTempDir, { mode: 0o700 });
|
mkdirSync(privateTempDir, { mode: 0o700 });
|
||||||
writeFileSync(inputPath, buffer, { mode: 0o600 });
|
writeFileSync(inputPath, buffer, { mode: 0o600 });
|
||||||
|
|
||||||
try {
|
|
||||||
execSync(
|
execSync(
|
||||||
`libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
|
`libreoffice --headless --convert-to ${modernExt} --outdir "${privateTempDir}" "${inputPath}"`,
|
||||||
{ timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
|
{ timeout: PROCESSING_TIMEOUT_MS, stdio: 'pipe', maxBuffer: 10 * 1024 * 1024 }
|
||||||
);
|
);
|
||||||
|
|
||||||
|
if (!existsSync(outputPath)) {
|
||||||
|
throw new Error('Document conversion produced no output.');
|
||||||
|
}
|
||||||
|
|
||||||
|
return readFileSync(outputPath);
|
||||||
} catch (execError) {
|
} catch (execError) {
|
||||||
const error = execError as Error & { code?: string; killed?: boolean };
|
const error = execError as Error & { code?: string; killed?: boolean };
|
||||||
if (error.killed) {
|
if (error.killed) {
|
||||||
|
|
@ -238,17 +272,20 @@ async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr:
|
||||||
if (error.code === 'ENOENT' || error.message?.includes('not found')) {
|
if (error.code === 'ENOENT' || error.message?.includes('not found')) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Legacy ${extension} files require LibreOffice for text extraction. ` +
|
`Legacy ${extension} files require LibreOffice for text extraction. ` +
|
||||||
`Please convert to .${modernExt} format or ensure LibreOffice is installed.`
|
`Please convert to .${LEGACY_TO_MODERN[extension]} format or ensure LibreOffice is installed.`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
throw new Error('Document conversion failed. The file may be corrupted.');
|
throw new Error('Document conversion failed. The file may be corrupted.');
|
||||||
|
} finally {
|
||||||
|
cleanup();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!existsSync(outputPath)) {
|
async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr: boolean = false): Promise<string> {
|
||||||
throw new Error('Document conversion produced no output.');
|
const convertedBuffer = USE_SANDBOX
|
||||||
}
|
? await convertViaSandbox(buffer, extension)
|
||||||
|
: await convertLocally(buffer, extension);
|
||||||
|
|
||||||
const convertedBuffer = readFileSync(outputPath);
|
|
||||||
const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
|
const config = useOcr ? { extractAttachments: true, ocr: true, ocrLanguage: 'eng' } : {};
|
||||||
const ast = await officeParser.parseOffice(convertedBuffer, config);
|
const ast = await officeParser.parseOffice(convertedBuffer, config);
|
||||||
let text = ast.toText();
|
let text = ast.toText();
|
||||||
|
|
@ -261,9 +298,6 @@ async function extractWithLibreOffice(buffer: Buffer, extension: string, useOcr:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return text;
|
return text;
|
||||||
} finally {
|
|
||||||
cleanup();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function processDocument(
|
export async function processDocument(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue