feat: add challenge and red-blue competitions across API and web

This commit is contained in:
Joey Yakimowich-Payne 2025-10-01 06:49:09 -06:00
commit 8fd3c4bb64
No known key found for this signature in database
GPG key ID: 6BFE655FA5ABD1E1
77 changed files with 5355 additions and 24 deletions

View file

@ -0,0 +1,14 @@
from __future__ import annotations
from flask.testing import FlaskClient
class TestWebChallenges:
    """Smoke test for the web-facing challenges endpoint."""

    def test_list_and_detail(self, test_client_with_containers: FlaskClient):
        """GET the challenge list and expect a 200 with a success payload.

        Only the list endpoint is requested in this test body.
        """
        # list
        response = test_client_with_containers.get("/api/web/challenges")
        assert response.status_code == 200

        payload = response.get_json()
        assert payload["result"] == "success"

View file

@ -0,0 +1,13 @@
from __future__ import annotations
from flask.testing import FlaskClient
class TestWebRedBlueChallenges:
    """Smoke test for the web-facing red/blue challenges endpoint."""

    def test_list(self, test_client_with_containers: FlaskClient):
        """GET the red/blue challenge list and expect a 200 with a success payload."""
        response = test_client_with_containers.get("/api/web/red-blue-challenges")
        assert response.status_code == 200

        payload = response.get_json()
        assert payload["result"] == "success"

View file

@ -0,0 +1,144 @@
"""Tests for ChallengeScorerService."""
from __future__ import annotations
import pytest
from services.challenge_scorer_service import ChallengeScorerService
class TestChallengeScorerService:
    """Exercise custom scorer plugin loading and execution via score_with_plugin."""

    # Weights used by every WeightedScorer case in this class.
    _WEIGHTED_CONFIG = {
        "success_bonus": 100.0,
        "rating_weight": 10.0,
        "time_penalty": 1.0,
        "token_penalty": 0.01,
    }

    @classmethod
    def _run_weighted_scorer(cls, metrics, ctx):
        """Score ``metrics`` with the built-in WeightedScorer and the shared weights."""
        return ChallengeScorerService.score_with_plugin(
            scorer_plugin_id="builtin.weighted_scorer",
            scorer_entrypoint="services.scorers.weighted:WeightedScorer",
            metrics=metrics,
            # Hand each call its own dict so no test can observe another's config.
            config=dict(cls._WEIGHTED_CONFIG),
            ctx=ctx,
        )

    def test_weighted_scorer_success(self):
        """WeightedScorer on a successful attempt: check the total and its breakdown."""
        outcome = self._run_weighted_scorer(
            metrics={
                "succeeded": True,
                "tokens_total": 1000,
                "elapsed_ms": 5000,  # 5 seconds
                "rating": 8,
                "created_at": 1730000000000,
            },
            ctx={
                "tenant_id": "test-tenant",
                "app_id": "test-app",
                "workflow_id": "test-workflow",
                "challenge_id": "test-challenge",
                "timeout_ms": 5000,
            },
        )

        # Expected: 100 (success) + 80 (8*10 rating) - 5 (5s*1.0) - 10 (1000*0.01) = 165
        assert outcome["score"] == 165.0
        assert "details" in outcome

        breakdown = outcome["details"]
        assert breakdown["base"] == 100.0
        assert breakdown["rating_contribution"] == 80.0
        assert breakdown["time_penalty"] == 5.0
        assert breakdown["token_penalty"] == 10.0

    def test_weighted_scorer_failure(self):
        """WeightedScorer on a failed attempt: no success bonus is expected."""
        outcome = self._run_weighted_scorer(
            metrics={
                "succeeded": False,
                "tokens_total": 500,
                "elapsed_ms": 2000,  # 2 seconds
                "rating": 3,
                "created_at": 1730000000000,
            },
            ctx={
                "tenant_id": "test-tenant",
                "app_id": "test-app",
                "challenge_id": "test-challenge",
                "timeout_ms": 5000,
            },
        )

        # Expected: 0 (no success bonus) + 30 (3*10) - 2 (2s*1.0) - 5 (500*0.01) = 23
        assert outcome["score"] == 23.0

    def test_weighted_scorer_minimum_zero(self):
        """WeightedScorer is expected to clamp a negative total to zero."""
        outcome = self._run_weighted_scorer(
            metrics={
                "succeeded": False,
                "tokens_total": 10000,  # High token count
                "elapsed_ms": 30000,  # 30 seconds
                "rating": 1,
                "created_at": 1730000000000,
            },
            ctx={
                "tenant_id": "test-tenant",
                "app_id": "test-app",
                "challenge_id": "test-challenge",
                "timeout_ms": 5000,
            },
        )

        # Expected: 0 + 10 - 30 - 100 = -120, but clamped to 0
        assert outcome["score"] == 0.0

    def test_scorer_with_missing_plugin(self):
        """An entrypoint that cannot be loaded is expected to raise ValueError."""
        unloadable_call = {
            "scorer_plugin_id": "nonexistent",
            "scorer_entrypoint": "nonexistent.module:NonexistentScorer",
            "metrics": {},
            "config": {},
            "ctx": {"timeout_ms": 5000},
        }
        with pytest.raises(ValueError, match="Failed to load scorer plugin"):
            ChallengeScorerService.score_with_plugin(**unloadable_call)

    def test_scorer_with_invalid_entrypoint(self):
        """Passing None for both plugin id and entrypoint is expected to raise ValueError."""
        missing_ids_call = {
            "scorer_plugin_id": None,
            "scorer_entrypoint": None,
            "metrics": {},
            "config": {},
            "ctx": {"timeout_ms": 5000},
        }
        with pytest.raises(ValueError, match="scorer_plugin_id and scorer_entrypoint are required"):
            ChallengeScorerService.score_with_plugin(**missing_ids_call)

View file

@ -0,0 +1,40 @@
from __future__ import annotations
from models.challenge import ChallengeAttempt
from services.challenge_service import ChallengeService
def test_evaluate_outcome_regex_match():
    """Regex mode: the pattern "secret" is expected to match "Hello SECRET"."""
    # NOTE(review): pattern and text differ in case, so this presumably relies
    # on case-insensitive matching in ChallengeService -- confirm.
    rule = {"success_type": "regex", "success_pattern": "secret"}

    matched, info = ChallengeService.evaluate_outcome("Hello SECRET", rule)

    assert matched is True
    assert info.get("mode") == "regex"
def test_evaluate_outcome_contains():
    """Contains mode: "world" is expected to be found in "hello world"."""
    rule = {"success_type": "contains", "success_pattern": "world"}

    matched, _details = ChallengeService.evaluate_outcome("hello world", rule)

    assert matched is True
def test_record_attempt_creates_row(mocker):
    """record_attempt should return a ChallengeAttempt and add/commit exactly once."""
    # mock db.session
    fake_session = mocker.MagicMock()
    attempt_fields = {
        "tenant_id": "t1",
        "challenge_id": "c1",
        "end_user_id": None,
        "account_id": None,
        "workflow_run_id": None,
        "succeeded": True,
        "score": 10.0,
    }

    recorded = ChallengeService.record_attempt(session=fake_session, **attempt_fields)

    assert isinstance(recorded, ChallengeAttempt)
    fake_session.add.assert_called_once()
    fake_session.commit.assert_called_once()

View file

@ -0,0 +1,34 @@
from __future__ import annotations
from types import SimpleNamespace
from services.red_blue_service import RedBlueService
def test_submit_prompt_creates_submission(mocker):
    """submit_prompt should return a red-team submission and add/commit exactly once."""
    fake_session = mocker.MagicMock()
    submission_fields = {
        "challenge_id": "cid",
        "tenant_id": "tid",
        "team": "red",
        "prompt": "attack",
        "account_id": "aid",
        "end_user_id": "eid",
    }

    submission = RedBlueService.submit_prompt(session=fake_session, **submission_fields)

    assert submission.team == "red"
    fake_session.add.assert_called_once()
    fake_session.commit.assert_called_once()
def test_select_counterparty_submission_latest_active(mocker):
    """Asking as team "red" should yield the stubbed "blue" submission."""
    challenge = SimpleNamespace(id="cid")
    fake_session = mocker.MagicMock()

    # Stub the query(...).filter(...).order_by(...).first() chain on the session.
    ordered_query = fake_session.query.return_value.filter.return_value.order_by.return_value
    ordered_query.first.return_value = SimpleNamespace(id="subid", team="blue")

    picked = RedBlueService.select_counterparty_submission(
        challenge=challenge,
        team="red",
        session=fake_session,
    )

    assert picked.team == "blue"