feat: add challenge and red-blue competitions across API and web
This commit is contained in:
parent
f5161d9add
commit
8fd3c4bb64
77 changed files with 5355 additions and 24 deletions
|
|
@ -0,0 +1,14 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from flask.testing import FlaskClient
|
||||
|
||||
|
||||
class TestWebChallenges:
    """Smoke test for the web challenges endpoint."""

    def test_list_and_detail(self, test_client_with_containers: FlaskClient):
        """Request the challenge list and check that it reports success."""
        # NOTE(review): despite the name, only the list route is requested
        # here; no detail request is made -- confirm whether one is intended.
        response = test_client_with_containers.get("/api/web/challenges")
        assert response.status_code == 200

        payload = response.get_json()
        assert payload["result"] == "success"
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from flask.testing import FlaskClient
|
||||
|
||||
|
||||
class TestWebRedBlueChallenges:
    """Smoke test for the web red-blue challenges endpoint."""

    def test_list(self, test_client_with_containers: FlaskClient):
        """Request the red-blue challenge list and check that it reports success."""
        response = test_client_with_containers.get("/api/web/red-blue-challenges")
        assert response.status_code == 200

        payload = response.get_json()
        assert payload["result"] == "success"
|
||||
|
||||
|
||||
144
api/tests/unit_tests/services/test_challenge_scorer_service.py
Normal file
144
api/tests/unit_tests/services/test_challenge_scorer_service.py
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
"""Tests for ChallengeScorerService."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from services.challenge_scorer_service import ChallengeScorerService
|
||||
|
||||
|
||||
class TestChallengeScorerService:
    """Test custom scorer plugin loading and execution."""

    # Identifiers of the scorer exercised by the weighted-scorer tests.
    _PLUGIN_ID = "builtin.weighted_scorer"
    _ENTRYPOINT = "services.scorers.weighted:WeightedScorer"

    @staticmethod
    def _weighted_config() -> dict:
        """Return a fresh copy of the weights used by every weighted-scorer test.

        A new dict is built on each call so that one test cannot leak
        mutations into another.
        """
        return {
            "success_bonus": 100.0,
            "rating_weight": 10.0,
            "time_penalty": 1.0,
            "token_penalty": 0.01,
        }

    @staticmethod
    def _ctx(**extra) -> dict:
        """Return a fresh scoring context, optionally extended with *extra* keys."""
        ctx = {
            "tenant_id": "test-tenant",
            "app_id": "test-app",
            "challenge_id": "test-challenge",
            "timeout_ms": 5000,
        }
        ctx.update(extra)
        return ctx

    @classmethod
    def _score_weighted(cls, metrics: dict, **ctx_extra):
        """Run the weighted scorer on *metrics* with the shared config and ctx."""
        return ChallengeScorerService.score_with_plugin(
            scorer_plugin_id=cls._PLUGIN_ID,
            scorer_entrypoint=cls._ENTRYPOINT,
            metrics=metrics,
            config=cls._weighted_config(),
            ctx=cls._ctx(**ctx_extra),
        )

    def test_weighted_scorer_success(self):
        """Test WeightedScorer with successful attempt."""
        result = self._score_weighted(
            {
                "succeeded": True,
                "tokens_total": 1000,
                "elapsed_ms": 5000,  # 5 seconds
                "rating": 8,
                "created_at": 1730000000000,
            },
            workflow_id="test-workflow",
        )

        # Expected: 100 (success) + 80 (8*10 rating) - 5 (5s*1.0) - 10 (1000*0.01) = 165
        # Scores are derived from fractional weights, so compare with a
        # tolerance instead of exact float equality.
        assert result["score"] == pytest.approx(165.0)
        assert "details" in result

        details = result["details"]
        assert details["base"] == pytest.approx(100.0)
        assert details["rating_contribution"] == pytest.approx(80.0)
        assert details["time_penalty"] == pytest.approx(5.0)
        assert details["token_penalty"] == pytest.approx(10.0)

    def test_weighted_scorer_failure(self):
        """Test WeightedScorer with failed attempt."""
        result = self._score_weighted(
            {
                "succeeded": False,
                "tokens_total": 500,
                "elapsed_ms": 2000,  # 2 seconds
                "rating": 3,
                "created_at": 1730000000000,
            }
        )

        # Expected: 0 (no success bonus) + 30 (3*10) - 2 (2s*1.0) - 5 (500*0.01) = 23
        assert result["score"] == pytest.approx(23.0)

    def test_weighted_scorer_minimum_zero(self):
        """Test WeightedScorer never returns negative scores."""
        result = self._score_weighted(
            {
                "succeeded": False,
                "tokens_total": 10000,  # High token count
                "elapsed_ms": 30000,  # 30 seconds
                "rating": 1,
                "created_at": 1730000000000,
            }
        )

        # Expected: 0 + 10 - 30 - 100 = -120, but clamped to 0
        assert result["score"] == pytest.approx(0.0)

    def test_scorer_with_missing_plugin(self):
        """Test error handling for missing plugin."""
        with pytest.raises(ValueError, match="Failed to load scorer plugin"):
            ChallengeScorerService.score_with_plugin(
                scorer_plugin_id="nonexistent",
                scorer_entrypoint="nonexistent.module:NonexistentScorer",
                metrics={},
                config={},
                ctx={"timeout_ms": 5000},
            )

    def test_scorer_with_invalid_entrypoint(self):
        """Test error handling when the plugin id and entrypoint are not supplied."""
        with pytest.raises(ValueError, match="scorer_plugin_id and scorer_entrypoint are required"):
            ChallengeScorerService.score_with_plugin(
                scorer_plugin_id=None,
                scorer_entrypoint=None,
                metrics={},
                config={},
                ctx={"timeout_ms": 5000},
            )
|
||||
40
api/tests/unit_tests/services/test_challenge_service.py
Normal file
40
api/tests/unit_tests/services/test_challenge_service.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from models.challenge import ChallengeAttempt
|
||||
from services.challenge_service import ChallengeService
|
||||
|
||||
|
||||
def test_evaluate_outcome_regex_match():
    """A regex rule with pattern 'secret' is expected to succeed on 'Hello SECRET'."""
    rule = {"success_type": "regex", "success_pattern": "secret"}

    matched, info = ChallengeService.evaluate_outcome("Hello SECRET", rule)

    assert matched is True
    # The returned details are expected to report which mode was applied.
    assert info.get("mode") == "regex"
|
||||
|
||||
|
||||
def test_evaluate_outcome_contains():
    """A 'contains' rule with pattern 'world' is expected to succeed on 'hello world'."""
    rule = {"success_type": "contains", "success_pattern": "world"}

    matched, _details = ChallengeService.evaluate_outcome("hello world", rule)

    assert matched is True
|
||||
|
||||
|
||||
def test_record_attempt_creates_row(mocker):
    """record_attempt should return a ChallengeAttempt and add/commit once on the session."""
    # Stand-in for db.session so nothing touches a real database.
    fake_session = mocker.MagicMock()
    attempt_fields = {
        "tenant_id": "t1",
        "challenge_id": "c1",
        "end_user_id": None,
        "account_id": None,
        "workflow_run_id": None,
        "succeeded": True,
        "score": 10.0,
    }

    recorded = ChallengeService.record_attempt(session=fake_session, **attempt_fields)

    assert isinstance(recorded, ChallengeAttempt)
    fake_session.add.assert_called_once()
    fake_session.commit.assert_called_once()
|
||||
|
||||
34
api/tests/unit_tests/services/test_red_blue_service.py
Normal file
34
api/tests/unit_tests/services/test_red_blue_service.py
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
from services.red_blue_service import RedBlueService
|
||||
|
||||
|
||||
def test_submit_prompt_creates_submission(mocker):
    """submit_prompt should return a red-team submission and add/commit once on the session."""
    fake_session = mocker.MagicMock()
    submission_fields = {
        "challenge_id": "cid",
        "tenant_id": "tid",
        "team": "red",
        "prompt": "attack",
        "account_id": "aid",
        "end_user_id": "eid",
    }

    submission = RedBlueService.submit_prompt(session=fake_session, **submission_fields)

    assert submission.team == "red"
    fake_session.add.assert_called_once()
    fake_session.commit.assert_called_once()
|
||||
|
||||
|
||||
def test_select_counterparty_submission_latest_active(mocker):
    """A red-team caller should receive the blue submission returned by the stubbed query."""
    challenge = SimpleNamespace(id="cid")
    fake_session = mocker.MagicMock()

    # Stub the query(...).filter(...).order_by(...).first() chain so that it
    # yields a blue-team submission.
    ordered_query = fake_session.query.return_value.filter.return_value.order_by.return_value
    ordered_query.first.return_value = SimpleNamespace(id="subid", team="blue")

    picked = RedBlueService.select_counterparty_submission(
        challenge=challenge,
        team="red",
        session=fake_session,
    )

    assert picked.team == "blue"
|
||||
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue