feat: add challenge and red-blue competitions across API and web

2025-10-01 06:49:09 -06:00 · 2025-10-01 06:49:09 -06:00 · 8fd3c4bb64
commit 8fd3c4bb64
parent f5161d9add
77 changed files with 5355 additions and 24 deletions
--- a/api/services/challenge_scorer_protocol.py
+++ b/api/services/challenge_scorer_protocol.py
@ -0,0 +1,56 @@
+"""
+Challenge scorer protocol and type definitions.
+
+Defines the interface for custom scoring plugins that compute
+numeric scores from attempt metrics for leaderboard ranking.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Protocol, TypedDict
+
+
+class ScoringContext(TypedDict, total=False):
+    """Context provided to scorer plugins.
+
+    Declared with ``total=False``, so every key is optional; read values with
+    ``.get()`` rather than indexing.
+    """
+
+    # Tenant identifier.
+    tenant_id: str
+    # Application identifier.
+    app_id: str
+    # Workflow identifier.
+    workflow_id: str
+    # Challenge identifier.
+    challenge_id: str
+    # End user identifier; None when not available.
+    end_user_id: str | None
+    # Maximum execution time for the scorer, in milliseconds.
+    timeout_ms: int
+
+
+class AttemptMetrics(TypedDict, total=False):
+    """Metrics from a challenge attempt.
+
+    Declared with ``total=False``: any key may be absent, and most values may
+    also be None, so scorers need to handle both cases.
+    """
+
+    # Whether the challenge was passed.
+    succeeded: bool
+    # Total tokens used.
+    tokens_total: int | None
+    # Time taken, in milliseconds.
+    elapsed_ms: int | None
+    # Judge rating.
+    rating: int | None
+    created_at: int | None  # epoch ms
+
+
+class ScoringResult(TypedDict, total=False):
+    """Result returned by scorer plugin.
+
+    NOTE(review): ``total=False`` makes ``score`` optional for type checkers,
+    but ChallengeScorerService.score_with_plugin rejects any result that lacks
+    a ``score`` key, so scorers must always set it.
+    """
+
+    # Computed numeric score.
+    score: float
+    # Optional additional scoring details.
+    details: dict[str, Any] | None
+
+
+class ScorerProtocol(Protocol):
+    """Protocol that all scorer plugins must implement.
+
+    This is a structural interface: any class with a matching ``score``
+    method satisfies it without inheriting from this class.
+    """
+
+    def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult:
+        """
+        Compute a numeric score from attempt metrics.
+
+        Args:
+            metrics: Attempt metrics (tokens, time, rating, etc.); keys may be
+                missing and values may be None
+            config: Plugin-specific configuration (from challenge.scoring_config)
+            ctx: Context with tenant_id, app_id, etc.
+
+        Returns:
+            ScoringResult with computed score and optional details
+        """
+        ...
--- a/api/services/challenge_scorer_service.py
+++ b/api/services/challenge_scorer_service.py
@ -0,0 +1,112 @@
+"""
+Challenge scorer service.
+
+Loads and invokes custom scorer plugins to compute scores from attempt metrics.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult
+
+logger = logging.getLogger(__name__)
+
+
+class ChallengeScorerService:
+    """Service for loading and invoking custom scorer plugins.
+
+    Loaded scorer instances are kept in a class-level cache keyed by
+    ``"<plugin_id>:<entrypoint>"``, so one instance is reused across calls.
+    """
+
+    # Budget used when ctx carries no usable "timeout_ms" value.
+    _DEFAULT_TIMEOUT_MS = 5000
+
+    _plugin_cache: dict[str, Any] = {}
+
+    @classmethod
+    def score_with_plugin(
+        cls,
+        *,
+        scorer_plugin_id: str | None,
+        scorer_entrypoint: str | None,
+        metrics: AttemptMetrics,
+        config: dict[str, Any] | None,
+        ctx: ScoringContext,
+    ) -> ScoringResult:
+        """
+        Compute score using a custom scorer plugin.
+
+        Args:
+            scorer_plugin_id: Plugin identifier (e.g., 'builtin.weighted_scorer')
+            scorer_entrypoint: Entrypoint path (e.g., 'services.scorers.weighted:WeightedScorer')
+            metrics: Attempt metrics to score
+            config: Plugin-specific configuration; None is passed to the scorer as {}
+            ctx: Scoring context; ``timeout_ms`` (default 5000) bounds how long
+                the scorer may run
+
+        Returns:
+            ScoringResult with computed score
+
+        Raises:
+            ValueError: If plugin cannot be loaded, times out, raises, or
+                returns something other than a dict with a 'score' key
+        """
+        if not scorer_plugin_id or not scorer_entrypoint:
+            raise ValueError("scorer_plugin_id and scorer_entrypoint are required for custom scoring")
+
+        # _load_plugin signals failure with None. Compare by identity so a
+        # scorer whose instance happens to be falsy (e.g. defines __len__)
+        # is not mistaken for a failed load.
+        scorer = cls._load_plugin(scorer_plugin_id, scorer_entrypoint)
+        if scorer is None:
+            raise ValueError(f"Failed to load scorer plugin: {scorer_plugin_id}:{scorer_entrypoint}")
+
+        # Fall back to the default for missing, non-numeric or non-positive budgets.
+        timeout_ms = ctx.get("timeout_ms", cls._DEFAULT_TIMEOUT_MS)
+        usable = isinstance(timeout_ms, (int, float)) and not isinstance(timeout_ms, bool) and timeout_ms > 0
+        if not usable:
+            timeout_ms = cls._DEFAULT_TIMEOUT_MS
+
+        try:
+            result = cls._invoke_with_timeout(scorer, metrics, config or {}, ctx, timeout_ms / 1000.0)
+            if not isinstance(result, dict) or "score" not in result:
+                raise ValueError("Scorer must return a dict with 'score' key")
+            return result
+        except Exception as e:
+            logger.error("Scorer plugin %s failed: %s", scorer_plugin_id, e, exc_info=True)
+            # Chain the cause so the original traceback survives the re-raise.
+            raise ValueError(f"Scorer plugin execution failed: {e}") from e
+
+    @staticmethod
+    def _invoke_with_timeout(
+        scorer: Any,
+        metrics: AttemptMetrics,
+        config: dict[str, Any],
+        ctx: ScoringContext,
+        timeout_s: float,
+    ) -> Any:
+        """
+        Run ``scorer.score`` in a worker thread and wait at most ``timeout_s`` seconds.
+
+        Python cannot forcibly stop a thread, so a scorer that overruns keeps
+        running in the background; the worker is a daemon thread so it never
+        blocks interpreter shutdown, and its late result is discarded.
+
+        Raises:
+            TimeoutError: If the scorer has not finished within ``timeout_s``
+            BaseException: Whatever the scorer itself raised, re-raised unchanged
+        """
+        import contextvars
+        import threading
+
+        outcome: dict[str, Any] = {}
+
+        def _run() -> None:
+            try:
+                outcome["result"] = scorer.score(metrics, config, ctx)
+            except BaseException as exc:  # handed back to the calling thread below
+                outcome["error"] = exc
+
+        # Run inside a copy of the caller's context so context-local state
+        # visible to the caller is also visible to the scorer.
+        caller_context = contextvars.copy_context()
+        worker = threading.Thread(
+            target=caller_context.run,
+            args=(_run,),
+            name="challenge-scorer",
+            daemon=True,
+        )
+        worker.start()
+        worker.join(timeout_s)
+
+        if worker.is_alive():
+            raise TimeoutError(f"scorer did not finish within {timeout_s:g}s")
+        if "error" in outcome:
+            raise outcome["error"]
+        return outcome["result"]
+
+    @classmethod
+    def _load_plugin(cls, plugin_id: str, entrypoint: str) -> Any:
+        """
+        Load a scorer plugin by entrypoint.
+
+        Args:
+            plugin_id: Plugin identifier for caching
+            entrypoint: Python path like 'pkg.module:ClassName'
+
+        Returns:
+            Scorer instance or None if loading fails. Failures are logged and
+            are not cached, so a later call retries the import.
+        """
+        cache_key = f"{plugin_id}:{entrypoint}"
+        if cache_key in cls._plugin_cache:
+            return cls._plugin_cache[cache_key]
+
+        try:
+            # Parse entrypoint: 'pkg.module:ClassName'
+            module_path, separator, class_name = entrypoint.partition(":")
+            if not separator:
+                raise ValueError(f"Invalid entrypoint format: {entrypoint}. Expected 'module:ClassName'")
+
+            # NOTE(security): the entrypoint is imported as given, and importing
+            # a module executes its top-level code. If entrypoints can be set by
+            # anyone who is not fully trusted, restrict them to an allow-list
+            # (e.g. a 'services.scorers.' prefix) before this call.
+            import importlib
+
+            module = importlib.import_module(module_path)
+            scorer_class = getattr(module, class_name)
+
+            # Instantiate
+            scorer = scorer_class()
+
+            # Fail at load time, with a clear message, rather than on first use.
+            if not callable(getattr(scorer, "score", None)):
+                raise TypeError(f"{entrypoint} does not provide a callable 'score' method")
+
+            # Cache it
+            cls._plugin_cache[cache_key] = scorer
+            logger.info("Loaded scorer plugin: %s from %s", plugin_id, entrypoint)
+            return scorer
+
+        except Exception as e:
+            logger.error("Failed to load scorer plugin %s:%s: %s", plugin_id, entrypoint, e, exc_info=True)
+            return None
+
+    @classmethod
+    def clear_cache(cls) -> None:
+        """Clear the plugin cache (useful for testing)."""
+        cls._plugin_cache.clear()
--- a/api/services/challenge_service.py
+++ b/api/services/challenge_service.py
@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import re
+from collections.abc import Mapping
+from typing import Any
+
+from sqlalchemy.orm import Session
+
+from extensions.ext_database import db
+from models.challenge import Challenge, ChallengeAttempt
+
+
+class ChallengeService:
+    """Outcome evaluation and attempt persistence for challenges."""
+
+    @staticmethod
+    def evaluate_outcome(output_text: str, cfg: Mapping[str, Any]) -> tuple[bool, dict[str, Any]]:
+        """
+        Decide whether ``output_text`` satisfies the success rule in ``cfg``.
+
+        Args:
+            output_text: Text to test. None is treated as empty text.
+            cfg: Reads ``success_type`` ('regex' by default, or 'contains') and
+                ``success_pattern``.
+
+        Returns:
+            ``(succeeded, details)``. ``details`` always carries ``mode`` and,
+            depending on the branch, ``matched``, ``error`` or ``info``.
+            Matching is case-insensitive in both modes. Bad input never raises:
+            an invalid regex or a non-string pattern yields ``False`` with an
+            ``error`` entry.
+        """
+        success_type = cfg.get("success_type", "regex")
+        pattern = cfg.get("success_pattern")
+        # Upstream output can be missing; avoid a TypeError/AttributeError below.
+        text = "" if output_text is None else str(output_text)
+
+        # A non-string pattern (e.g. a number in the stored config) would crash
+        # re.search / .lower(); report it the same way an invalid regex is reported.
+        if pattern and not isinstance(pattern, str) and success_type in ("regex", "contains"):
+            return False, {"mode": success_type, "error": "invalid_pattern: expected a string"}
+
+        if success_type == "regex" and pattern:
+            # NOTE(review): the pattern comes from challenge configuration and
+            # `re` has no timeout, so a pathological pattern can stall on long
+            # output. Consider bounding text length if authors are not trusted.
+            try:
+                matched = re.search(pattern, text, flags=re.IGNORECASE | re.MULTILINE) is not None
+            except re.error as e:
+                return False, {"mode": "regex", "error": f"invalid_regex: {e}"}
+            return matched, {"mode": "regex", "matched": matched}
+
+        if success_type == "contains" and pattern:
+            return (pattern.lower() in text.lower()), {"mode": "contains"}
+
+        return False, {"mode": success_type, "info": "no_pattern_or_unsupported"}
+
+    @staticmethod
+    def record_attempt(
+        *,
+        tenant_id: str,
+        challenge_id: str,
+        end_user_id: str | None,
+        account_id: str | None,
+        workflow_run_id: str | None,
+        succeeded: bool,
+        score: float | None = None,
+        judge_rating: int | None = None,
+        judge_feedback: str | None = None,
+        judge_output_raw: dict[str, Any] | None = None,
+        tokens_total: int | None = None,
+        elapsed_ms: int | None = None,
+        session: Session | None = None,
+    ) -> ChallengeAttempt:
+        """
+        Persist one challenge attempt and return the stored row.
+
+        The attempt is added to ``session`` (or ``db.session`` when none is
+        given) and the session is committed immediately, which also commits any
+        other pending work on that session. If the commit fails the session is
+        rolled back and the original exception is re-raised.
+        """
+        sess = session or db.session
+        attempt = ChallengeAttempt()
+        attempt.tenant_id = tenant_id
+        attempt.challenge_id = challenge_id
+        attempt.end_user_id = end_user_id
+        attempt.account_id = account_id
+        attempt.workflow_run_id = workflow_run_id
+        attempt.succeeded = succeeded
+        attempt.score = score
+        attempt.judge_rating = judge_rating
+        attempt.judge_feedback = judge_feedback
+        attempt.judge_output_raw = judge_output_raw
+        attempt.tokens_total = tokens_total
+        attempt.elapsed_ms = elapsed_ms
+        sess.add(attempt)
+        try:
+            sess.commit()
+        except Exception:
+            # A failed commit leaves the session unusable until rolled back.
+            sess.rollback()
+            raise
+        return attempt
+
+
--- a/api/services/red_blue_service.py
+++ b/api/services/red_blue_service.py
@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from typing import Any
+
+from sqlalchemy.orm import Session
+
+from extensions.ext_database import db
+from models.red_blue import RedBlueChallenge, TeamPairing, TeamSubmission
+
+
+class RedBlueService:
+    """Persistence and pairing helpers for red-vs-blue team challenges."""
+
+    # The only two team names the pairing logic understands, mapped to their opponent.
+    _OPPOSITE_TEAM = {"red": "blue", "blue": "red"}
+
+    @staticmethod
+    def submit_prompt(
+        *,
+        challenge_id: str,
+        tenant_id: str,
+        team: str,
+        prompt: str,
+        account_id: str | None,
+        end_user_id: str | None,
+        session: Session | None = None,
+    ) -> TeamSubmission:
+        """
+        Store a team's prompt submission and return the stored row.
+
+        The row is added to ``session`` (or ``db.session`` when none is given)
+        and the session is committed immediately. If the commit fails the
+        session is rolled back and the original exception is re-raised.
+        """
+        sess = session or db.session
+        sub = TeamSubmission()
+        sub.red_blue_challenge_id = challenge_id
+        sub.tenant_id = tenant_id
+        sub.team = team
+        sub.prompt = prompt
+        sub.account_id = account_id
+        sub.end_user_id = end_user_id
+        sess.add(sub)
+        try:
+            sess.commit()
+        except Exception:
+            # A failed commit leaves the session unusable until rolled back.
+            sess.rollback()
+            raise
+        return sub
+
+    @staticmethod
+    def select_counterparty_submission(
+        *,
+        challenge: RedBlueChallenge,
+        team: str,
+        session: Session | None = None,
+    ) -> TeamSubmission | None:
+        """
+        Return the opposing team's most recent active submission, if any.
+
+        Args:
+            challenge: Challenge whose submissions are searched (by ``challenge.id``)
+            team: The caller's own team, 'red' or 'blue' (case and surrounding
+                whitespace are ignored)
+            session: Session to query; defaults to ``db.session``
+
+        Returns:
+            The newest active submission from the other team, or None when
+            there is none.
+
+        Raises:
+            ValueError: If ``team`` is neither 'red' nor 'blue'. Previously any
+                unrecognised value was silently treated as 'blue'.
+        """
+        team_key = team.strip().lower() if isinstance(team, str) else None
+        opposite = RedBlueService._OPPOSITE_TEAM.get(team_key)
+        if opposite is None:
+            raise ValueError(f"Unknown team: {team!r}. Expected 'red' or 'blue'")
+
+        sess = session or db.session
+        # Simplest policy: latest active from opposite team
+        return (
+            sess.query(TeamSubmission)
+            .filter(
+                TeamSubmission.red_blue_challenge_id == challenge.id,
+                TeamSubmission.team == opposite,
+                TeamSubmission.active.is_(True),
+            )
+            .order_by(TeamSubmission.created_at.desc())
+            .first()
+        )
+
+    @staticmethod
+    def record_pairing(
+        *,
+        challenge_id: str,
+        tenant_id: str,
+        attack_submission_id: str | None,
+        defense_submission_id: str | None,
+        judge_output_raw: dict[str, Any] | None,
+        categories: dict[str, Any] | None,
+        judge_rating: int | None,
+        judge_feedback: str | None,
+        red_points: float,
+        blue_points: float,
+        tokens_total: int | None,
+        elapsed_ms: int | None,
+        session: Session | None = None,
+    ) -> TeamPairing:
+        """
+        Store the judged result of one attack/defense pairing and return the row.
+
+        The row is added to ``session`` (or ``db.session`` when none is given)
+        and the session is committed immediately. If the commit fails the
+        session is rolled back and the original exception is re-raised.
+        """
+        sess = session or db.session
+        pairing = TeamPairing()
+        pairing.red_blue_challenge_id = challenge_id
+        pairing.tenant_id = tenant_id
+        pairing.attack_submission_id = attack_submission_id
+        pairing.defense_submission_id = defense_submission_id
+        pairing.judge_output_raw = judge_output_raw
+        pairing.categories = categories
+        pairing.judge_rating = judge_rating
+        pairing.judge_feedback = judge_feedback
+        pairing.red_points = red_points
+        pairing.blue_points = blue_points
+        pairing.tokens_total = tokens_total
+        pairing.elapsed_ms = elapsed_ms
+        sess.add(pairing)
+        try:
+            sess.commit()
+        except Exception:
+            # A failed commit leaves the session unusable until rolled back.
+            sess.rollback()
+            raise
+        return pairing
+
+
--- a/api/services/scorers/README.md
+++ b/api/services/scorers/README.md
@ -0,0 +1,144 @@
+# Custom Scorer Plugins
+
+This directory contains custom scorer plugins for challenge leaderboards.
+
+## Overview
+
+Scorers compute numeric scores from challenge attempt metrics (tokens, time, rating, success) for ranking on leaderboards when `scoring_strategy = 'custom'`.
+
+## Built-in Scorers
+
+### WeightedScorer
+
+**Entrypoint:** `services.scorers.weighted:WeightedScorer`
+
+Computes a weighted score combining multiple metrics with configurable bonuses and penalties.
+
+**Formula:**
+```
+score = success_bonus
+        + (rating × rating_weight)
+        - (elapsed_seconds × time_penalty)
+        - (tokens × token_penalty)
+```
+
+**Configuration:**
+- `success_bonus` (float, default: 100): Base points for successful attempts
+- `rating_weight` (float, default: 10): Multiplier for judge rating (0-10)
+- `time_penalty` (float, default: 1.0): Penalty per second elapsed
+- `token_penalty` (float, default: 0.01): Penalty per token used
+
+**Example Configuration:**
+```json
+{
+  "success_bonus": 100.0,
+  "rating_weight": 10.0,
+  "time_penalty": 1.0,
+  "token_penalty": 0.01
+}
+```
+
+**Example Challenge Setup (via API):**
+```python
+{
+  "name": "Advanced Prompt Challenge",
+  "scoring_strategy": "custom",
+  "scoring_plugin_id": "builtin.weighted_scorer",
+  "scoring_entrypoint": "services.scorers.weighted:WeightedScorer",
+  "scoring_config": {
+    "success_bonus": 100.0,
+    "rating_weight": 15.0,
+    "time_penalty": 0.5,
+    "token_penalty": 0.02
+  }
+}
+```
+
+## Creating Custom Scorers
+
+### 1. Implement the ScorerProtocol
+
+Create a new file in this directory (e.g., `custom.py`):
+
+```python
+from typing import Any
+from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult
+
+class MyCustomScorer:
+    def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult:
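+        # Access metrics; a key may be present with a None value, so use `or`
+        # instead of a .get() default (which only applies when the key is missing)
+        succeeded = bool(metrics.get('succeeded'))
+        tokens = metrics.get('tokens_total') or 0
+        elapsed_ms = metrics.get('elapsed_ms') or 0
+        rating = metrics.get('rating') or 0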
+
+        # Access configuration
+        multiplier = config.get('multiplier', 1.0)
+
+        # Compute score
+        score = (rating * multiplier) if succeeded else 0.0
+
+        return {
+            'score': score,
+            'details': {  # optional
+                'multiplier_used': multiplier
+            }
+        }
+```
+
+### 2. Register in Challenge
+
+Set the challenge's scoring fields:
+
+```python
+challenge.scoring_strategy = 'custom'
+challenge.scoring_plugin_id = 'my_custom_scorer'
+challenge.scoring_entrypoint = 'services.scorers.custom:MyCustomScorer'
+challenge.scoring_config = {
+    'multiplier': 2.0
+}
+```
+
+### 3. Testing
+
+Create tests in `api/tests/unit_tests/services/` following the pattern in `test_challenge_scorer_service.py`.
+
+## Protocol Reference
+
+### Input Types
+
+**AttemptMetrics:**
+- `succeeded` (bool): Whether the challenge was passed
+- `tokens_total` (int | None): Total tokens used
+- `elapsed_ms` (int | None): Time taken in milliseconds
+- `rating` (int | None): Judge rating (0-10)
+- `created_at` (int | None): Timestamp in epoch milliseconds
+
+**ScoringContext:**
+- `tenant_id` (str): Tenant identifier
+- `app_id` (str): Application identifier
+- `workflow_id` (str): Workflow identifier
+- `challenge_id` (str): Challenge identifier
+- `end_user_id` (str | None): End user identifier (if available)
+- `timeout_ms` (int): Maximum execution time
+
+### Output Type
+
+**ScoringResult:**
+- `score` (float, required): Computed numeric score
+- `details` (dict[str, Any] | None, optional): Additional scoring details
+
+## Error Handling
+
+- Scorers must return a dict with a `score` key
+- Exceptions are caught and logged; the attempt is recorded with `score=None`
+- Scorers are executed with a timeout (default: 5s)
+- Scorers should never return negative scores; use `max(score, 0.0)` to clamp
+
+## Best Practices
+
+1. **Keep it simple**: Scoring should be fast and deterministic
+2. **Validate config**: Check configuration values and provide defaults
+3. **Clamp scores**: Ensure scores are non-negative
+4. **Document formula**: Clearly explain how your scorer works
+5. **Test edge cases**: Test with missing metrics, zeros, nulls
--- a/api/services/scorers/init.py
+++ b/api/services/scorers/init.py
@ -0,0 +1 @@
+"""Built-in scorer plugins."""
--- a/api/services/scorers/weighted.py
+++ b/api/services/scorers/weighted.py
@ -0,0 +1,66 @@
+"""
+Weighted scorer plugin.
+
+Computes a weighted score based on success bonus, rating, elapsed time, and token usage.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult
+
+
+class WeightedScorer:
+    """
+    Example weighted scorer that combines multiple metrics.
+
+    Configuration options:
+    - success_bonus (float): Base points for successful attempt (default: 100)
+    - rating_weight (float): Multiplier for judge rating (default: 10)
+    - time_penalty (float): Penalty per second elapsed (default: 1.0)
+    - token_penalty (float): Penalty per token used (default: 0.01)
+
+    A configuration value that is missing, null, non-numeric or non-finite
+    falls back to its default instead of raising. Missing or null metrics
+    count as 0.
+
+    Formula:
+        score = success_bonus
+                + (rating * rating_weight)
+                - (elapsed_seconds * time_penalty)
+                - (tokens * token_penalty)
+
+    The result is clamped so it is never negative.
+    """
+
+    @staticmethod
+    def _number(value: Any, default: float) -> float:
+        """Return ``value`` as a finite float, or ``default`` when it is unusable."""
+        import math
+
+        # bool is an int subclass; a stray true/false is not a meaningful weight.
+        if value is None or isinstance(value, bool):
+            return default
+        try:
+            number = float(value)
+        except (TypeError, ValueError):
+            return default
+        return number if math.isfinite(number) else default
+
+    def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult:
+        """
+        Compute weighted score from metrics.
+
+        Args:
+            metrics: Reads ``succeeded``, ``rating``, ``elapsed_ms`` and ``tokens_total``
+            config: Weights described on the class; None is treated as {}
+            ctx: Unused by this scorer
+
+        Returns:
+            Dict with the clamped ``score`` and a ``details`` breakdown of each term
+        """
+        settings = config or {}
+        number = self._number
+
+        # Base score for success
+        base = number(settings.get("success_bonus"), 100.0) if metrics.get("succeeded") else 0.0
+
+        # Add rating contribution
+        rating_score = number(metrics.get("rating"), 0.0) * number(settings.get("rating_weight"), 10.0)
+
+        # Subtract time penalty; a negative duration must not turn into a bonus
+        elapsed_seconds = max(number(metrics.get("elapsed_ms"), 0.0), 0.0) / 1000.0
+        time_score = elapsed_seconds * number(settings.get("time_penalty"), 1.0)
+
+        # Subtract token penalty; same guard against negative counts
+        tokens = max(number(metrics.get("tokens_total"), 0.0), 0.0)
+        token_score = tokens * number(settings.get("token_penalty"), 0.01)
+
+        # Compute final score (never negative)
+        final_score = max(base + rating_score - time_score - token_score, 0.0)
+
+        return {
+            "score": final_score,
+            "details": {
+                "base": base,
+                "rating_contribution": rating_score,
+                "time_penalty": time_score,
+                "token_penalty": token_score,
+            },
+        }