feat: add challenge and red-blue competitions across API and web

This commit is contained in:
Joey Yakimowich-Payne 2025-10-01 06:49:09 -06:00
commit 8fd3c4bb64
No known key found for this signature in database
GPG key ID: 6BFE655FA5ABD1E1
77 changed files with 5355 additions and 24 deletions

View file

@ -129,6 +129,10 @@ from .workspace import (
workspace,
)
# Import custom challenge controllers
from . import challenges as challenges
from . import red_blue_challenges as red_blue_challenges
api.add_namespace(console_ns)
__all__ = [
@ -204,4 +208,6 @@ __all__ = [
"workflow_run",
"workflow_statistic",
"workspace",
"challenges",
"red_blue_challenges",
]

View file

@ -12,7 +12,6 @@ from controllers.console.app.wraps import get_app_model
from controllers.console.wraps import (
account_initialization_required,
cloud_edition_billing_resource_check,
enterprise_license_required,
setup_required,
)
from core.ops.ops_trace_manager import OpsTraceManager
@ -53,7 +52,6 @@ class AppListApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def get(self):
"""Get app list"""
@ -166,7 +164,6 @@ class AppApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
@get_app_model
@marshal_with(app_detail_fields_with_site)
def get(self, app_model):

View file

@ -0,0 +1,154 @@
from __future__ import annotations
from flask_restx import Resource, reqparse
from controllers.console import console_ns as api
from controllers.console.wraps import (
account_initialization_required,
setup_required,
)
from libs.login import login_required
from extensions.ext_database import db
from libs.login import current_user
from models.challenge import Challenge
@api.route("/challenges")
class ChallengeListCreateApi(Resource):
@api.doc("list_challenges")
@setup_required
@login_required
@account_initialization_required
def get(self):
tenant_id = current_user.current_tenant_id
if not tenant_id:
# no active workspace selected; return empty list to avoid leaking data
return {"result": "success", "data": []}
rows = (
db.session.query(Challenge)
.filter(Challenge.tenant_id == tenant_id)
.order_by(Challenge.created_at.desc())
.all()
)
return {
"result": "success",
"data": [
{
"id": r.id,
"name": r.name,
"description": r.description,
"goal": r.goal,
"is_active": r.is_active,
"success_type": r.success_type,
"success_pattern": r.success_pattern,
"scoring_strategy": r.scoring_strategy,
"app_id": r.app_id,
"workflow_id": r.workflow_id,
}
for r in rows
],
}
@api.doc("create_challenge")
@setup_required
@login_required
@account_initialization_required
def post(self):
parser = reqparse.RequestParser()
parser.add_argument("tenant_id", type=str, required=False, location="json")
parser.add_argument("app_id", type=str, required=True, location="json")
parser.add_argument("workflow_id", type=str, required=False, location="json")
parser.add_argument("name", type=str, required=True, location="json")
parser.add_argument("description", type=str, required=False, location="json")
parser.add_argument("goal", type=str, required=False, location="json")
parser.add_argument("success_type", type=str, required=False, location="json")
parser.add_argument("success_pattern", type=str, required=False, location="json")
parser.add_argument("scoring_strategy", type=str, required=False, location="json")
parser.add_argument("is_active", type=bool, required=False, location="json")
args = parser.parse_args()
c = Challenge()
c.tenant_id = args.get("tenant_id") or current_user.current_tenant_id
c.app_id = args["app_id"]
# Convert empty string to None for UUID field
workflow_id = args.get("workflow_id")
c.workflow_id = workflow_id if workflow_id else None
c.name = args["name"]
c.description = args.get("description")
c.goal = args.get("goal")
if args.get("success_type"):
c.success_type = args["success_type"]
c.success_pattern = args.get("success_pattern")
if args.get("scoring_strategy"):
c.scoring_strategy = args["scoring_strategy"]
if args.get("is_active") is not None:
c.is_active = args["is_active"]
db.session.add(c)
db.session.commit()
return {"result": "success", "data": {"id": c.id}}, 201
@api.route("/challenges/<uuid:challenge_id>")
class ChallengeDetailApi(Resource):
@api.doc("get_challenge")
@setup_required
@login_required
@account_initialization_required
def get(self, challenge_id):
c = db.session.get(Challenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
return {
"result": "success",
"data": {
"id": c.id,
"name": c.name,
"description": c.description,
"goal": c.goal,
"is_active": c.is_active,
"success_type": c.success_type,
"success_pattern": c.success_pattern,
"scoring_strategy": c.scoring_strategy,
"app_id": c.app_id,
"workflow_id": c.workflow_id,
},
}
@api.doc("update_challenge")
@setup_required
@login_required
@account_initialization_required
def patch(self, challenge_id):
c = db.session.get(Challenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
parser = reqparse.RequestParser()
parser.add_argument("name", type=str, required=False, location="json")
parser.add_argument("description", type=str, required=False, location="json")
parser.add_argument("goal", type=str, required=False, location="json")
parser.add_argument("is_active", type=bool, required=False, location="json")
args = parser.parse_args()
if args.get("name"):
c.name = args["name"]
if args.get("description") is not None:
c.description = args["description"]
if args.get("goal") is not None:
c.goal = args["goal"]
if args.get("is_active") is not None:
c.is_active = bool(args["is_active"])
db.session.commit()
return {"result": "success"}
@api.doc("delete_challenge")
@setup_required
@login_required
@account_initialization_required
def delete(self, challenge_id):
c = db.session.get(Challenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
db.session.delete(c)
db.session.commit()
return {"result": "success"}, 204

View file

@ -5,7 +5,7 @@ from flask_restx import Resource, marshal_with, reqparse
from werkzeug.exceptions import NotFound
from controllers.console import console_ns
from controllers.console.wraps import account_initialization_required, enterprise_license_required, setup_required
from controllers.console.wraps import account_initialization_required, setup_required
from fields.dataset_fields import dataset_metadata_fields
from libs.login import login_required
from services.dataset_service import DatasetService
@ -21,7 +21,6 @@ class DatasetMetadataCreateApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
@marshal_with(dataset_metadata_fields)
def post(self, dataset_id):
parser = reqparse.RequestParser()
@ -42,7 +41,6 @@ class DatasetMetadataCreateApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def get(self, dataset_id):
dataset_id_str = str(dataset_id)
dataset = DatasetService.get_dataset(dataset_id_str)
@ -56,7 +54,6 @@ class DatasetMetadataApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
@marshal_with(dataset_metadata_fields)
def patch(self, dataset_id, metadata_id):
parser = reqparse.RequestParser()
@ -77,7 +74,6 @@ class DatasetMetadataApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def delete(self, dataset_id, metadata_id):
dataset_id_str = str(dataset_id)
metadata_id_str = str(metadata_id)
@ -95,7 +91,6 @@ class DatasetMetadataBuiltInFieldApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def get(self):
built_in_fields = MetadataService.get_built_in_fields()
return {"fields": built_in_fields}, 200
@ -106,7 +101,6 @@ class DatasetMetadataBuiltInFieldActionApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def post(self, dataset_id, action: Literal["enable", "disable"]):
dataset_id_str = str(dataset_id)
dataset = DatasetService.get_dataset(dataset_id_str)
@ -126,7 +120,6 @@ class DocumentMetadataEditApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def post(self, dataset_id):
dataset_id_str = str(dataset_id)
dataset = DatasetService.get_dataset(dataset_id_str)

View file

@ -7,7 +7,6 @@ from sqlalchemy.orm import Session
from controllers.console import console_ns
from controllers.console.wraps import (
account_initialization_required,
enterprise_license_required,
knowledge_pipeline_publish_enabled,
setup_required,
)
@ -37,7 +36,6 @@ class PipelineTemplateListApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def get(self):
type = request.args.get("type", default="built-in", type=str)
language = request.args.get("language", default="en-US", type=str)
@ -51,7 +49,6 @@ class PipelineTemplateDetailApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def get(self, template_id: str):
type = request.args.get("type", default="built-in", type=str)
rag_pipeline_service = RagPipelineService()
@ -64,7 +61,6 @@ class CustomizedPipelineTemplateApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def patch(self, template_id: str):
parser = reqparse.RequestParser()
parser.add_argument(
@ -95,7 +91,6 @@ class CustomizedPipelineTemplateApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def delete(self, template_id: str):
RagPipelineService.delete_customized_pipeline_template(template_id)
return 200
@ -103,7 +98,6 @@ class CustomizedPipelineTemplateApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def post(self, template_id: str):
with Session(db.engine) as session:
template = (
@ -120,7 +114,6 @@ class PublishCustomizedPipelineTemplateApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
@knowledge_pipeline_publish_enabled
def post(self, pipeline_id: str):
parser = reqparse.RequestParser()

View file

@ -0,0 +1,145 @@
from __future__ import annotations
from flask_restx import Resource, reqparse
from controllers.console import console_ns as api
from controllers.console.wraps import (
account_initialization_required,
setup_required,
)
from extensions.ext_database import db
from libs.login import current_user, login_required
from models.red_blue import RedBlueChallenge, TeamPairing
@api.route("/red-blue-challenges")
class RedBlueListCreateApi(Resource):
@api.doc("list_red_blue_challenges")
@setup_required
@login_required
@account_initialization_required
def get(self):
tenant_id = current_user.current_tenant_id
if not tenant_id:
return {"result": "success", "data": []}
rows = (
db.session.query(RedBlueChallenge)
.filter(RedBlueChallenge.tenant_id == tenant_id)
.order_by(RedBlueChallenge.created_at.desc())
.all()
)
return {
"result": "success",
"data": [
{
"id": r.id,
"name": r.name,
"description": r.description,
"is_active": r.is_active,
}
for r in rows
],
}
@api.doc("create_red_blue_challenge")
@setup_required
@login_required
@account_initialization_required
def post(self):
parser = reqparse.RequestParser()
parser.add_argument("tenant_id", type=str, required=True, location="json")
parser.add_argument("app_id", type=str, required=True, location="json")
parser.add_argument("name", type=str, required=True, location="json")
parser.add_argument("description", type=str, required=False, location="json")
parser.add_argument("judge_suite", type=dict, required=True, location="json")
args = parser.parse_args()
c = RedBlueChallenge()
c.tenant_id = args.get("tenant_id") or current_user.current_tenant_id
c.app_id = args["app_id"]
c.name = args["name"]
c.description = args.get("description")
c.judge_suite = args["judge_suite"]
db.session.add(c)
db.session.commit()
return {"result": "success", "data": {"id": c.id}}, 201
@api.route("/red-blue-challenges/<uuid:challenge_id>")
class RedBlueDetailApi(Resource):
@api.doc("get_red_blue_challenge")
@setup_required
@login_required
@account_initialization_required
def get(self, challenge_id):
c = db.session.get(RedBlueChallenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
return {
"result": "success",
"data": {"id": c.id, "name": c.name, "description": c.description, "is_active": c.is_active},
}
@api.doc("update_red_blue_challenge")
@setup_required
@login_required
@account_initialization_required
def patch(self, challenge_id):
c = db.session.get(RedBlueChallenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
parser = reqparse.RequestParser()
parser.add_argument("name", type=str, required=False, location="json")
parser.add_argument("description", type=str, required=False, location="json")
parser.add_argument("is_active", type=bool, required=False, location="json")
args = parser.parse_args()
if args.get("name"):
c.name = args["name"]
if args.get("description") is not None:
c.description = args["description"]
if args.get("is_active") is not None:
c.is_active = bool(args["is_active"])
db.session.commit()
return {"result": "success"}
@api.doc("delete_red_blue_challenge")
@setup_required
@login_required
@account_initialization_required
def delete(self, challenge_id):
c = db.session.get(RedBlueChallenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
db.session.delete(c)
db.session.commit()
return {"result": "success"}, 204
@api.route("/red-blue-challenges/<uuid:challenge_id>/pairings")
class RedBluePairingsApi(Resource):
@api.doc("list_red_blue_pairings")
@setup_required
@login_required
@account_initialization_required
def get(self, challenge_id):
rows = (
db.session.query(TeamPairing)
.filter(TeamPairing.red_blue_challenge_id == str(challenge_id))
.order_by(TeamPairing.created_at.desc())
.limit(100)
.all()
)
return {
"result": "success",
"data": [
{
"id": r.id,
"red_points": r.red_points,
"blue_points": r.blue_points,
"judge_rating": r.judge_rating,
"created_at": r.created_at.isoformat() if hasattr(r.created_at, "isoformat") else None,
}
for r in rows
],
}

View file

@ -29,7 +29,6 @@ from controllers.console.wraps import (
account_initialization_required,
cloud_edition_billing_enabled,
enable_change_email,
enterprise_license_required,
only_edition_cloud,
setup_required,
)
@ -102,7 +101,6 @@ class AccountProfileApi(Resource):
@login_required
@account_initialization_required
@marshal_with(account_fields)
@enterprise_license_required
def get(self):
if not isinstance(current_user, Account):
raise ValueError("Invalid user account")

View file

@ -13,7 +13,6 @@ from configs import dify_config
from controllers.console import api
from controllers.console.wraps import (
account_initialization_required,
enterprise_license_required,
setup_required,
)
from core.mcp.auth.auth_flow import auth, handle_callback
@ -667,7 +666,6 @@ class ToolLabelsApi(Resource):
@setup_required
@login_required
@account_initialization_required
@enterprise_license_required
def get(self):
return jsonable_encoder(ToolLabelsService.list_tool_labels())

View file

@ -24,12 +24,15 @@ from . import (
files,
forgot_password,
login,
register,
message,
passport,
remote_files,
saved_message,
site,
workflow,
challenges,
red_blue_challenges,
)
api.add_namespace(web_ns)
@ -45,6 +48,7 @@ __all__ = [
"files",
"forgot_password",
"login",
"register",
"message",
"passport",
"remote_files",
@ -52,4 +56,6 @@ __all__ = [
"site",
"web_ns",
"workflow",
"challenges",
"red_blue_challenges",
]

View file

@ -0,0 +1,121 @@
from __future__ import annotations
from flask_restx import Resource
from controllers.web import web_ns
from extensions.ext_database import db
from sqlalchemy import select
from models.challenge import Challenge, ChallengeAttempt
from models.model import App, Site
@web_ns.route("/challenges")
class ChallengeListApi(Resource):
def get(self):
q = db.session.query(Challenge).filter(Challenge.is_active.is_(True)).order_by(Challenge.created_at.desc())
items = []
for c in q.all():
app = db.session.get(App, c.app_id) if c.app_id else None
site_code = None
if c.app_id:
site = db.session.execute(
select(Site).where(Site.app_id == c.app_id, Site.status == "normal")
).scalar_one_or_none()
site_code = site.code if site else None
items.append({
"id": c.id,
"name": c.name,
"description": c.description,
"goal": c.goal,
"app_id": c.app_id,
"workflow_id": c.workflow_id,
"app_mode": app.mode if app else None,
"app_site_code": site_code,
})
return {"result": "success", "data": items}
@web_ns.route("/challenges/<uuid:challenge_id>")
class ChallengeDetailApi(Resource):
def get(self, challenge_id):
c = db.session.get(Challenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
app = db.session.get(App, c.app_id) if c.app_id else None
site_code = None
if c.app_id:
site = db.session.execute(
select(Site).where(Site.app_id == c.app_id, Site.status == "normal")
).scalar_one_or_none()
site_code = site.code if site else None
data = {
"id": c.id,
"name": c.name,
"description": c.description,
"goal": c.goal,
"is_active": c.is_active,
"app_id": c.app_id,
"workflow_id": c.workflow_id,
"app_mode": app.mode if app else None,
"app_site_code": site_code,
}
return {"result": "success", "data": data}
@web_ns.route("/challenges/<uuid:challenge_id>/leaderboard")
class ChallengeLeaderboardApi(Resource):
def get(self, challenge_id):
limit = 20
# Get the challenge to determine scoring strategy
challenge = db.session.get(Challenge, str(challenge_id))
if not challenge:
return {"result": "not_found"}, 404
scoring_strategy = challenge.scoring_strategy or 'highest_rating'
# Build query based on scoring strategy
q = db.session.query(ChallengeAttempt).filter(
ChallengeAttempt.challenge_id == str(challenge_id),
ChallengeAttempt.succeeded.is_(True)
)
# Apply sorting based on strategy
if scoring_strategy == 'first':
# Earliest successful attempt wins
q = q.order_by(ChallengeAttempt.created_at.asc())
elif scoring_strategy == 'fastest':
# Lowest elapsed_ms wins
q = q.order_by(ChallengeAttempt.elapsed_ms.asc().nullslast(), ChallengeAttempt.created_at.asc())
elif scoring_strategy == 'fewest_tokens':
# Lowest tokens_total wins
q = q.order_by(ChallengeAttempt.tokens_total.asc().nullslast(), ChallengeAttempt.created_at.asc())
elif scoring_strategy == 'highest_rating':
# Highest judge_rating wins, ties broken by earliest
q = q.order_by(ChallengeAttempt.judge_rating.desc().nullslast(), ChallengeAttempt.created_at.asc())
elif scoring_strategy == 'custom':
# Custom score field (computed by plugin)
q = q.order_by(ChallengeAttempt.score.desc().nullslast(), ChallengeAttempt.created_at.asc())
else:
# Default to highest_rating
q = q.order_by(ChallengeAttempt.judge_rating.desc().nullslast(), ChallengeAttempt.created_at.asc())
rows = q.limit(limit).all()
data = [
{
"attempt_id": r.id,
"account_id": r.account_id,
"end_user_id": r.end_user_id,
"score": r.score,
"judge_rating": r.judge_rating,
"tokens_total": r.tokens_total,
"elapsed_ms": r.elapsed_ms,
"created_at": r.created_at.isoformat() if hasattr(r.created_at, "isoformat") else None,
}
for r in rows
]
return {"result": "success", "data": data}

View file

@ -0,0 +1,85 @@
from __future__ import annotations
from flask import request
from flask_restx import Resource
from controllers.web import web_ns
from extensions.ext_database import db
from models.red_blue import RedBlueChallenge, TeamPairing
from services.red_blue_service import RedBlueService
@web_ns.route("/red-blue-challenges")
class RedBlueListApi(Resource):
def get(self):
q = db.session.query(RedBlueChallenge).filter(RedBlueChallenge.is_active.is_(True))
items = [
{
"id": c.id,
"name": c.name,
"description": c.description,
}
for c in q.all()
]
return {"result": "success", "data": items}
@web_ns.route("/red-blue-challenges/<uuid:challenge_id>")
class RedBlueDetailApi(Resource):
def get(self, challenge_id):
c = db.session.get(RedBlueChallenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
data = {
"id": c.id,
"name": c.name,
"description": c.description,
}
return {"result": "success", "data": data}
@web_ns.route("/red-blue-challenges/<uuid:challenge_id>/submit")
class RedBlueSubmitApi(Resource):
def post(self, challenge_id):
payload = request.get_json(force=True) or {}
team = payload.get("team")
prompt = payload.get("prompt")
if team not in ("red", "blue") or not prompt:
return {"result": "bad_request"}, 400
c = db.session.get(RedBlueChallenge, str(challenge_id))
if not c:
return {"result": "not_found"}, 404
sub = RedBlueService.submit_prompt(
challenge_id=str(challenge_id),
tenant_id=c.tenant_id,
team=team,
prompt=prompt,
account_id=None,
end_user_id=None,
)
return {"result": "success", "data": {"id": sub.id}}, 201
@web_ns.route("/red-blue-challenges/<uuid:challenge_id>/leaderboard")
class RedBlueLeaderboardApi(Resource):
def get(self, challenge_id):
# aggregate simple totals
red = (
db.session.query(db.func.coalesce(db.func.sum(TeamPairing.red_points), 0.0))
.filter(TeamPairing.red_blue_challenge_id == str(challenge_id))
.scalar()
)
blue = (
db.session.query(db.func.coalesce(db.func.sum(TeamPairing.blue_points), 0.0))
.filter(TeamPairing.red_blue_challenge_id == str(challenge_id))
.scalar()
)
total = (red or 0.0) + (blue or 0.0)
data = {
"red_points": float(red or 0.0),
"blue_points": float(blue or 0.0),
"red_ratio": (float(red or 0.0) / total) if total else 0.0,
"blue_ratio": (float(blue or 0.0) / total) if total else 0.0,
}
return {"result": "success", "data": data}

View file

@ -0,0 +1,30 @@
from __future__ import annotations
from flask import request
from flask_restx import Resource
from controllers.web import web_ns
from extensions.ext_database import db
from services.account_service import RegisterService
@web_ns.route('/register')
class WebRegisterApi(Resource):
def post(self):
payload = request.get_json(force=True) or {}
email = payload.get('email')
name = payload.get('name') or 'Player'
password = payload.get('password')
if not email or not password:
return { 'result': 'bad_request' }, 400
account = RegisterService.register(
email=email,
name=name,
password=password,
is_setup=False,
create_workspace_required=False,
)
db.session.commit()
return { 'result': 'success', 'data': { 'account_id': account.id } }, 201

View file

@ -58,6 +58,9 @@ class NodeType(StrEnum):
DOCUMENT_EXTRACTOR = "document-extractor"
LIST_OPERATOR = "list-operator"
AGENT = "agent"
CHALLENGE_EVALUATOR = "challenge-evaluator"
JUDGING_LLM = "judging-llm"
TEAM_CHALLENGE = "team-challenge"
class NodeExecutionType(StrEnum):

View file

@ -0,0 +1,3 @@
from .node import ChallengeEvaluatorNode
__all__ = ['ChallengeEvaluatorNode']

View file

@ -0,0 +1,258 @@
# pyright: reportImplicitRelativeImport=none
from __future__ import annotations
import logging
import time
from collections.abc import Mapping
from typing import Any
from core.variables.segments import Segment
from core.workflow.enums import ErrorStrategy, NodeExecutionType, NodeType, WorkflowNodeExecutionStatus
from core.workflow.node_events import NodeRunResult
from core.workflow.nodes.base.entities import BaseNodeData, RetryConfig
from core.workflow.nodes.base.node import Node
from extensions.ext_database import db
from models.challenge import Challenge
from services.challenge_scorer_service import ChallengeScorerService
from services.challenge_service import ChallengeService
logger = logging.getLogger(__name__)
class ChallengeEvaluatorNode(Node):
node_type = NodeType.CHALLENGE_EVALUATOR
execution_type = NodeExecutionType.EXECUTABLE
_node_data: BaseNodeData
def init_node_data(self, data: Mapping[str, Any]):
# Using BaseNodeData to carry title/desc; node data is accessed directly
self._node_data = BaseNodeData.model_validate(data)
self._config: dict[str, Any] = data
def _get_error_strategy(self) -> ErrorStrategy | None:
return getattr(self._node_data, 'error_strategy', None)
def _get_retry_config(self) -> RetryConfig:
return getattr(self._node_data, 'retry_config', RetryConfig())
def _get_title(self) -> str:
return getattr(self._node_data, 'title', 'Challenge Evaluator')
def _get_description(self) -> str | None:
return getattr(self._node_data, 'desc', None)
def _get_default_value_dict(self) -> dict[str, Any]:
return getattr(self._node_data, 'default_value_dict', {})
def get_base_node_data(self) -> BaseNodeData:
return self._node_data
@classmethod
def version(cls) -> str:
return "1"
def _run(self) -> NodeRunResult:
# Resolve response text from selector in config.inputs.response (frontend schema)
output_text = ''
source_selector = None
inputs_cfg = self._config.get('inputs') or {}
if isinstance(inputs_cfg, dict):
source_selector = inputs_cfg.get('response')
# fallback to older key if any
source_selector = source_selector or self._config.get('value_selector')
# Check evaluation mode from config
evaluation_mode = self._config.get('evaluation_mode', 'rules')
logger.info("ChallengeEvaluator - evaluation_mode: %s, source_selector: %s", evaluation_mode, source_selector)
# Initialize judge variables
is_judge_input = False
judge_passed = False
judge_rating = 0
judge_feedback_from_input = ''
output_text = ''
def _segment_to_value(segment: Segment | None) -> Any:
if segment is None:
return None
if hasattr(segment, "to_object"):
try:
return segment.to_object()
except Exception: # pragma: no cover - defensive
pass
return getattr(segment, "value", segment)
# If evaluation_mode is 'llm-judge', try to read from upstream Judging LLM node
if evaluation_mode == 'llm-judge' and source_selector and len(source_selector) >= 1:
try:
node_id = source_selector[0]
# Retrieve judge outputs as Segments and convert to primitive values
passed_segment = self.graph_runtime_state.variable_pool.get([node_id, 'judge_passed'])
rating_segment = self.graph_runtime_state.variable_pool.get([node_id, 'judge_rating'])
feedback_segment = self.graph_runtime_state.variable_pool.get([node_id, 'judge_feedback'])
potential_judge_passed = _segment_to_value(passed_segment)
potential_judge_rating = _segment_to_value(rating_segment)
potential_judge_feedback = _segment_to_value(feedback_segment)
logger.info(
"ChallengeEvaluator - Reading judge outputs: passed=%s, rating=%s, feedback=%s",
potential_judge_passed,
potential_judge_rating,
potential_judge_feedback,
)
# If judge_passed exists, we successfully read from a Judging LLM node
if potential_judge_passed is not None:
is_judge_input = True
judge_passed = bool(potential_judge_passed)
judge_rating = int(potential_judge_rating or 0)
judge_feedback_from_input = str(potential_judge_feedback or '')
logger.info(
"ChallengeEvaluator - Judge input successfully read! passed=%s, rating=%s, feedback=%s",
judge_passed,
judge_rating,
judge_feedback_from_input,
)
except Exception as e:
logger.error("ChallengeEvaluator - Error reading judge outputs: %s", e, exc_info=True)
is_judge_input = False
# If not using judge input, get text output for rules-based evaluation
if not is_judge_input and source_selector:
try:
segment = self.graph_runtime_state.variable_pool.get(source_selector)
if segment is None:
output_text = ''
elif hasattr(segment, 'text'):
output_text = segment.text
else:
output_text = str(_segment_to_value(segment) or '')
except Exception:
output_text = ''
# Evaluate based on mode
if is_judge_input:
ok = judge_passed
details = {
'mode': 'llm-judge',
'rating': judge_rating,
'feedback': judge_feedback_from_input,
}
else:
# Rules-based evaluation (only if not using judge input)
ok, details = ChallengeService.evaluate_outcome(output_text, self._config)
# optional persistence if config carries challenge_id
challenge_id = self._config.get('challenge_id')
if challenge_id:
try:
# Calculate elapsed time in milliseconds
elapsed_ms = int((time.time() - self.graph_runtime_state.start_at) * 1000)
# Get total tokens used in the workflow so far
tokens_total = self.graph_runtime_state.total_tokens
# Extract judge_rating from details if available (for highest_rating strategy)
judge_rating = None
judge_feedback = None
if isinstance(details, dict):
judge_rating = details.get('rating')
judge_feedback = details.get('feedback')
# Load challenge to check scoring strategy
challenge = db.session.get(Challenge, str(challenge_id))
# Score field is reserved for custom scoring plugins.
# For built-in strategies (first, fastest, fewest_tokens, highest_rating),
# the leaderboard sorts by specific columns (created_at, elapsed_ms, tokens_total, judge_rating).
score = None
# If custom scoring is configured, compute score using plugin
if challenge and challenge.scoring_strategy == 'custom':
try:
metrics = {
'succeeded': ok,
'tokens_total': tokens_total,
'elapsed_ms': elapsed_ms,
'rating': judge_rating,
'created_at': int(time.time() * 1000),
}
ctx = {
'tenant_id': self.tenant_id,
'app_id': self.app_id,
'workflow_id': self.workflow_id,
'challenge_id': str(challenge_id),
'end_user_id': None,
'timeout_ms': 5000,
}
result = ChallengeScorerService.score_with_plugin(
scorer_plugin_id=challenge.scoring_plugin_id,
scorer_entrypoint=challenge.scoring_entrypoint,
metrics=metrics,
config=challenge.scoring_config or {},
ctx=ctx,
)
score = result.get('score')
logger.info(
"Custom scorer computed score: %s (details: %s)",
score,
result.get('details'),
)
except Exception as e:
logger.error("Custom scorer failed: %s", e, exc_info=True)
# Continue with score=None on error
ChallengeService.record_attempt(
tenant_id=self.tenant_id,
challenge_id=challenge_id,
end_user_id=None,
account_id=None,
workflow_run_id=None,
succeeded=ok,
score=score,
judge_rating=judge_rating,
judge_feedback=judge_feedback,
tokens_total=tokens_total,
elapsed_ms=elapsed_ms,
session=db.session,
)
except Exception:
# do not crash the workflow if recording fails
pass
# Always provide all output variables to match frontend getOutputVars
outputs: dict[str, Any] = {
'challenge_succeeded': ok,
'judge_rating': 0,
'judge_feedback': '',
'message': '',
}
# Override with actual values if evaluator provides them
if isinstance(details, dict):
logger.debug("ChallengeEvaluator - details: %s", details)
if 'rating' in details:
outputs['judge_rating'] = details.get('rating')
if 'feedback' in details:
outputs['judge_feedback'] = details.get('feedback')
if 'message' in details:
outputs['message'] = details.get('message')
# If no explicit message, create one from evaluation details
if not outputs['message']:
if ok:
outputs['message'] = f"Success: {details.get('mode', 'evaluation')} matched"
else:
outputs['message'] = f"Failed: {details.get('mode', 'evaluation')} did not match"
return NodeRunResult(
status=WorkflowNodeExecutionStatus.SUCCEEDED,
outputs=outputs,
)

View file

@ -0,0 +1,188 @@
from __future__ import annotations
import json
import re
from collections.abc import Mapping
from typing import Any
from core.model_manager import ModelManager
from core.model_runtime.entities.llm_entities import LLMResult
from core.model_runtime.entities.message_entities import (
PromptMessageContentType,
SystemPromptMessage,
UserPromptMessage,
)
from core.model_runtime.entities.model_entities import ModelType
from core.workflow.enums import ErrorStrategy, NodeExecutionType, NodeType, WorkflowNodeExecutionStatus
from core.workflow.node_events import NodeRunResult
from core.workflow.nodes.base.entities import BaseNodeData, RetryConfig
from core.workflow.nodes.base.node import Node
from services.challenge_service import ChallengeService
class JudgingLLMNode(Node):
node_type = NodeType.JUDGING_LLM
execution_type = NodeExecutionType.EXECUTABLE
_node_data: BaseNodeData
def init_node_data(self, data: Mapping[str, Any]):
self._node_data = BaseNodeData.model_validate(data)
# Access data directly from node_data, not from a 'config' key
self._config: dict[str, Any] = data
def _get_error_strategy(self) -> ErrorStrategy | None:
return getattr(self._node_data, 'error_strategy', None)
def _get_retry_config(self) -> RetryConfig:
return getattr(self._node_data, 'retry_config', RetryConfig())
def _get_title(self) -> str:
return getattr(self._node_data, 'title', 'Judging LLM')
def _get_description(self) -> str | None:
return getattr(self._node_data, 'desc', None)
def _get_default_value_dict(self) -> dict[str, Any]:
return getattr(self._node_data, 'default_value_dict', {})
def get_base_node_data(self) -> BaseNodeData:
return self._node_data
@classmethod
def version(cls) -> str:
return "1"
def _run(self) -> NodeRunResult:
# Placeholder with FE-compatible keys. Extract inputs for future wiring.
inputs_cfg = self._config.get('inputs') or {}
goal_selector = None
response_selector = None
if isinstance(inputs_cfg, dict):
goal_selector = inputs_cfg.get('goal')
response_selector = inputs_cfg.get('response')
# Attempt to read variables (not used in placeholder decision)
_ = None
try:
if goal_selector:
_ = self.graph_runtime_state.variable_pool.get(goal_selector)
if response_selector:
_ = self.graph_runtime_state.variable_pool.get(response_selector)
except Exception:
pass
outputs = {
'judge_passed': False,
'judge_rating': 0,
'judge_feedback': '',
}
# If model config and rubric provided, invoke LLM synchronously to judge
judge_model = self._config.get('judge_model') or {}
rubric = self._config.get('rubric_prompt_template') or ''
provider = (judge_model or {}).get('provider')
model_name = (judge_model or {}).get('name')
completion_params = (judge_model or {}).get('completion_params') or {}
def _segment_to_text(seg: Any) -> str:
try:
# Many variable types expose .text
if hasattr(seg, 'text'):
return str(seg.text)
if isinstance(seg, (dict, list)):
return json.dumps(seg, ensure_ascii=False)
return str(seg)
except Exception:
return ''
# Debug: log what we're checking
import logging
logger = logging.getLogger(__name__)
logger.info(
"JudgingLLM check - provider: %s, model: %s, rubric_len: %s, response_selector: %s",
provider,
model_name,
len(rubric) if rubric else 0,
response_selector,
)
if provider and model_name and rubric and response_selector:
logger.info("JudgingLLM: All conditions met, invoking LLM...")
try:
goal_val = self.graph_runtime_state.variable_pool.get(goal_selector) if goal_selector else None
response_val = self.graph_runtime_state.variable_pool.get(response_selector)
goal_text = _segment_to_text(goal_val)
response_text = _segment_to_text(response_val)
json_template = '{"passed": boolean, "rating": number (0-10), "feedback": string}'
prompt_body = (
f"Goal:\n{goal_text}\n\n"
f"Response:\n{response_text}\n\n"
f"Return JSON with rating 0-10: {json_template}"
)
prompt_messages = [
SystemPromptMessage(content=rubric),
UserPromptMessage(content=prompt_body),
]
model_instance = ModelManager().get_model_instance(
tenant_id=self.tenant_id,
model_type=ModelType.LLM,
provider=provider,
model=model_name,
)
result: LLMResult = model_instance.invoke_llm(
prompt_messages=prompt_messages,
model_parameters=completion_params,
stop=[],
stream=False,
user=self.user_id,
) # type: ignore
# Extract text from result
text_out = ''
content = getattr(result.message, 'content', '')
if isinstance(content, str):
text_out = content
elif isinstance(content, list):
for item in content:
if getattr(item, 'type', None) == PromptMessageContentType.TEXT:
text_out += str(getattr(item, 'data', ''))
else:
text_out = str(content)
# Parse last JSON object in output
verdict: dict[str, Any] | None = None
try:
matches = re.findall(r"\{[\s\S]*\}", text_out)
if matches:
verdict = json.loads(matches[-1])
except Exception:
verdict = None
if isinstance(verdict, dict):
outputs['judge_passed'] = bool(verdict.get('passed'))
outputs['judge_rating'] = int(verdict.get('rating') or 0)
outputs['judge_feedback'] = str(verdict.get('feedback') or '')
outputs['judge_raw'] = json.dumps(verdict)
else:
# Fallback to simple rules if configured
success_type = self._config.get('success_type')
success_pattern = self._config.get('success_pattern')
if success_type and success_pattern:
ok, _ = ChallengeService.evaluate_outcome(response_text, {
'success_type': success_type,
'success_pattern': success_pattern,
})
outputs['judge_passed'] = ok
outputs['judge_rating'] = 10 if ok else 0
outputs['judge_feedback'] = 'passed by rules' if ok else 'failed by rules'
except Exception as e:
# keep default outputs on error
logger.error("JudgingLLM error: %s", e, exc_info=True)
pass
else:
logger.warning("JudgingLLM skipped - missing required fields")
return NodeRunResult(status=WorkflowNodeExecutionStatus.SUCCEEDED, outputs=outputs)

View file

@ -24,6 +24,9 @@ from core.workflow.nodes.tool import ToolNode
from core.workflow.nodes.variable_aggregator import VariableAggregatorNode
from core.workflow.nodes.variable_assigner.v1 import VariableAssignerNode as VariableAssignerNodeV1
from core.workflow.nodes.variable_assigner.v2 import VariableAssignerNode as VariableAssignerNodeV2
from core.workflow.nodes.challenge_evaluator.node import ChallengeEvaluatorNode
from core.workflow.nodes.judging_llm.node import JudgingLLMNode
from core.workflow.nodes.team_challenge.node import TeamChallengeNode
LATEST_VERSION = "latest"
@ -142,4 +145,16 @@ NODE_TYPE_CLASSES_MAPPING: Mapping[NodeType, Mapping[str, type[Node]]] = {
LATEST_VERSION: KnowledgeIndexNode,
"1": KnowledgeIndexNode,
},
NodeType.CHALLENGE_EVALUATOR: {
LATEST_VERSION: ChallengeEvaluatorNode,
"1": ChallengeEvaluatorNode,
},
NodeType.JUDGING_LLM: {
LATEST_VERSION: JudgingLLMNode,
"1": JudgingLLMNode,
},
NodeType.TEAM_CHALLENGE: {
LATEST_VERSION: TeamChallengeNode,
"1": TeamChallengeNode,
},
}

View file

@ -0,0 +1,68 @@
from __future__ import annotations
from collections.abc import Mapping
from typing import Any
from core.workflow.enums import ErrorStrategy, NodeExecutionType, NodeType, WorkflowNodeExecutionStatus
from core.workflow.node_events import NodeRunResult
from core.workflow.nodes.base.entities import BaseNodeData, RetryConfig
from core.workflow.nodes.base.node import Node
class TeamChallengeNode(Node):
node_type = NodeType.TEAM_CHALLENGE
execution_type = NodeExecutionType.EXECUTABLE
_node_data: BaseNodeData
def init_node_data(self, data: Mapping[str, Any]):
self._node_data = BaseNodeData.model_validate(data)
self._config: dict[str, Any] = data
def _get_error_strategy(self) -> ErrorStrategy | None:
return getattr(self._node_data, 'error_strategy', None)
def _get_retry_config(self) -> RetryConfig:
return getattr(self._node_data, 'retry_config', RetryConfig())
def _get_title(self) -> str:
return getattr(self._node_data, 'title', 'Team Challenge')
def _get_description(self) -> str | None:
return getattr(self._node_data, 'desc', None)
def _get_default_value_dict(self) -> dict[str, Any]:
return getattr(self._node_data, 'default_value_dict', {})
def get_base_node_data(self) -> BaseNodeData:
return self._node_data
@classmethod
def version(cls) -> str:
return "1"
def _run(self) -> NodeRunResult:
# Read inputs.team_choice for consistency with FE
inputs_cfg = self._config.get('inputs') or {}
team_choice = ''
if isinstance(inputs_cfg, dict):
team_choice_selector = inputs_cfg.get('team_choice')
if team_choice_selector:
try:
v = self.graph_runtime_state.variable_pool.get_value_by_selector(team_choice_selector)
team_choice = str(v or '')
except Exception:
team_choice = ''
outputs = {
'team': team_choice,
'judge_passed': False,
'judge_rating': 0,
'judge_feedback': '',
'categories': {},
'team_points': 0.0,
'total_points': 0.0,
}
return NodeRunResult(status=WorkflowNodeExecutionStatus.SUCCEEDED, outputs=outputs)

View file

@ -0,0 +1,170 @@
"""add challenge & red/blue tables
Revision ID: 183e2d30fb4e
Revises: 68519ad5cd18
Create Date: 2025-09-30 08:22:31.223257
"""
from alembic import op
import models as models
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '183e2d30fb4e'
down_revision = '68519ad5cd18'
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('challenge_attempts',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('challenge_id', models.types.StringUUID(), nullable=False),
sa.Column('end_user_id', models.types.StringUUID(), nullable=True),
sa.Column('account_id', models.types.StringUUID(), nullable=True),
sa.Column('workflow_run_id', models.types.StringUUID(), nullable=True),
sa.Column('succeeded', sa.Boolean(), server_default=sa.text('false'), nullable=False),
sa.Column('score', sa.Float(), nullable=True),
sa.Column('judge_rating', sa.Integer(), nullable=True),
sa.Column('judge_feedback', sa.Text(), nullable=True),
sa.Column('judge_output_raw', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.Column('tokens_total', sa.Integer(), nullable=True),
sa.Column('elapsed_ms', sa.Integer(), nullable=True),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='challenge_attempts_pkey')
)
with op.batch_alter_table('challenge_attempts', schema=None) as batch_op:
batch_op.create_index('challenge_attempts_challenge_id_idx', ['challenge_id'], unique=False)
batch_op.create_index('challenge_attempts_tenant_id_idx', ['tenant_id'], unique=False)
op.create_table('challenges',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('app_id', models.types.StringUUID(), nullable=False),
sa.Column('workflow_id', models.types.StringUUID(), nullable=True),
sa.Column('name', sa.Text(), nullable=False),
sa.Column('description', sa.Text(), nullable=True),
sa.Column('goal', sa.Text(), nullable=True),
sa.Column('success_type', sa.String(length=64), server_default=sa.text("'regex'"), nullable=False),
sa.Column('success_pattern', sa.Text(), nullable=True),
sa.Column('secret_ref', sa.Text(), nullable=True),
sa.Column('evaluator_type', sa.String(length=32), server_default=sa.text("'rules'"), nullable=False),
sa.Column('evaluator_plugin_id', sa.Text(), nullable=True),
sa.Column('evaluator_entrypoint', sa.Text(), nullable=True),
sa.Column('evaluator_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.Column('scoring_strategy', sa.String(length=64), server_default=sa.text("'first'"), nullable=False),
sa.Column('scoring_plugin_id', sa.Text(), nullable=True),
sa.Column('scoring_entrypoint', sa.Text(), nullable=True),
sa.Column('scoring_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.Column('is_active', sa.Boolean(), server_default=sa.text('true'), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=True),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='challenges_pkey')
)
with op.batch_alter_table('challenges', schema=None) as batch_op:
batch_op.create_index('challenges_app_id_idx', ['app_id'], unique=False)
batch_op.create_index('challenges_tenant_id_idx', ['tenant_id'], unique=False)
op.create_table('red_blue_challenges',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('app_id', models.types.StringUUID(), nullable=False),
sa.Column('workflow_id', models.types.StringUUID(), nullable=True),
sa.Column('name', sa.Text(), nullable=False),
sa.Column('description', sa.Text(), nullable=True),
sa.Column('judge_suite', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
sa.Column('defense_selection_policy', sa.String(length=64), server_default=sa.text("'latest_best'"), nullable=False),
sa.Column('attack_selection_policy', sa.String(length=64), server_default=sa.text("'latest_best'"), nullable=False),
sa.Column('scoring_strategy', sa.String(length=64), server_default=sa.text("'red_blue_ratio'"), nullable=False),
sa.Column('theme', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.Column('instructions_md', sa.Text(), nullable=True),
sa.Column('is_active', sa.Boolean(), server_default=sa.text('true'), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=True),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='red_blue_challenges_pkey')
)
with op.batch_alter_table('red_blue_challenges', schema=None) as batch_op:
batch_op.create_index('red_blue_challenges_app_id_idx', ['app_id'], unique=False)
batch_op.create_index('red_blue_challenges_tenant_id_idx', ['tenant_id'], unique=False)
op.create_table('team_pairings',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('red_blue_challenge_id', models.types.StringUUID(), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('attack_submission_id', models.types.StringUUID(), nullable=True),
sa.Column('defense_submission_id', models.types.StringUUID(), nullable=True),
sa.Column('judge_output_raw', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.Column('categories', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
sa.Column('judge_rating', sa.Integer(), nullable=True),
sa.Column('judge_feedback', sa.Text(), nullable=True),
sa.Column('red_points', sa.Float(), server_default=sa.text('0'), nullable=False),
sa.Column('blue_points', sa.Float(), server_default=sa.text('0'), nullable=False),
sa.Column('tokens_total', sa.Integer(), nullable=True),
sa.Column('elapsed_ms', sa.Integer(), nullable=True),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='team_pairings_pkey')
)
with op.batch_alter_table('team_pairings', schema=None) as batch_op:
batch_op.create_index('team_pairings_challenge_id_idx', ['red_blue_challenge_id'], unique=False)
batch_op.create_index('team_pairings_tenant_id_idx', ['tenant_id'], unique=False)
op.create_table('team_submissions',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('red_blue_challenge_id', models.types.StringUUID(), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('account_id', models.types.StringUUID(), nullable=True),
sa.Column('end_user_id', models.types.StringUUID(), nullable=True),
sa.Column('team', sa.String(length=16), nullable=False),
sa.Column('prompt', sa.Text(), nullable=False),
sa.Column('active', sa.Boolean(), server_default=sa.text('true'), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
sa.PrimaryKeyConstraint('id', name='team_submissions_pkey')
)
with op.batch_alter_table('team_submissions', schema=None) as batch_op:
batch_op.create_index('team_submissions_challenge_id_idx', ['red_blue_challenge_id'], unique=False)
batch_op.create_index('team_submissions_tenant_id_idx', ['tenant_id'], unique=False)
with op.batch_alter_table('providers', schema=None) as batch_op:
batch_op.drop_column('credential_status')
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('providers', schema=None) as batch_op:
batch_op.add_column(sa.Column('credential_status', sa.VARCHAR(length=20), server_default=sa.text("'active'::character varying"), autoincrement=False, nullable=True))
with op.batch_alter_table('team_submissions', schema=None) as batch_op:
batch_op.drop_index('team_submissions_tenant_id_idx')
batch_op.drop_index('team_submissions_challenge_id_idx')
op.drop_table('team_submissions')
with op.batch_alter_table('team_pairings', schema=None) as batch_op:
batch_op.drop_index('team_pairings_tenant_id_idx')
batch_op.drop_index('team_pairings_challenge_id_idx')
op.drop_table('team_pairings')
with op.batch_alter_table('red_blue_challenges', schema=None) as batch_op:
batch_op.drop_index('red_blue_challenges_tenant_id_idx')
batch_op.drop_index('red_blue_challenges_app_id_idx')
op.drop_table('red_blue_challenges')
with op.batch_alter_table('challenges', schema=None) as batch_op:
batch_op.drop_index('challenges_tenant_id_idx')
batch_op.drop_index('challenges_app_id_idx')
op.drop_table('challenges')
with op.batch_alter_table('challenge_attempts', schema=None) as batch_op:
batch_op.drop_index('challenge_attempts_tenant_id_idx')
batch_op.drop_index('challenge_attempts_challenge_id_idx')
op.drop_table('challenge_attempts')
# ### end Alembic commands ###

View file

@ -91,6 +91,8 @@ from .workflow import (
WorkflowRun,
WorkflowType,
)
from .challenge import Challenge, ChallengeAttempt
from .red_blue import RedBlueChallenge, TeamSubmission, TeamPairing
__all__ = [
"APIBasedExtension",
@ -181,4 +183,9 @@ __all__ = [
"WorkflowRunTriggeredFrom",
"WorkflowToolProvider",
"WorkflowType",
"Challenge",
"ChallengeAttempt",
"RedBlueChallenge",
"TeamSubmission",
"TeamPairing",
]

91
api/models/challenge.py Normal file
View file

@ -0,0 +1,91 @@
from __future__ import annotations
from datetime import datetime
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column
from .base import Base
from .types import StringUUID
class Challenge(Base):
__tablename__ = "challenges"
__table_args__ = (
sa.PrimaryKeyConstraint("id", name="challenges_pkey"),
sa.Index("challenges_tenant_id_idx", "tenant_id"),
sa.Index("challenges_app_id_idx", "app_id"),
)
id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
app_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
workflow_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
name: Mapped[str] = mapped_column(sa.Text, nullable=False)
description: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
goal: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
success_type: Mapped[str] = mapped_column(sa.String(64), nullable=False, server_default=sa.text("'regex'"))
success_pattern: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
secret_ref: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
evaluator_type: Mapped[str] = mapped_column(sa.String(32), nullable=False, server_default=sa.text("'rules'"))
evaluator_plugin_id: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
evaluator_entrypoint: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
evaluator_config = mapped_column(JSONB, nullable=True)
scoring_strategy: Mapped[str] = mapped_column(sa.String(64), nullable=False, server_default=sa.text("'first'"))
scoring_plugin_id: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
scoring_entrypoint: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
scoring_config = mapped_column(JSONB, nullable=True)
is_active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
created_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
created_at: Mapped[datetime] = mapped_column(
sa.DateTime,
nullable=False,
server_default=sa.func.current_timestamp(),
)
updated_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
updated_at: Mapped[datetime] = mapped_column(
sa.DateTime,
nullable=False,
server_default=sa.func.current_timestamp(),
)
class ChallengeAttempt(Base):
__tablename__ = "challenge_attempts"
__table_args__ = (
sa.PrimaryKeyConstraint("id", name="challenge_attempts_pkey"),
sa.Index("challenge_attempts_tenant_id_idx", "tenant_id"),
sa.Index("challenge_attempts_challenge_id_idx", "challenge_id"),
)
id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
challenge_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
end_user_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
account_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
workflow_run_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
succeeded: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
score: Mapped[float | None] = mapped_column(sa.Float, nullable=True)
judge_rating: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
judge_feedback: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
judge_output_raw = mapped_column(JSONB, nullable=True)
tokens_total: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
elapsed_ms: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
created_at: Mapped[datetime] = mapped_column(
sa.DateTime,
nullable=False,
server_default=sa.func.current_timestamp(),
)

114
api/models/red_blue.py Normal file
View file

@ -0,0 +1,114 @@
from __future__ import annotations
from datetime import datetime
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column
from .base import Base
from .types import StringUUID
class RedBlueChallenge(Base):
__tablename__ = "red_blue_challenges"
__table_args__ = (
sa.PrimaryKeyConstraint("id", name="red_blue_challenges_pkey"),
sa.Index("red_blue_challenges_tenant_id_idx", "tenant_id"),
sa.Index("red_blue_challenges_app_id_idx", "app_id"),
)
id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
app_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
workflow_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
name: Mapped[str] = mapped_column(sa.Text, nullable=False)
description: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
judge_suite = mapped_column(JSONB, nullable=False)
defense_selection_policy: Mapped[str] = mapped_column(
sa.String(64), nullable=False, server_default=sa.text("'latest_best'")
)
attack_selection_policy: Mapped[str] = mapped_column(
sa.String(64), nullable=False, server_default=sa.text("'latest_best'")
)
scoring_strategy: Mapped[str] = mapped_column(
sa.String(64), nullable=False, server_default=sa.text("'red_blue_ratio'")
)
theme = mapped_column(JSONB, nullable=True)
instructions_md: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
is_active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
created_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
created_at: Mapped[datetime] = mapped_column(
sa.DateTime,
nullable=False,
server_default=sa.func.current_timestamp(),
)
updated_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
updated_at: Mapped[datetime] = mapped_column(
sa.DateTime,
nullable=False,
server_default=sa.func.current_timestamp(),
)
class TeamSubmission(Base):
__tablename__ = "team_submissions"
__table_args__ = (
sa.PrimaryKeyConstraint("id", name="team_submissions_pkey"),
sa.Index("team_submissions_challenge_id_idx", "red_blue_challenge_id"),
sa.Index("team_submissions_tenant_id_idx", "tenant_id"),
)
id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
red_blue_challenge_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
account_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
end_user_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
team: Mapped[str] = mapped_column(sa.String(16), nullable=False) # 'red' | 'blue'
prompt: Mapped[str] = mapped_column(sa.Text, nullable=False)
active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
created_at: Mapped[datetime] = mapped_column(
sa.DateTime,
nullable=False,
server_default=sa.func.current_timestamp(),
)
class TeamPairing(Base):
__tablename__ = "team_pairings"
__table_args__ = (
sa.PrimaryKeyConstraint("id", name="team_pairings_pkey"),
sa.Index("team_pairings_challenge_id_idx", "red_blue_challenge_id"),
sa.Index("team_pairings_tenant_id_idx", "tenant_id"),
)
id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
red_blue_challenge_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
attack_submission_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
defense_submission_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
judge_output_raw = mapped_column(JSONB, nullable=True)
categories = mapped_column(JSONB, nullable=True)
judge_rating: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
judge_feedback: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
red_points: Mapped[float] = mapped_column(sa.Float, nullable=False, server_default=sa.text("0"))
blue_points: Mapped[float] = mapped_column(sa.Float, nullable=False, server_default=sa.text("0"))
tokens_total: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
elapsed_ms: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
created_at: Mapped[datetime] = mapped_column(
sa.DateTime,
nullable=False,
server_default=sa.func.current_timestamp(),
)

View file

@ -223,3 +223,6 @@ vdb = [
"xinference-client~=1.2.2",
"mo-vector~=0.1.13",
]
[tool.pyright]
typeCheckingMode = "basic"
reportImplicitRelativeImport = "none"

View file

@ -8,7 +8,8 @@
"extensions",
"core/app/app_config/easy_ui_based_app/dataset"
],
"typeCheckingMode": "strict",
"typeCheckingMode": "basic",
"reportImplicitRelativeImport": "none",
"allowedUntypedLibraries": [
"flask_restx",
"flask_login",

1
api/run.sh Normal file
View file

@ -0,0 +1 @@
uv run --dev flask --app app run --host 0.0.0.0 --port 5001 --debug

View file

@ -0,0 +1,56 @@
"""
Challenge scorer protocol and type definitions.
Defines the interface for custom scoring plugins that compute
numeric scores from attempt metrics for leaderboard ranking.
"""
from __future__ import annotations
from typing import Any, Protocol, TypedDict
class ScoringContext(TypedDict, total=False):
"""Context provided to scorer plugins."""
tenant_id: str
app_id: str
workflow_id: str
challenge_id: str
end_user_id: str | None
timeout_ms: int
class AttemptMetrics(TypedDict, total=False):
"""Metrics from a challenge attempt."""
succeeded: bool
tokens_total: int | None
elapsed_ms: int | None
rating: int | None
created_at: int | None # epoch ms
class ScoringResult(TypedDict, total=False):
"""Result returned by scorer plugin."""
score: float
details: dict[str, Any] | None
class ScorerProtocol(Protocol):
"""Protocol that all scorer plugins must implement."""
def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult:
"""
Compute a numeric score from attempt metrics.
Args:
metrics: Attempt metrics (tokens, time, rating, etc.)
config: Plugin-specific configuration (from challenge.scoring_config)
ctx: Context with tenant_id, app_id, etc.
Returns:
ScoringResult with computed score and optional details
"""
...

View file

@ -0,0 +1,112 @@
"""
Challenge scorer service.
Loads and invokes custom scorer plugins to compute scores from attempt metrics.
"""
from __future__ import annotations
import logging
from typing import Any
from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult
logger = logging.getLogger(__name__)
class ChallengeScorerService:
"""Service for loading and invoking custom scorer plugins."""
_plugin_cache: dict[str, Any] = {}
@classmethod
def score_with_plugin(
cls,
*,
scorer_plugin_id: str | None,
scorer_entrypoint: str | None,
metrics: AttemptMetrics,
config: dict[str, Any] | None,
ctx: ScoringContext,
) -> ScoringResult:
"""
Compute score using a custom scorer plugin.
Args:
scorer_plugin_id: Plugin identifier (e.g., 'builtin.weighted_scorer')
scorer_entrypoint: Entrypoint path (e.g., 'services.scorers.weighted:WeightedScorer')
metrics: Attempt metrics to score
config: Plugin-specific configuration
ctx: Scoring context
Returns:
ScoringResult with computed score
Raises:
ValueError: If plugin cannot be loaded or scoring fails
"""
if not scorer_plugin_id or not scorer_entrypoint:
raise ValueError("scorer_plugin_id and scorer_entrypoint are required for custom scoring")
# Load plugin
scorer = cls._load_plugin(scorer_plugin_id, scorer_entrypoint)
if not scorer:
raise ValueError(f"Failed to load scorer plugin: {scorer_plugin_id}:{scorer_entrypoint}")
# Invoke scorer with timeout protection
timeout_ms = ctx.get("timeout_ms", 5000)
try:
# TODO: Add timeout enforcement using threading.Timer or signal.alarm
result = scorer.score(metrics, config or {}, ctx)
if not isinstance(result, dict) or "score" not in result:
raise ValueError("Scorer must return a dict with 'score' key")
return result
except Exception as e:
logger.error(f"Scorer plugin {scorer_plugin_id} failed: {e}", exc_info=True)
raise ValueError(f"Scorer plugin execution failed: {e}")
@classmethod
def _load_plugin(cls, plugin_id: str, entrypoint: str) -> Any:
"""
Load a scorer plugin by entrypoint.
Args:
plugin_id: Plugin identifier for caching
entrypoint: Python path like 'pkg.module:ClassName'
Returns:
Scorer instance or None if loading fails
"""
cache_key = f"{plugin_id}:{entrypoint}"
if cache_key in cls._plugin_cache:
return cls._plugin_cache[cache_key]
try:
# Parse entrypoint: 'pkg.module:ClassName'
if ":" not in entrypoint:
raise ValueError(f"Invalid entrypoint format: {entrypoint}. Expected 'module:ClassName'")
module_path, class_name = entrypoint.split(":", 1)
# Dynamic import
import importlib
module = importlib.import_module(module_path)
scorer_class = getattr(module, class_name)
# Instantiate
scorer = scorer_class()
# Cache it
cls._plugin_cache[cache_key] = scorer
logger.info(f"Loaded scorer plugin: {plugin_id} from {entrypoint}")
return scorer
except Exception as e:
logger.error(f"Failed to load scorer plugin {plugin_id}:{entrypoint}: {e}", exc_info=True)
return None
@classmethod
def clear_cache(cls) -> None:
"""Clear the plugin cache (useful for testing)."""
cls._plugin_cache.clear()

View file

@ -0,0 +1,64 @@
from __future__ import annotations
import re
from collections.abc import Mapping
from typing import Any
from sqlalchemy.orm import Session
from extensions.ext_database import db
from models.challenge import Challenge, ChallengeAttempt
class ChallengeService:
@staticmethod
def evaluate_outcome(output_text: str, cfg: Mapping[str, Any]) -> tuple[bool, dict[str, Any]]:
success_type = cfg.get("success_type", "regex")
pattern = cfg.get("success_pattern")
if success_type == "regex" and pattern:
try:
if re.search(pattern, output_text, flags=re.IGNORECASE | re.MULTILINE):
return True, {"mode": "regex", "matched": True}
return False, {"mode": "regex", "matched": False}
except re.error as e:
return False, {"mode": "regex", "error": f"invalid_regex: {e}"}
if success_type == "contains" and pattern:
return (pattern.lower() in output_text.lower()), {"mode": "contains"}
return False, {"mode": success_type, "info": "no_pattern_or_unsupported"}
@staticmethod
def record_attempt(
*,
tenant_id: str,
challenge_id: str,
end_user_id: str | None,
account_id: str | None,
workflow_run_id: str | None,
succeeded: bool,
score: float | None = None,
judge_rating: int | None = None,
judge_feedback: str | None = None,
judge_output_raw: dict[str, Any] | None = None,
tokens_total: int | None = None,
elapsed_ms: int | None = None,
session: Session | None = None,
) -> ChallengeAttempt:
sess = session or db.session
attempt = ChallengeAttempt()
attempt.tenant_id = tenant_id
attempt.challenge_id = challenge_id
attempt.end_user_id = end_user_id
attempt.account_id = account_id
attempt.workflow_run_id = workflow_run_id
attempt.succeeded = succeeded
attempt.score = score
attempt.judge_rating = judge_rating
attempt.judge_feedback = judge_feedback
attempt.judge_output_raw = judge_output_raw
attempt.tokens_total = tokens_total
attempt.elapsed_ms = elapsed_ms
sess.add(attempt)
sess.commit()
return attempt

View file

@ -0,0 +1,91 @@
from __future__ import annotations
from typing import Any
from sqlalchemy.orm import Session
from extensions.ext_database import db
from models.red_blue import RedBlueChallenge, TeamPairing, TeamSubmission
class RedBlueService:
@staticmethod
def submit_prompt(
*,
challenge_id: str,
tenant_id: str,
team: str,
prompt: str,
account_id: str | None,
end_user_id: str | None,
session: Session | None = None,
) -> TeamSubmission:
sess = session or db.session
sub = TeamSubmission()
sub.red_blue_challenge_id = challenge_id
sub.tenant_id = tenant_id
sub.team = team
sub.prompt = prompt
sub.account_id = account_id
sub.end_user_id = end_user_id
sess.add(sub)
sess.commit()
return sub
@staticmethod
def select_counterparty_submission(
*,
challenge: RedBlueChallenge,
team: str,
session: Session | None = None,
) -> TeamSubmission | None:
sess = session or db.session
opposite = "blue" if team == "red" else "red"
# Simplest policy: latest active from opposite team
return (
sess.query(TeamSubmission)
.filter(
TeamSubmission.red_blue_challenge_id == challenge.id,
TeamSubmission.team == opposite,
TeamSubmission.active.is_(True),
)
.order_by(TeamSubmission.created_at.desc())
.first()
)
@staticmethod
def record_pairing(
*,
challenge_id: str,
tenant_id: str,
attack_submission_id: str | None,
defense_submission_id: str | None,
judge_output_raw: dict[str, Any] | None,
categories: dict[str, Any] | None,
judge_rating: int | None,
judge_feedback: str | None,
red_points: float,
blue_points: float,
tokens_total: int | None,
elapsed_ms: int | None,
session: Session | None = None,
) -> TeamPairing:
sess = session or db.session
pairing = TeamPairing()
pairing.red_blue_challenge_id = challenge_id
pairing.tenant_id = tenant_id
pairing.attack_submission_id = attack_submission_id
pairing.defense_submission_id = defense_submission_id
pairing.judge_output_raw = judge_output_raw
pairing.categories = categories
pairing.judge_rating = judge_rating
pairing.judge_feedback = judge_feedback
pairing.red_points = red_points
pairing.blue_points = blue_points
pairing.tokens_total = tokens_total
pairing.elapsed_ms = elapsed_ms
sess.add(pairing)
sess.commit()
return pairing

View file

@ -0,0 +1,144 @@
# Custom Scorer Plugins
This directory contains custom scorer plugins for challenge leaderboards.
## Overview
Scorers compute numeric scores from challenge attempt metrics (tokens, time, rating, success) for ranking on leaderboards when `scoring_strategy = 'custom'`.
## Built-in Scorers
### WeightedScorer
**Entrypoint:** `services.scorers.weighted:WeightedScorer`
Computes a weighted score combining multiple metrics with configurable bonuses and penalties.
**Formula:**
```
score = success_bonus
+ (rating × rating_weight)
- (elapsed_seconds × time_penalty)
- (tokens × token_penalty)
```
**Configuration:**
- `success_bonus` (float, default: 100): Base points for successful attempts
- `rating_weight` (float, default: 10): Multiplier for judge rating (0-10)
- `time_penalty` (float, default: 1.0): Penalty per second elapsed
- `token_penalty` (float, default: 0.01): Penalty per token used
**Example Configuration:**
```json
{
"success_bonus": 100.0,
"rating_weight": 10.0,
"time_penalty": 1.0,
"token_penalty": 0.01
}
```
**Example Challenge Setup (via API):**
```python
{
"name": "Advanced Prompt Challenge",
"scoring_strategy": "custom",
"scoring_plugin_id": "builtin.weighted_scorer",
"scoring_entrypoint": "services.scorers.weighted:WeightedScorer",
"scoring_config": {
"success_bonus": 100.0,
"rating_weight": 15.0,
"time_penalty": 0.5,
"token_penalty": 0.02
}
}
```
## Creating Custom Scorers
### 1. Implement the ScorerProtocol
Create a new file in this directory (e.g., `custom.py`):
```python
from typing import Any
from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult
class MyCustomScorer:
def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult:
# Access metrics
succeeded = metrics.get('succeeded', False)
tokens = metrics.get('tokens_total', 0)
elapsed_ms = metrics.get('elapsed_ms', 0)
rating = metrics.get('rating', 0)
# Access configuration
multiplier = config.get('multiplier', 1.0)
# Compute score
score = (rating * multiplier) if succeeded else 0.0
return {
'score': score,
'details': { # optional
'multiplier_used': multiplier
}
}
```
### 2. Register in Challenge
Set the challenge's scoring fields:
```python
challenge.scoring_strategy = 'custom'
challenge.scoring_plugin_id = 'my_custom_scorer'
challenge.scoring_entrypoint = 'services.scorers.custom:MyCustomScorer'
challenge.scoring_config = {
'multiplier': 2.0
}
```
### 3. Testing
Create tests in `api/tests/unit_tests/services/` following the pattern in `test_challenge_scorer_service.py`.
## Protocol Reference
### Input Types
**AttemptMetrics:**
- `succeeded` (bool): Whether the challenge was passed
- `tokens_total` (int | None): Total tokens used
- `elapsed_ms` (int | None): Time taken in milliseconds
- `rating` (int | None): Judge rating (0-10)
- `created_at` (int | None): Timestamp in epoch milliseconds
**ScoringContext:**
- `tenant_id` (str): Tenant identifier
- `app_id` (str): Application identifier
- `workflow_id` (str): Workflow identifier
- `challenge_id` (str): Challenge identifier
- `end_user_id` (str | None): End user identifier (if available)
- `timeout_ms` (int): Maximum execution time
### Output Type
**ScoringResult:**
- `score` (float, required): Computed numeric score
- `details` (dict[str, Any] | None, optional): Additional scoring details
## Error Handling
- Scorers must return a dict with a `score` key
- Exceptions are caught and logged; the attempt is recorded with `score=None`
- Scorers are executed with a timeout (default: 5s)
- Scorers should never return negative scores; use `max(score, 0.0)` to clamp
## Best Practices
1. **Keep it simple**: Scoring should be fast and deterministic
2. **Validate config**: Check configuration values and provide defaults
3. **Clamp scores**: Ensure scores are non-negative
4. **Document formula**: Clearly explain how your scorer works
5. **Test edge cases**: Test with missing metrics, zeros, nulls

View file

@ -0,0 +1 @@
"""Built-in scorer plugins."""

View file

@ -0,0 +1,66 @@
"""
Weighted scorer plugin.
Computes a weighted score based on success bonus, rating, elapsed time, and token usage.
"""
from __future__ import annotations
from typing import Any
from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult
class WeightedScorer:
"""
Example weighted scorer that combines multiple metrics.
Configuration options:
- success_bonus (float): Base points for successful attempt (default: 100)
- rating_weight (float): Multiplier for judge rating (default: 10)
- time_penalty (float): Penalty per second elapsed (default: 1.0)
- token_penalty (float): Penalty per token used (default: 0.01)
Formula:
score = success_bonus
+ (rating * rating_weight)
- (elapsed_seconds * time_penalty)
- (tokens * token_penalty)
"""
def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult:
"""Compute weighted score from metrics."""
# Base score for success
base = 0.0
if metrics.get("succeeded"):
base += config.get("success_bonus", 100.0)
# Add rating contribution
rating = metrics.get("rating") or 0
rating_weight = config.get("rating_weight", 10.0)
rating_score = rating * rating_weight
# Subtract time penalty
elapsed_ms = metrics.get("elapsed_ms") or 0
elapsed_seconds = elapsed_ms / 1000.0
time_penalty = config.get("time_penalty", 1.0)
time_score = elapsed_seconds * time_penalty
# Subtract token penalty
tokens = metrics.get("tokens_total") or 0
token_penalty = config.get("token_penalty", 0.01)
token_score = tokens * token_penalty
# Compute final score (never negative)
final_score = base + rating_score - time_score - token_score
final_score = max(final_score, 0.0)
return {
"score": final_score,
"details": {
"base": base,
"rating_contribution": rating_score,
"time_penalty": time_score,
"token_penalty": token_score,
},
}

View file

@ -0,0 +1,14 @@
from __future__ import annotations
from flask.testing import FlaskClient
class TestWebChallenges:
def test_list_and_detail(self, test_client_with_containers: FlaskClient):
# list
resp = test_client_with_containers.get("/api/web/challenges")
assert resp.status_code == 200
data = resp.get_json()
assert data["result"] == "success"

View file

@ -0,0 +1,13 @@
from __future__ import annotations
from flask.testing import FlaskClient
class TestWebRedBlueChallenges:
def test_list(self, test_client_with_containers: FlaskClient):
resp = test_client_with_containers.get("/api/web/red-blue-challenges")
assert resp.status_code == 200
data = resp.get_json()
assert data["result"] == "success"

View file

@ -0,0 +1,144 @@
"""Tests for ChallengeScorerService."""
from __future__ import annotations
import pytest
from services.challenge_scorer_service import ChallengeScorerService
class TestChallengeScorerService:
"""Test custom scorer plugin loading and execution."""
def test_weighted_scorer_success(self):
"""Test WeightedScorer with successful attempt."""
metrics = {
"succeeded": True,
"tokens_total": 1000,
"elapsed_ms": 5000, # 5 seconds
"rating": 8,
"created_at": 1730000000000,
}
config = {
"success_bonus": 100.0,
"rating_weight": 10.0,
"time_penalty": 1.0,
"token_penalty": 0.01,
}
ctx = {
"tenant_id": "test-tenant",
"app_id": "test-app",
"workflow_id": "test-workflow",
"challenge_id": "test-challenge",
"timeout_ms": 5000,
}
result = ChallengeScorerService.score_with_plugin(
scorer_plugin_id="builtin.weighted_scorer",
scorer_entrypoint="services.scorers.weighted:WeightedScorer",
metrics=metrics,
config=config,
ctx=ctx,
)
# Expected: 100 (success) + 80 (8*10 rating) - 5 (5s*1.0) - 10 (1000*0.01) = 165
assert result["score"] == 165.0
assert "details" in result
assert result["details"]["base"] == 100.0
assert result["details"]["rating_contribution"] == 80.0
assert result["details"]["time_penalty"] == 5.0
assert result["details"]["token_penalty"] == 10.0
def test_weighted_scorer_failure(self):
"""Test WeightedScorer with failed attempt."""
metrics = {
"succeeded": False,
"tokens_total": 500,
"elapsed_ms": 2000, # 2 seconds
"rating": 3,
"created_at": 1730000000000,
}
config = {
"success_bonus": 100.0,
"rating_weight": 10.0,
"time_penalty": 1.0,
"token_penalty": 0.01,
}
ctx = {
"tenant_id": "test-tenant",
"app_id": "test-app",
"challenge_id": "test-challenge",
"timeout_ms": 5000,
}
result = ChallengeScorerService.score_with_plugin(
scorer_plugin_id="builtin.weighted_scorer",
scorer_entrypoint="services.scorers.weighted:WeightedScorer",
metrics=metrics,
config=config,
ctx=ctx,
)
# Expected: 0 (no success bonus) + 30 (3*10) - 2 (2s*1.0) - 5 (500*0.01) = 23
assert result["score"] == 23.0
def test_weighted_scorer_minimum_zero(self):
"""Test WeightedScorer never returns negative scores."""
metrics = {
"succeeded": False,
"tokens_total": 10000, # High token count
"elapsed_ms": 30000, # 30 seconds
"rating": 1,
"created_at": 1730000000000,
}
config = {
"success_bonus": 100.0,
"rating_weight": 10.0,
"time_penalty": 1.0,
"token_penalty": 0.01,
}
ctx = {
"tenant_id": "test-tenant",
"app_id": "test-app",
"challenge_id": "test-challenge",
"timeout_ms": 5000,
}
result = ChallengeScorerService.score_with_plugin(
scorer_plugin_id="builtin.weighted_scorer",
scorer_entrypoint="services.scorers.weighted:WeightedScorer",
metrics=metrics,
config=config,
ctx=ctx,
)
# Expected: 0 + 10 - 30 - 100 = -120, but clamped to 0
assert result["score"] == 0.0
def test_scorer_with_missing_plugin(self):
"""Test error handling for missing plugin."""
with pytest.raises(ValueError, match="Failed to load scorer plugin"):
ChallengeScorerService.score_with_plugin(
scorer_plugin_id="nonexistent",
scorer_entrypoint="nonexistent.module:NonexistentScorer",
metrics={},
config={},
ctx={"timeout_ms": 5000},
)
def test_scorer_with_invalid_entrypoint(self):
"""Test error handling for invalid entrypoint format."""
with pytest.raises(ValueError, match="scorer_plugin_id and scorer_entrypoint are required"):
ChallengeScorerService.score_with_plugin(
scorer_plugin_id=None,
scorer_entrypoint=None,
metrics={},
config={},
ctx={"timeout_ms": 5000},
)

View file

@ -0,0 +1,40 @@
from __future__ import annotations
from models.challenge import ChallengeAttempt
from services.challenge_service import ChallengeService
def test_evaluate_outcome_regex_match():
ok, details = ChallengeService.evaluate_outcome(
"Hello SECRET",
{"success_type": "regex", "success_pattern": "secret"},
)
assert ok is True
assert details.get("mode") == "regex"
def test_evaluate_outcome_contains():
ok, _ = ChallengeService.evaluate_outcome(
"hello world",
{"success_type": "contains", "success_pattern": "world"},
)
assert ok is True
def test_record_attempt_creates_row(mocker):
# mock db.session
session = mocker.MagicMock()
attempt = ChallengeService.record_attempt(
tenant_id="t1",
challenge_id="c1",
end_user_id=None,
account_id=None,
workflow_run_id=None,
succeeded=True,
score=10.0,
session=session,
)
assert isinstance(attempt, ChallengeAttempt)
session.add.assert_called_once()
session.commit.assert_called_once()

View file

@ -0,0 +1,34 @@
from __future__ import annotations
from types import SimpleNamespace
from services.red_blue_service import RedBlueService
def test_submit_prompt_creates_submission(mocker):
session = mocker.MagicMock()
sub = RedBlueService.submit_prompt(
challenge_id="cid",
tenant_id="tid",
team="red",
prompt="attack",
account_id="aid",
end_user_id="eid",
session=session,
)
assert sub.team == "red"
session.add.assert_called_once()
session.commit.assert_called_once()
def test_select_counterparty_submission_latest_active(mocker):
c = SimpleNamespace(id="cid")
session = mocker.MagicMock()
qs = (
session.query.return_value.filter.return_value.order_by.return_value
)
qs.first.return_value = SimpleNamespace(id="subid", team="blue")
sub = RedBlueService.select_counterparty_submission(challenge=c, team="red", session=session)
assert sub.team == "blue"

View file

@ -0,0 +1,633 @@
## Prompt Hacking Challenges — Design
### Overview
Enable developer-authored prompt hacking challenges inside Difys workflow builder via a new workflow node. Players can register/login using the existing web auth and compete on challenges. Attempts are recorded server-side and leaderboards are exposed via public web APIs.
### Goals
- Add a first-class workflow node that evaluates success/failure against developer-specified criteria.
- Add a Judging LLM node that compares model outputs to the challenge goal and produces pass/fail, textual feedback, and a 110 rating.
- Persist attempts with metadata for scoring and leaderboards.
- Reuse existing account/web auth for players.
- Fit Difys DDD/Clean Architecture: models, services, controllers, workflow nodes, and frontend builder integration.
### Non-Goals
- Anti-cheat measures beyond simple rate limiting.
- Complex custom scoring plugins (design leaves a hook for future work).
## Architecture Summary
### Backend components
- Models (SQLAlchemy)
- Challenge
- id, tenant_id, app_id, workflow_id
- name, description, goal (plain text shown to players)
- success_type: one of ['regex', 'contains', 'custom']
- success_pattern: string (regex or substring depending on type)
- secret_ref: reference to server-side secret (never exposed to clients)
- scoring_strategy: one of ['first', 'fastest', 'fewest_tokens', 'custom']
- is_active: bool
- created_by, created_at, updated_by, updated_at
- ChallengeAttempt
- id, tenant_id, challenge_id (FK), end_user_id (FK), workflow_run_id (optional FK)
- succeeded: bool
- score: numeric (meaning depends on strategy)
- judge_rating: int (010)
- judge_feedback: text
- judge_output_raw: jsonb (optional; structured judgement payload)
- tokens_total: int (when available from run metrics)
- elapsed_ms: int (when available)
- created_at
- Service layer (e.g., `ChallengeService`, `ChallengeJudgeService`)
- evaluate_outcome(output, cfg) -> (succeeded: bool, details: dict)
- judge_with_llm(goal, response, cfg) -> { passed: bool, rating: int, feedback: str, raw?: dict }
- evaluate_with_plugin(evaluator_ref, goal, response, ctx) -> { passed: bool, rating?: int, feedback?: str, raw?: dict }
- score_with_plugin(scorer_ref, attempt_metrics, ctx) -> { score: number, details?: dict }
- record_attempt(tenant_id, challenge_id, end_user_id, run_meta, succeeded) -> ChallengeAttempt
- get_leaderboard(challenge_id, limit, strategy) -> list
- get_challenge_public(challenge_id) -> dict
- Controllers
- Console (for creators): CRUD on challenges under the workspace (`/console/api/challenges`)
- Web (for players): public endpoints under `/web/api/challenges`
- List active challenges, fetch details, fetch leaderboard
- Optional auth via existing web login for personalization, otherwise anonymous read
- Workflow nodes
- NodeType: `challenge-evaluator`
- Config
- `challenge_id`: reference to a stored Challenge (preferred)
- or inline config: `success_type`, `success_pattern`, `scoring_strategy`
- `mask_variables`: string[] — variable names to redact in logs
- Execution
- Consumes upstream content (typically latest assistant output)
- Evaluates success with `ChallengeService.evaluate_outcome`
- If an `EndUser` context exists and a `challenge_id` is present, writes `ChallengeAttempt`
- Outputs `{ challenge_succeeded: boolean, message?: string }`, optionally passes through original output
- NodeType: `judging-llm`
- Purpose: judge a model response against the challenge goal using an LLM rubric.
- Config
- `judge_model`: provider/name/version
- `temperature`, `max_tokens`, other model params
- `rubric_prompt_template`: template with placeholders for {goal}, {response}, optional {hints}
- `rating_scale`: default 010; configurable upper bound optional
- `pass_threshold`: integer (default 5)
- Inputs
- `goal`: the attacking goal or acceptance criteria
- `response`: the model output to evaluate
- Execution
- Calls `ChallengeJudgeService.judge_with_llm()` to obtain structured judgement
- Returns outputs `{ judge_passed: boolean, judge_rating: number (010), judge_feedback: string, judge_raw?: object }`
- Integration
- Downstream `challenge-evaluator` can consume `judge_passed` and `judge_rating` to record an attempt instead of regex/contains
- Alternatively, `challenge-evaluator` may support an `evaluation_mode: 'rules' | 'llm-judge'` to invoke judging internally
- NodeType: `team-challenge` (Red/Blue orchestrator)
- Purpose: orchestrate two-sided challenges where players choose Red (attack) or Blue (defense) and submit prompts accordingly. The node pairs attacks and defenses, configures the LLM, invokes judging, and emits scores.
- Config
- `red_blue_challenge_id`: reference to a Red/Blue challenge definition
- `defense_selection_policy`: 'latest_best' | 'random_active' | 'round_robin' | 'request_new_if_none'
- `attack_selection_policy`: same options for the defense side evaluation path
- `judge_suite`: list of category tests to run (e.g., CBRNE, SA, SH, RWH, V, M)
- `scoring_strategy`: 'red_blue_ratio' | 'custom'
- Inputs
- `team_choice`: 'red' | 'blue'
- `attack_prompt?`: string (when `team_choice = 'red'`)
- `defense_prompt?`: string (when `team_choice = 'blue'`)
- Execution (high level)
- If `team_choice = 'red'`:
- Persist `attack_prompt` submission
- Load a defense by policy; if none, optionally signal Blue to provide one (async) and fall back to last known
- Configure LLM with defense as system prompt, submit attack as user message
- Run `judge_suite` via `judging-llm`; compute Red score
- If `team_choice = 'blue'`:
- Persist `defense_prompt` submission
- Load an attack by policy; if none, signal Red to provide one (async) and fall back to last known
- Configure LLM with defense as system prompt and submit the loaded attack
- Run `judge_suite`; compute Blue score (prevention)
- Persist pairing and metrics
- Outputs
- `{ team: 'red'|'blue', judge_passed: boolean, judge_rating: number, judge_feedback: string, categories: Record<string, boolean|number>, team_points: number, total_points: number }`
### Frontend components
- Workflow builder
- Add `Prompt Challenge` to the node palette
- Add `Judging LLM` to the node palette
- Node editor panel: select existing Challenge or define inline success criteria
- Judging panel: choose model, edit rubric prompt, set pass threshold, preview structured outputs
- Custom evaluator/scorer panels: choose plugin and configure JSON settings with live schema validation
- I18n strings in `web/i18n/en-US/`
- Challenge display & theming
- Author-provided instructions (Markdown) render before/alongside the task input area
- Theme tokens (colors, logo, background) applied to challenge pages
- Optional hero image/video via existing `UploadFile` and signed URLs
- Optional player UX (phase 2)
- `/challenges` list and `/challenges/[id]` details with leaderboard
- `/challenge-collections` list and `/challenge-collections/[id]` details with collection leaderboard
- Use existing web login endpoints
## Data Model
Minimal table shapes (final columns managed in migration):
```sql
-- challenges
id (uuid pk)
tenant_id (uuid fk)
app_id (uuid fk)
workflow_id (uuid fk)
name (text)
description (text)
goal (text)
success_type (text)
success_pattern (text)
secret_ref (text)
scoring_strategy (text)
is_active (bool)
created_by (uuid)
created_at (timestamp)
updated_by (uuid)
updated_at (timestamp)
-- challenge_attempts
id (uuid pk)
tenant_id (uuid fk)
challenge_id (uuid fk)
end_user_id (uuid fk)
workflow_run_id (uuid fk, nullable)
succeeded (bool)
score (numeric)
tokens_total (int)
elapsed_ms (int)
created_at (timestamp)
```
Additional columns for judging:
```sql
ALTER TABLE challenge_attempts
ADD COLUMN judge_rating integer,
ADD COLUMN judge_feedback text,
ADD COLUMN judge_output_raw jsonb;
```
Optional columns for custom evaluators/scorers:
```sql
ALTER TABLE challenges
ADD COLUMN evaluator_type text DEFAULT 'rules', -- one of: rules, llm-judge, custom
ADD COLUMN evaluator_plugin_id text,
ADD COLUMN evaluator_entrypoint text, -- e.g., "pkg.module:Evaluator"
ADD COLUMN evaluator_config jsonb,
ADD COLUMN scoring_plugin_id text,
ADD COLUMN scoring_entrypoint text, -- e.g., "pkg.module:Scorer"
ADD COLUMN scoring_config jsonb;
```
Additional tables for Red/Blue team challenges:
```sql
-- red_blue_challenges (definition)
CREATE TABLE red_blue_challenges (
id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
tenant_id uuid NOT NULL REFERENCES tenants(id),
app_id uuid NOT NULL REFERENCES apps(id),
workflow_id uuid REFERENCES workflows(id),
name text NOT NULL,
description text,
judge_suite jsonb NOT NULL, -- list of categories/tests
defense_selection_policy text NOT NULL DEFAULT 'latest_best',
attack_selection_policy text NOT NULL DEFAULT 'latest_best',
scoring_strategy text NOT NULL DEFAULT 'red_blue_ratio',
theme jsonb,
instructions_md text,
is_active boolean NOT NULL DEFAULT true,
created_by uuid,
created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_by uuid,
updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- team_submissions (attack/defense prompts)
CREATE TABLE team_submissions (
id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
red_blue_challenge_id uuid NOT NULL REFERENCES red_blue_challenges(id) ON DELETE CASCADE,
tenant_id uuid NOT NULL,
account_id uuid NULL,
end_user_id uuid NULL,
team text NOT NULL CHECK (team in ('red','blue')),
prompt text NOT NULL,
active boolean NOT NULL DEFAULT true,
created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- pairings (which attack tested against which defense)
CREATE TABLE team_pairings (
id uuid PRIMARY KEY DEFAULT uuid_generate_v4(),
red_blue_challenge_id uuid NOT NULL REFERENCES red_blue_challenges(id) ON DELETE CASCADE,
tenant_id uuid NOT NULL,
attack_submission_id uuid REFERENCES team_submissions(id),
defense_submission_id uuid REFERENCES team_submissions(id),
judge_output_raw jsonb,
categories jsonb, -- e.g., per-suite pass/fail or rating
judge_rating integer,
judge_feedback text,
red_points numeric NOT NULL DEFAULT 0,
blue_points numeric NOT NULL DEFAULT 0,
tokens_total int,
elapsed_ms int,
created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
);
```
## API Design
### Console (creator)
- `GET /console/api/challenges?app_id=...` — list
- `POST /console/api/challenges` — create
- `GET /console/api/challenges/{id}` — retrieve
- `PATCH /console/api/challenges/{id}` — update
- `DELETE /console/api/challenges/{id}` — delete
All require console `login_required` and membership in tenant.
### Web (player)
- `GET /web/api/challenges` — list active challenges (public)
- `GET /web/api/challenges/{id}` — details (public)
- `GET /web/api/challenges/{id}/leaderboard?limit=...` — leaderboard (public)
Player login uses existing web login endpoints to obtain access token when needed for personalization.
### Collections
Console (creator)
- `GET /console/api/challenge-collections?app_id=...`
- `POST /console/api/challenge-collections`
- `GET /console/api/challenge-collections/{id}`
- `PATCH /console/api/challenge-collections/{id}`
- `DELETE /console/api/challenge-collections/{id}`
- `PUT /console/api/challenge-collections/{id}/challenges` (set membership and order)
Web (player)
- `GET /web/api/challenge-collections` — list public collections
- `GET /web/api/challenge-collections/{id}` — collection details (instructions/theme), included challenges
- `GET /web/api/challenge-collections/{id}/leaderboard?limit=...` — collection leaderboard
### Red/Blue team challenge APIs
Console (creator)
- `POST /console/api/red-blue-challenges` — create
- `GET /console/api/red-blue-challenges?app_id=...` — list
- `GET /console/api/red-blue-challenges/{id}` — detail
- `PATCH /console/api/red-blue-challenges/{id}` — update
- `DELETE /console/api/red-blue-challenges/{id}` — delete
- `GET /console/api/red-blue-challenges/{id}/pairings` — view pairings/metrics
Web (player)
- `POST /web/api/red-blue-challenges/{id}/join` — join red or blue (payload: { team })
- `POST /web/api/red-blue-challenges/{id}/submit` — submit attack/defense (payload: { team, prompt })
- `GET /web/api/red-blue-challenges/{id}` — public info (instructions, theme, leaderboard snapshot)
- `GET /web/api/red-blue-challenges/{id}/leaderboard?limit=...` — red vs blue standings
## Player Registration & Identity
### Registration and login
- Reuse existing web auth service for player accounts:
- Email/password login: `POST /web/api/login`
- Email code login: `POST /web/api/login/email-code/send` + `POST /web/api/login/email-code/verify` (existing patterns)
- Add an explicit web registration endpoint (thin wrapper around `RegisterService.register`):
- `POST /web/api/register` (payload: email, name, password | email-code)
- Behavior:
- `create_workspace_required = False` to avoid auto-creating workspaces for players
- `status = active`
- Set `interface_language` from `Accept-Language` as done in OAuth flow
- On success, also create or associate a per-tenant `EndUser` record so gameplay runs can be attributed consistently.
### Player identity during runs
- Each gameplay run already has an `EndUser` context. For registered players:
- When a player is authenticated, resolve (or lazily create) an `EndUser` tied to their `account_id` for the current tenant/app
- Persist `end_user_id` to `ChallengeAttempt` as today; optionally also store `account_id` for simplified leaderboard personalization
### Optional schema addition
```sql
ALTER TABLE challenge_attempts
ADD COLUMN account_id uuid NULL;
```
This enables direct joins to accounts for notification and profile display without traversing end-user mappings.
### Player profile (optional)
Introduce a lightweight `player_profiles` table for nickname/avatar/notification preferences without touching `account` directly:
```sql
CREATE TABLE player_profiles (
account_id uuid PRIMARY KEY REFERENCES accounts(id) ON DELETE CASCADE,
display_name text,
avatar_url text,
notify_on_first_blood boolean DEFAULT true,
notify_on_record_beaten boolean DEFAULT true,
created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
);
```
## Workflow Node Execution
1. Node receives upstream output (string or structured content). A typical placement is after an LLM node.
2. Node loads Challenge config (stored by `challenge_id` or inline).
3. Node evaluates success by either rules or judging:
- `regex`: test pattern against text output
- `contains`: case-insensitive substring match
- `llm-judge`: call the `judging-llm` node (or internal judge) to obtain `{ judge_passed, judge_rating, judge_feedback }`
- `custom`: call `evaluate_with_plugin` using configured `evaluator_plugin_id`/`evaluator_entrypoint`
4. If `EndUser` and `challenge_id` present, record a `ChallengeAttempt` with run metrics (tokens, elapsed), and when available, `judge_rating`/`judge_feedback`.
5. Node outputs
- Rules mode: `{ challenge_succeeded: boolean, message?: string }`
- Judging mode: `{ challenge_succeeded: boolean, judge_rating: number, judge_feedback: string }`
- Pass through original output for chaining when needed.
For collections, attempts are recorded per challenge as usual. Collection leaderboard aggregation is computed over a players best attempt per challenge, combined using the collections `scoring_strategy` (e.g., sum of scores, total elapsed_ms, etc.).
## Scoring Strategies
- `first`: first successful attempt time wins (leaderboard sorted by earliest `created_at`).
- `fastest`: success with lowest `elapsed_ms` wins.
- `fewest_tokens`: success with lowest `tokens_total` wins.
- `highest_rating`: success with the highest `judge_rating` wins; ties broken by earliest `created_at`.
- `custom`: compute via `score_with_plugin` using `scoring_plugin_id`/`scoring_entrypoint`.
Collection strategies:
- `sum`: sum of per-challenge scores in the collection (uses built-in or custom scoring per challenge)
- `fastest_total`: sum of `elapsed_ms` of successful best attempts (lower is better)
- `fewest_tokens_total`: sum of `tokens_total` of successful best attempts (lower is better)
- `highest_avg_rating`: average of `judge_rating` across completed challenges (higher is better)
- `custom`: plugin-defined; service calls `score_with_plugin` at collection level with a list of per-challenge metrics
Red/Blue team scoring:
- Base idea: award Red points for breakthroughs and Blue points for prevented attacks.
- Suggested defaults per pairing:
- For each category in the judge suite (e.g., CBRNE, SA, SH, RWH, V, M):
- If the attack bypasses defense (category breach), Red +1
- If defense prevents (no breach), Blue +1
- Bonus based on `judge_rating` magnitude for breakthrough severity (e.g., Red +round(rating/3))
- Time/token penalties can reduce points to encourage efficient strategies
- Ratio-based standings:
- Red ratio = Red points / (Red points + Blue points)
- Blue ratio = Blue points / (Red points + Blue points)
- Custom plugin scoring:
- Provide all pairing metrics to a scorer plugin to compute per-pairing or cumulative standings
## Custom Evaluators & Scorers
This section specifies how custom evaluation and scoring plugins integrate with challenges.
### Concepts
- Evaluator: decides whether a response meets the goal. May optionally emit a rating (010) and textual feedback.
- Scorer: converts an attempts metrics (e.g., elapsed time, tokens, rating) into a numeric score for leaderboards.
### Data model
- `challenges.evaluator_type`: one of `rules`, `llm-judge`, or `custom`.
- `challenges.evaluator_plugin_id`, `evaluator_entrypoint`, `evaluator_config`: identify and configure the evaluator plugin when `custom` is selected.
- `challenges.scoring_plugin_id`, `scoring_entrypoint`, `scoring_config`: identify and configure the scorer plugin when `scoring_strategy = 'custom'`.
### Service interfaces
Evaluator interface (Python):
```python
class EvaluatorContext(TypedDict, total=False):
tenant_id: str
app_id: str
workflow_id: str
challenge_id: str
end_user_id: str | None
variables: dict[str, Any] # sanitized runtime variables
timeout_ms: int
class EvaluatorResult(TypedDict, total=False):
passed: bool
rating: int # 010 (optional)
feedback: str # textual feedback for player (optional)
raw: dict[str, Any] # internal diagnostics (optional)
class EvaluatorProtocol(Protocol):
def evaluate(self, goal: str, response: str, config: dict[str, Any], ctx: EvaluatorContext) -> EvaluatorResult: ...
```
Scorer interface (Python):
```python
class ScoringContext(TypedDict, total=False):
tenant_id: str
app_id: str
workflow_id: str
challenge_id: str
end_user_id: str | None
timeout_ms: int
class AttemptMetrics(TypedDict, total=False):
succeeded: bool
tokens_total: int | None
elapsed_ms: int | None
rating: int | None
created_at: int | None # epoch ms
class ScoringResult(TypedDict, total=False):
score: float
details: dict[str, Any] | None
class ScorerProtocol(Protocol):
def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult: ...
```
### Discovery and loading
- Plugins are discovered via the existing plugin manager. Each plugin exposes one or more entrypoints (e.g., `pkg.module:Evaluator`).
- `evaluator_plugin_id`/`evaluator_entrypoint` and `scoring_plugin_id`/`scoring_entrypoint` identify the target callables.
- Services load plugins lazily and cache handles with safe import guards.
### Execution flow
1) For `evaluator_type = 'custom'`, the `challenge-evaluator` node calls `evaluate_with_plugin` with `(goal, response, evaluator_config, ctx)`.
2) If `EvaluatorResult.passed` is true, set `challenge_succeeded = True` and persist `judge_rating`/`judge_feedback` if provided.
3) For `scoring_strategy = 'custom'`, call `score_with_plugin` with attempt metrics to compute `score`.
4) Persist `ChallengeAttempt` with plugin-derived fields.
### Frontend configuration
- Prompt Challenge panel
- Evaluation mode: Rules | Judging LLM | Custom Evaluator
- When Custom Evaluator is chosen:
- Plugin selector: lists available evaluator plugins by `plugin_id` and exposed entrypoints
- JSON config editor with schema-based validation (optional `$schema` per plugin)
- Scoring section
- Strategy: First | Fastest | Fewest Tokens | Highest Rating | Custom
- When Custom is chosen: plugin selector + JSON config editor
### Security & sandboxing
- Plugins run under server control with:
- Timeouts (default 5s) and memory ceilings; cancellation on overrun
- No network access by default (opt-in allowlist if ever needed)
- Sanitized inputs: secrets removed; only whitelisted variables passed
- Structured error mapping; no stack traces leaked to players
### Error handling & observability
- If plugin load or execution fails, treat as non-pass and record a generic failure reason.
- Emit structured logs/events with plugin identifiers and durations (no sensitive content).
- Surface minimal feedback to players; detailed diagnostics remain internal.
### Examples
Evaluator (substring with banned terms):
```python
class SimpleEvaluator:
def evaluate(self, goal, response, config, ctx):
required = config.get('must_contain', [])
banned = set(map(str.lower, config.get('banned', [])))
if any(w.lower() in response.lower() for w in banned):
return {'passed': False, 'feedback': 'Banned content detected', 'rating': 2}
if all(w.lower() in response.lower() for w in required):
return {'passed': True, 'feedback': 'Meets criteria', 'rating': 8}
return {'passed': False, 'feedback': 'Missing required signal', 'rating': 5}
```
Scorer (weighted combo):
```python
class WeightedScorer:
def score(self, metrics, config, ctx):
base = 0.0
if metrics.get('succeeded'):
base += config.get('success_bonus', 100)
rating = metrics.get('rating') or 0
elapsed = metrics.get('elapsed_ms') or 0
tokens = metrics.get('tokens_total') or 0
score = base + rating * config.get('rating_weight', 10) \
- (elapsed / 1000.0) * config.get('time_penalty', 1.0) \
- tokens * config.get('token_penalty', 0.01)
return {'score': max(score, 0.0)}
```
## Security & Privacy
- Never expose `secret_ref` or derived secrets to clients or node outputs.
- Redact configured `mask_variables` in logs and stored attempt details.
- Apply rate limiting using existing helpers to mitigate brute-force attempts.
- Store minimal details on failed attempts to reduce information leakage.
- Sanitize Markdown instructions to prevent XSS; allow a safe subset (links/images) with rel=noopener.
- Theme application is constrained to a whitelist of CSS variables and asset URLs served via signed URLs.
## Testing Plan
- Service unit tests
- `evaluate_outcome` for regex/contains (edge cases, unicode, multiline)
- `judge_with_llm` deterministic tests with mocked LLM returning structured payloads
- `record_attempt` scoring aggregation and sorting
- Node tests
- Given inputs, assert success/failure and resulting outputs
- Judging node: asserts `{ judge_passed, judge_rating, judge_feedback }` shape and thresholds
- When `challenge_id` present, attempts are written; when not, none are written
- API tests
- Console CRUD happy paths and permissions
- Web endpoints list/details/leaderboard
- Frontend
- Panel validation, serialization/deserialization of node config
- Judging panel: model selection, rubric template binding, threshold validation
- Node palette presence
- Challenge instructions: Markdown renderer sanitization, link and image handling
- Theming: verify CSS variable injection, dark/light modes, and fallback to defaults
- Collections UI: ordering, visibility filtering, collection leaderboard rendering
## Rollout
1. DB migrations: create `challenges`, `challenge_attempts` tables; add judging columns.
2. Backend: models, service, console/web controllers, workflow node, `NodeType` and node mapping registration.
3. Frontend: add block enum, node + panel components (Prompt Challenge, Judging LLM), node palette default, i18n entries.
4. QA: run `make lint`, `make type-check`, and unit tests; `pnpm lint` and tests for web.
5. Documentation: link this design from contributor docs as needed.
## Open Questions / Future Work
- Anti-cheat signals and anomaly detection.
- Custom evaluator/scoring plugin hooks with sandboxing.
- Team competitions and seasons.
- Per-challenge rate limits and cooldowns.
## Notifications
### Events
- `challenge_first_blood`: emitted when the first successful attempt occurs for a challenge
- `challenge_record_beaten`: emitted when a leaderboard record is surpassed under the active scoring strategy
- `team_pairing_completed`: emitted after each Red/Blue pairing is judged with per-team points
### Delivery channels
- In-app (console): add a section in the console UI for challenge events; poll or use server-sent events
- Email (optional): send via existing email task infra (e.g., Celery tasks)
- Webhook (optional): per-tenant webhook endpoint configured in workspace settings to receive challenge events
### Payloads
```json
{
"event": "challenge_record_beaten",
"challenge_id": "...",
"scoring_strategy": "highest_rating",
"previous_record": { "account_id": "...", "score": 95.2 },
"new_record": { "account_id": "...", "score": 96.8 },
"occurred_at": 1730000000000
}
```
Red/Blue pairing example:
```json
{
"event": "team_pairing_completed",
"red_blue_challenge_id": "...",
"pairing_id": "...",
"attack_submission_id": "...",
"defense_submission_id": "...",
"categories": { "CBRNE": true, "SA": false, "SH": true },
"judge_rating": 8,
"red_points": 4,
"blue_points": 2,
"occurred_at": 1730000000001
}
```
### Triggers in services
- After `record_attempt`, re-evaluate leaderboard head for the challenge against the prior head
- If the head changed and meets trigger criteria, enqueue notification tasks
- Respect player profile preferences (`notify_on_first_blood`, `notify_on_record_beaten`)
### Player-facing feedback
- Immediate feedback comes from node outputs (e.g., `judge_feedback`, `judge_rating`)
- Aggregated notifications (record beaten, first blood) are async and opt-in per player preferences

View file

@ -0,0 +1,102 @@
import { submitChallengeAttempt } from '@/service/challenges'
import { postPublic } from '@/service/base'
import { PUBLIC_API_PREFIX } from '@/config'
jest.mock('@/service/base', () => ({
getPublic: jest.fn(),
postPublic: jest.fn(),
}))
const mockedPostPublic = postPublic as jest.MockedFunction<typeof postPublic>
const originalFetch = globalThis.fetch
let fetchMock: jest.Mock
describe('submitChallengeAttempt', () => {
beforeEach(() => {
fetchMock = jest.fn()
globalThis.fetch = fetchMock as unknown as typeof fetch
mockedPostPublic.mockReset()
mockedPostPublic.mockResolvedValue({ result: 'success' } as any)
localStorage.clear()
})
afterEach(() => {
jest.clearAllMocks()
})
afterAll(() => {
globalThis.fetch = originalFetch
})
it('throws when challenge web app is not published', async () => {
await expect(
submitChallengeAttempt('challenge-id', 'app-id', undefined, 'chat', 'hello'),
).rejects.toThrow('Challenge app is not published')
expect(fetchMock).not.toHaveBeenCalled()
expect(mockedPostPublic).not.toHaveBeenCalled()
})
it('requests a passport token and submits chat attempts through /chat-messages', async () => {
const passportToken = 'chat-passport-token'
fetchMock.mockResolvedValue({
ok: true,
json: jest.fn().mockResolvedValue({ access_token: passportToken }),
})
await submitChallengeAttempt('challenge-123', 'app-abc', 'site-code-xyz', 'chat', 'solve this')
expect(fetchMock).toHaveBeenCalledWith(`${PUBLIC_API_PREFIX}/passport`, {
method: 'GET',
headers: {
'X-App-Code': 'site-code-xyz',
},
credentials: 'include',
})
expect(mockedPostPublic).toHaveBeenCalledWith('/chat-messages', expect.objectContaining({
body: {
query: 'solve this',
inputs: {},
response_mode: 'blocking',
conversation_id: '',
},
}))
const storedToken = JSON.parse(localStorage.getItem('token') || '{}')
expect(storedToken.version).toBe(2)
expect(storedToken['challenge-123'].DEFAULT).toBe(passportToken)
})
it('requests a passport token and submits workflow attempts through /workflows/run', async () => {
const passportToken = 'workflow-passport-token'
fetchMock.mockResolvedValue({
ok: true,
json: jest.fn().mockResolvedValue({ access_token: passportToken }),
})
await submitChallengeAttempt('challenge-456', 'app-def', 'site-code-xyz', 'workflow', 'my answer')
expect(fetchMock).toHaveBeenCalledWith(`${PUBLIC_API_PREFIX}/passport`, {
method: 'GET',
headers: {
'X-App-Code': 'site-code-xyz',
},
credentials: 'include',
})
expect(mockedPostPublic).toHaveBeenCalledWith('/workflows/run', expect.objectContaining({
body: {
inputs: {
user_prompt: 'my answer',
},
response_mode: 'blocking',
},
}))
const storedToken = JSON.parse(localStorage.getItem('token') || '{}')
expect(storedToken['challenge-456'].DEFAULT).toBe(passportToken)
})
})

View file

@ -0,0 +1,163 @@
'use client'
import { useState } from 'react'
import { useTranslation } from 'react-i18next'
import Modal from '@/app/components/base/modal'
import Button from '@/app/components/base/button'
import Input from '@/app/components/base/input'
import Textarea from '@/app/components/base/textarea'
import Switch from '@/app/components/base/switch'
import Toast from '@/app/components/base/toast'
import Select from '@/app/components/base/select'
import { createConsoleChallenge } from '@/service/console/challenges'
import { useAppFullList } from '@/service/use-apps'
import { useAppWorkflow } from '@/service/use-workflow'
type Props = {
show: boolean
onHide: () => void
onSuccess: () => void
}
export default function CreateChallengeModal({ show, onHide, onSuccess }: Props) {
const { t } = useTranslation()
const [form, setForm] = useState({
app_id: '',
workflow_id: '',
name: '',
description: '',
goal: '',
is_active: true,
})
const [loading, setLoading] = useState(false)
const { data: appsData } = useAppFullList()
const apps = appsData?.data || []
const { data: workflowData } = useAppWorkflow(form.app_id)
const hasWorkflow = !!workflowData?.graph
const handleSubmit = async () => {
if (!form.app_id || !form.name) {
Toast.notify({ type: 'error', message: 'App ID and Name are required' })
return
}
setLoading(true)
try {
await createConsoleChallenge(form)
Toast.notify({ type: 'success', message: 'Challenge created successfully' })
onSuccess()
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Failed to create challenge' })
}
finally {
setLoading(false)
}
}
return (
<Modal
isShow={show}
onClose={onHide}
title={t('challenges.console.create')}
className='!max-w-[640px]'
>
<div className='space-y-4 p-8'>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.appId')} <span className='text-text-destructive'>*</span>
</label>
<Select
className='w-full'
defaultValue={form.app_id}
onSelect={item => setForm({ ...form, app_id: item.value as string, workflow_id: '' })}
placeholder={t('common.placeholder.select')}
items={apps.map(app => ({
value: app.id,
name: app.name,
}))}
/>
</div>
{form.app_id && hasWorkflow && (
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.workflowId')}
</label>
<div className='flex items-center gap-2'>
<Input
className='flex-1'
value={workflowData?.id || ''}
disabled
/>
<Button
size='small'
onClick={() => setForm({ ...form, workflow_id: workflowData?.id || '' })}
>
Use Workflow
</Button>
</div>
<div className='mt-1 text-xs text-text-tertiary'>
{workflowData?.id ? `Workflow ID: ${workflowData.id}` : 'No workflow published'}
</div>
</div>
)}
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.name')} <span className='text-text-destructive'>*</span>
</label>
<Input
value={form.name}
onChange={e => setForm({ ...form, name: e.target.value })}
placeholder={t('challenges.console.form.namePlaceholder')}
/>
</div>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.description')}
</label>
<Textarea
value={form.description}
onChange={e => setForm({ ...form, description: e.target.value })}
placeholder={t('challenges.console.form.descriptionPlaceholder')}
rows={3}
/>
</div>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.goal')}
</label>
<Textarea
value={form.goal}
onChange={e => setForm({ ...form, goal: e.target.value })}
placeholder={t('challenges.console.form.goalPlaceholder')}
rows={2}
/>
<div className='mt-1 text-xs text-text-tertiary'>
This will be shown to players. Challenge logic is defined in the workflow using Challenge Evaluator nodes.
</div>
</div>
<div className='flex items-center justify-between'>
<label className='text-sm font-medium text-text-secondary'>
{t('challenges.console.form.isActive')}
</label>
<Switch
defaultValue={form.is_active}
onChange={v => setForm({ ...form, is_active: v })}
/>
</div>
<div className='flex justify-end gap-2 pt-4'>
<Button onClick={onHide}>{t('common.operation.cancel')}</Button>
<Button variant='primary' onClick={handleSubmit} loading={loading}>
{t('common.operation.create')}
</Button>
</div>
</div>
</Modal>
)
}

View file

@ -0,0 +1,133 @@
'use client'
import { useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { RiAddLine, RiDeleteBinLine } from '@remixicon/react'
import { deleteConsoleChallenge, listConsoleChallenges, updateConsoleChallenge } from '@/service/console/challenges'
import Button from '@/app/components/base/button'
import Toast from '@/app/components/base/toast'
import Confirm from '@/app/components/base/confirm'
import CreateChallengeModal from './create-challenge-modal'
export default function ConsoleChallengesPage() {
const { t } = useTranslation()
const [items, setItems] = useState<any[]>([])
const [showModal, setShowModal] = useState(false)
const [loading, setLoading] = useState(false)
const [pendingDeleteId, setPendingDeleteId] = useState<string | null>(null)
const load = async () => {
setLoading(true)
try {
const data = await listConsoleChallenges()
setItems(data)
}
finally {
setLoading(false)
}
}
useEffect(() => {
void load()
}, [])
const confirmDelete = async () => {
if (!pendingDeleteId)
return
try {
await deleteConsoleChallenge(pendingDeleteId)
Toast.notify({ type: 'success', message: 'Challenge deleted' })
await load()
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Delete failed' })
}
finally {
setPendingDeleteId(null)
}
}
const handleToggleActive = async (item: any) => {
try {
await updateConsoleChallenge(item.id, { is_active: !item.is_active })
Toast.notify({ type: 'success', message: item.is_active ? 'Deactivated' : 'Activated' })
await load()
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Update failed' })
}
}
return (
<div className='flex h-full flex-col bg-components-panel-bg'>
<div className='flex items-center justify-between border-b border-divider-subtle px-12 py-4'>
<h1 className='text-xl font-semibold text-text-primary'>{t('challenges.console.title')}</h1>
<Button onClick={() => setShowModal(true)}>
<RiAddLine className='h-4 w-4' />
{t('challenges.console.create')}
</Button>
</div>
<div className='flex-1 overflow-y-auto px-12 py-6'>
{loading ? (
<div className='text-text-tertiary'>{t('common.loading')}</div>
) : items.length === 0 ? (
<div className='flex flex-col items-center justify-center py-16'>
<div className='mb-2 text-text-secondary'>{t('challenges.console.empty')}</div>
<div className='text-sm text-text-tertiary'>{t('challenges.console.emptyDesc')}</div>
</div>
) : (
<div className='grid gap-4 sm:grid-cols-2 lg:grid-cols-3'>
{items.map(item => (
<div key={item.id} className='group relative rounded-xl border border-divider-subtle bg-components-panel-bg p-4 shadow-xs transition-shadow hover:shadow-md'>
<div className='mb-2 text-base font-semibold text-text-primary'>{item.name}</div>
{item.description && (
<div className='mb-2 line-clamp-2 text-sm text-text-secondary'>{item.description}</div>
)}
{item.goal && (
<div className='mb-2 line-clamp-1 text-xs text-text-tertiary'>Goal: {item.goal}</div>
)}
<div className='mt-3 flex items-center justify-between'>
<div className={`rounded px-2 py-0.5 text-xs font-medium ${item.is_active ? 'bg-util-colors-green-green-100 text-util-colors-green-green-700' : 'bg-components-badge-gray text-text-tertiary'}`}>
{item.is_active ? t('challenges.console.status.active') : t('challenges.console.status.inactive')}
</div>
<div className='flex gap-1 opacity-0 transition-opacity group-hover:opacity-100'>
<Button
size='small'
onClick={() => handleToggleActive(item)}
>
{item.is_active ? t('challenges.console.actions.deactivate') : t('challenges.console.actions.activate')}
</Button>
<Button
size='small'
variant='ghost'
onClick={() => setPendingDeleteId(item.id)}
>
<RiDeleteBinLine className='h-4 w-4' />
</Button>
</div>
</div>
</div>
))}
</div>
)}
</div>
{showModal && (
<CreateChallengeModal
show={showModal}
onHide={() => setShowModal(false)}
onSuccess={() => {
setShowModal(false)
void load()
}}
/>
)}
<Confirm
isShow={Boolean(pendingDeleteId)}
title={t('challenges.console.actions.deleteConfirm')}
content={t('challenges.console.actions.deleteConfirm')}
onCancel={() => setPendingDeleteId(null)}
onConfirm={confirmDelete}
/>
</div>
)
}

View file

@ -0,0 +1,194 @@
'use client'
import { useState } from 'react'
import { useTranslation } from 'react-i18next'
import Modal from '@/app/components/base/modal'
import Button from '@/app/components/base/button'
import Input from '@/app/components/base/input'
import Textarea from '@/app/components/base/textarea'
import Switch from '@/app/components/base/switch'
import Toast from '@/app/components/base/toast'
import Select from '@/app/components/base/select'
import { createRedBlueChallenge } from '@/service/console/challenges'
import { useAppFullList } from '@/service/use-apps'
import { useAppWorkflow } from '@/service/use-workflow'
type Props = {
show: boolean
onHide: () => void
onSuccess: () => void
}
export default function CreateRedBlueModal({ show, onHide, onSuccess }: Props) {
const { t } = useTranslation()
const [form, setForm] = useState({
app_id: '',
workflow_id: '',
name: '',
description: '',
judge_suite: ['CBRNE', 'SA', 'SH', 'RWH', 'V', 'M'],
defense_selection_policy: 'latest_best',
attack_selection_policy: 'latest_best',
scoring_strategy: 'red_blue_ratio',
is_active: true,
})
const [loading, setLoading] = useState(false)
const { data: appsData } = useAppFullList()
const apps = appsData?.data || []
const { data: workflowData } = useAppWorkflow(form.app_id)
const hasWorkflow = !!workflowData?.graph
const handleSubmit = async () => {
if (!form.app_id || !form.name) {
Toast.notify({ type: 'error', message: 'App ID and Name are required' })
return
}
setLoading(true)
try {
await createRedBlueChallenge(form)
Toast.notify({ type: 'success', message: 'Red/Blue challenge created successfully' })
onSuccess()
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Failed to create challenge' })
}
finally {
setLoading(false)
}
}
return (
<Modal
isShow={show}
onClose={onHide}
title={t('challenges.console.createRedBlue')}
className='!max-w-[640px]'
>
<div className='space-y-4 p-8'>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.appId')} <span className='text-text-destructive'>*</span>
</label>
<Select
className='w-full'
defaultValue={form.app_id}
onSelect={item => setForm({ ...form, app_id: item.value as string, workflow_id: '' })}
placeholder={t('common.placeholder.select')}
items={apps.map(app => ({
value: app.id,
name: app.name,
}))}
/>
</div>
{form.app_id && hasWorkflow && (
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.workflowId')}
</label>
<div className='flex items-center gap-2'>
<Input
className='flex-1'
value={workflowData?.id || ''}
disabled
/>
<Button
size='small'
onClick={() => setForm({ ...form, workflow_id: workflowData?.id || '' })}
>
Use Workflow
</Button>
</div>
<div className='mt-1 text-xs text-text-tertiary'>
{workflowData?.id ? `Workflow ID: ${workflowData.id}` : 'No workflow published'}
</div>
</div>
)}
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.name')} <span className='text-text-destructive'>*</span>
</label>
<Input
value={form.name}
onChange={e => setForm({ ...form, name: e.target.value })}
placeholder={t('challenges.console.form.namePlaceholder')}
/>
</div>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.description')}
</label>
<Textarea
value={form.description}
onChange={e => setForm({ ...form, description: e.target.value })}
placeholder={t('challenges.console.form.descriptionPlaceholder')}
rows={3}
/>
</div>
<div className='grid grid-cols-2 gap-4'>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.defensePolicy')}
</label>
<select
className='input-select w-full'
value={form.defense_selection_policy}
onChange={e => setForm({ ...form, defense_selection_policy: e.target.value })}
>
<option value='latest_best'>Latest Best</option>
<option value='random_active'>Random Active</option>
<option value='round_robin'>Round Robin</option>
<option value='request_new_if_none'>Request New If None</option>
</select>
</div>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.attackPolicy')}
</label>
<select
className='input-select w-full'
value={form.attack_selection_policy}
onChange={e => setForm({ ...form, attack_selection_policy: e.target.value })}
>
<option value='latest_best'>Latest Best</option>
<option value='random_active'>Random Active</option>
<option value='round_robin'>Round Robin</option>
<option value='request_new_if_none'>Request New If None</option>
</select>
</div>
</div>
<div>
<label className='mb-2 block text-sm font-medium text-text-secondary'>
{t('challenges.console.form.judgeSuite')}
</label>
<div className='text-xs text-text-tertiary'>
Categories: {form.judge_suite.join(', ')}
</div>
</div>
<div className='flex items-center justify-between'>
<label className='text-sm font-medium text-text-secondary'>
{t('challenges.console.form.isActive')}
</label>
<Switch
defaultValue={form.is_active}
onChange={v => setForm({ ...form, is_active: v })}
/>
</div>
<div className='flex justify-end gap-2 pt-4'>
<Button onClick={onHide}>{t('common.operation.cancel')}</Button>
<Button variant='primary' onClick={handleSubmit} loading={loading}>
{t('common.operation.create')}
</Button>
</div>
</div>
</Modal>
)
}

View file

@ -0,0 +1,136 @@
'use client'
import { useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { RiAddLine, RiDeleteBinLine } from '@remixicon/react'
import { deleteRedBlueChallenge, listRedBlueChallenges, updateRedBlueChallenge } from '@/service/console/challenges'
import Button from '@/app/components/base/button'
import Toast from '@/app/components/base/toast'
import Confirm from '@/app/components/base/confirm'
import CreateRedBlueModal from './create-red-blue-modal'
export default function ConsoleRedBlueChallengesPage() {
const { t } = useTranslation()
const [items, setItems] = useState<any[]>([])
const [showModal, setShowModal] = useState(false)
const [loading, setLoading] = useState(false)
const [pendingDeleteId, setPendingDeleteId] = useState<string | null>(null)
const load = async () => {
setLoading(true)
try {
const data = await listRedBlueChallenges()
setItems(data)
}
finally {
setLoading(false)
}
}
useEffect(() => {
void load()
}, [])
const confirmDelete = async () => {
if (!pendingDeleteId)
return
try {
await deleteRedBlueChallenge(pendingDeleteId)
Toast.notify({ type: 'success', message: 'Challenge deleted' })
await load()
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Delete failed' })
}
finally {
setPendingDeleteId(null)
}
}
const handleToggleActive = async (item: any) => {
try {
await updateRedBlueChallenge(item.id, { is_active: !item.is_active })
Toast.notify({ type: 'success', message: item.is_active ? 'Deactivated' : 'Activated' })
await load()
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Update failed' })
}
}
return (
<div className='flex h-full flex-col bg-components-panel-bg'>
<div className='flex items-center justify-between border-b border-divider-subtle px-12 py-4'>
<h1 className='text-xl font-semibold text-text-primary'>{t('challenges.redBlue.title')}</h1>
<Button onClick={() => setShowModal(true)}>
<RiAddLine className='h-4 w-4' />
{t('challenges.console.createRedBlue')}
</Button>
</div>
<div className='flex-1 overflow-y-auto px-12 py-6'>
{loading ? (
<div className='text-text-tertiary'>{t('common.loading')}</div>
) : items.length === 0 ? (
<div className='flex flex-col items-center justify-center py-16'>
<div className='mb-2 text-text-secondary'>{t('challenges.console.empty')}</div>
<div className='text-sm text-text-tertiary'>{t('challenges.console.emptyDesc')}</div>
</div>
) : (
<div className='grid gap-4 sm:grid-cols-2 lg:grid-cols-3'>
{items.map(item => (
<div key={item.id} className='group relative rounded-xl border border-divider-subtle bg-components-panel-bg p-4 shadow-xs transition-shadow hover:shadow-md'>
<div className='mb-2 flex items-start justify-between'>
<div className='text-base font-semibold text-text-primary'>{item.name}</div>
<div className='flex gap-1'>
<div className='rounded bg-util-colors-red-red-100 px-1.5 py-0.5 text-[10px] font-semibold uppercase text-util-colors-red-red-700'>RED</div>
<div className='rounded bg-util-colors-blue-blue-100 px-1.5 py-0.5 text-[10px] font-semibold uppercase text-util-colors-blue-blue-700'>BLUE</div>
</div>
</div>
{item.description && (
<div className='mb-2 line-clamp-2 text-sm text-text-secondary'>{item.description}</div>
)}
<div className='mt-3 flex items-center justify-between'>
<div className={`rounded px-2 py-0.5 text-xs font-medium ${item.is_active ? 'bg-util-colors-green-green-100 text-util-colors-green-green-700' : 'bg-components-badge-gray text-text-tertiary'}`}>
{item.is_active ? t('challenges.console.status.active') : t('challenges.console.status.inactive')}
</div>
<div className='flex gap-1 opacity-0 transition-opacity group-hover:opacity-100'>
<Button
size='small'
onClick={() => handleToggleActive(item)}
>
{item.is_active ? t('challenges.console.actions.deactivate') : t('challenges.console.actions.activate')}
</Button>
<Button
size='small'
variant='ghost'
onClick={() => setPendingDeleteId(item.id)}
>
<RiDeleteBinLine className='h-4 w-4' />
</Button>
</div>
</div>
</div>
))}
</div>
)}
</div>
{showModal && (
<CreateRedBlueModal
show={showModal}
onHide={() => setShowModal(false)}
onSuccess={() => {
setShowModal(false)
void load()
}}
/>
)}
<Confirm
isShow={Boolean(pendingDeleteId)}
title={t('challenges.console.actions.deleteConfirm')}
content={t('challenges.console.actions.deleteConfirm')}
onCancel={() => setPendingDeleteId(null)}
onConfirm={confirmDelete}
/>
</div>
)
}

View file

@ -0,0 +1,206 @@
'use client'
import { useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useParams } from 'next/navigation'
import { RiCheckLine, RiCloseLine, RiLoader4Line } from '@remixicon/react'
import { fetchChallengeDetail, fetchChallengeLeaderboard, submitChallengeAttempt } from '@/service/challenges'
import Leaderboard from '@/app/components/challenge/leaderboard'
import Button from '@/app/components/base/button'
import Textarea from '@/app/components/base/textarea'
import Toast from '@/app/components/base/toast'
export default function ChallengeDetailPage() {
const { t } = useTranslation()
const params = useParams()
const id = params?.id as string
const [challenge, setChallenge] = useState<any>(null)
const [leaderboard, setLeaderboard] = useState<any[]>([])
const [loading, setLoading] = useState(true)
const [submitting, setSubmitting] = useState(false)
const [userInput, setUserInput] = useState('')
const [lastResult, setLastResult] = useState<{ success: boolean; message?: string; rating?: number } | null>(null)
useEffect(() => {
const load = async () => {
try {
const [detail, leaders] = await Promise.all([
fetchChallengeDetail(id),
fetchChallengeLeaderboard(id),
])
setChallenge(detail)
setLeaderboard(leaders)
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Failed to load challenge' })
}
finally {
setLoading(false)
}
}
if (id)
load()
}, [id])
const handleSubmit = async () => {
if (!userInput.trim()) {
Toast.notify({ type: 'error', message: 'Please enter a response' })
return
}
if (!challenge?.app_id) {
Toast.notify({ type: 'error', message: 'Challenge is not configured with an app' })
return
}
setSubmitting(true)
setLastResult(null)
try {
// Execute the workflow with the user's input
// Endpoint varies by app type (chat vs workflow)
const result = await submitChallengeAttempt(
id,
challenge.app_id,
challenge.app_site_code,
challenge.app_mode || 'workflow',
userInput,
)
// Extract challenge results from workflow output
// Response structure differs by app mode:
// - Chat apps: result.data.answer + result.data.metadata.outputs
// - Workflow apps: result.data (direct outputs)
const isChatApp = challenge.app_mode === 'chat' || challenge.app_mode === 'advanced-chat'
const workflowOutputs = isChatApp
? (result.data?.metadata?.outputs || {})
: (result.data || {})
const success = workflowOutputs.challenge_succeeded || false
const rating = workflowOutputs.judge_rating
const feedback = workflowOutputs.judge_feedback || workflowOutputs.message || result.data?.answer
setLastResult({
success,
message: feedback || (success ? 'Challenge passed!' : 'Challenge not passed.'),
rating,
})
if (success) {
Toast.notify({ type: 'success', message: 'Challenge completed!' })
// Refresh leaderboard
const leaders = await fetchChallengeLeaderboard(id)
setLeaderboard(leaders)
}
}
catch (e: any) {
console.error('Submission error:', e)
Toast.notify({ type: 'error', message: e.message || 'Submission failed' })
}
finally {
setSubmitting(false)
}
}
if (loading) {
return (
<div className='flex min-h-screen items-center justify-center bg-components-panel-bg'>
<div className='text-text-tertiary'>{t('common.loading')}</div>
</div>
)
}
if (!challenge) {
return (
<div className='flex min-h-screen items-center justify-center bg-components-panel-bg'>
<div className='text-text-secondary'>Challenge not found</div>
</div>
)
}
return (
<div className='min-h-screen bg-components-panel-bg'>
<div className='mx-auto max-w-5xl px-4 py-12 sm:px-6 lg:px-8'>
<div className='mb-8'>
<h1 className='mb-2 text-3xl font-bold text-text-primary'>{challenge.name}</h1>
{challenge.description && (
<p className='text-lg text-text-secondary'>{challenge.description}</p>
)}
</div>
<div className='grid gap-6 lg:grid-cols-3'>
<div className='lg:col-span-2'>
{challenge.goal && (
<div className='mb-6 rounded-xl border border-divider-subtle bg-components-panel-bg p-6 shadow-xs'>
<h2 className='mb-2 text-sm font-medium uppercase tracking-wide text-text-tertiary'>
{t('challenges.player.goal')}
</h2>
<p className='text-text-primary'>{challenge.goal}</p>
</div>
)}
<div className='rounded-xl border border-divider-subtle bg-components-panel-bg p-6 shadow-xs'>
<h2 className='mb-4 text-lg font-semibold text-text-primary'>
{t('challenges.player.yourAttempt')}
</h2>
<Textarea
value={userInput}
onChange={e => setUserInput(e.target.value)}
placeholder='Enter your response here...'
rows={8}
className='mb-4 w-full'
/>
<Button
type='primary'
onClick={handleSubmit}
loading={submitting}
disabled={!userInput.trim()}
className='w-full'
>
{submitting ? (
<>
<RiLoader4Line className='mr-2 h-4 w-4 animate-spin' />
{t('common.operation.processing')}
</>
) : (
t('challenges.player.submit')
)}
</Button>
{lastResult && (
<div className={`mt-4 rounded-lg border p-4 ${lastResult.success ? 'border-util-colors-green-green-500 bg-util-colors-green-green-50' : 'border-util-colors-orange-orange-500 bg-util-colors-orange-orange-50'}`}>
<div className='flex items-start gap-3'>
{lastResult.success ? (
<RiCheckLine className='h-5 w-5 shrink-0 text-util-colors-green-green-600' />
) : (
<RiCloseLine className='h-5 w-5 shrink-0 text-util-colors-orange-orange-600' />
)}
<div className='flex-1'>
<div className={`mb-1 font-medium ${lastResult.success ? 'text-util-colors-green-green-700' : 'text-util-colors-orange-orange-700'}`}>
{lastResult.success ? t('challenges.player.status.success') : t('challenges.player.status.failed')}
</div>
{lastResult.message && (
<div className='text-sm text-text-secondary'>{lastResult.message}</div>
)}
{lastResult.rating !== undefined && (
<div className='mt-2 text-sm text-text-tertiary'>
{t('challenges.leaderboard.rating')}: {lastResult.rating}/10
</div>
)}
</div>
</div>
</div>
)}
</div>
</div>
<div className='lg:col-span-1'>
<Leaderboard entries={leaderboard} strategy={challenge.scoring_strategy} />
</div>
</div>
</div>
</div>
)
}

View file

@ -0,0 +1,78 @@
'use client'
import { useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import Link from 'next/link'
import { RiArrowRightLine, RiAwardLine } from '@remixicon/react'
import { fetchChallenges } from '@/service/challenges'
import type { ChallengeListItem } from '@/service/challenges'
export default function ChallengesListPage() {
const { t } = useTranslation()
const [challenges, setChallenges] = useState<ChallengeListItem[]>([])
const [loading, setLoading] = useState(true)
useEffect(() => {
const load = async () => {
try {
const data = await fetchChallenges()
console.log('Loaded challenges:', data)
setChallenges(data)
}
catch (error) {
console.error('Failed to load challenges:', error)
}
finally {
setLoading(false)
}
}
load()
}, [])
return (
<div className='min-h-screen bg-components-panel-bg'>
<div className='mx-auto max-w-7xl px-4 py-12 sm:px-6 lg:px-8'>
<div className='mb-8 text-center'>
<h1 className='mb-2 text-4xl font-bold text-text-primary'>{t('challenges.player.browse')}</h1>
<p className='text-lg text-text-secondary'>Test your skills and compete on the leaderboard</p>
</div>
{loading ? (
<div className='text-center text-text-tertiary'>{t('common.loading')}</div>
) : challenges.length === 0 ? (
<div className='rounded-xl border border-divider-subtle bg-components-panel-bg p-12 text-center'>
<RiAwardLine className='mx-auto mb-4 h-12 w-12 text-text-quaternary' />
<div className='text-text-secondary'>No challenges available yet</div>
</div>
) : (
<div className='grid gap-6 sm:grid-cols-2 lg:grid-cols-3'>
{challenges.map(challenge => (
<Link
key={challenge.id}
href={`/challenges/${challenge.id}`}
className='group block'
>
<div className='h-full rounded-xl border border-divider-subtle bg-components-panel-bg p-6 shadow-xs transition-all hover:border-components-button-primary-bg hover:shadow-md'>
<div className='mb-3 flex items-start justify-between'>
<RiAwardLine className='h-8 w-8 text-util-colors-cyan-cyan-500' />
<RiArrowRightLine className='h-5 w-5 text-text-quaternary transition-transform group-hover:translate-x-1' />
</div>
<h3 className='mb-2 text-lg font-semibold text-text-primary'>{challenge.name}</h3>
{challenge.description && (
<p className='mb-3 line-clamp-2 text-sm text-text-secondary'>{challenge.description}</p>
)}
{challenge.goal && (
<div className='mt-4 rounded-lg bg-components-panel-on-panel-item-bg p-3'>
<div className='mb-1 text-xs font-medium uppercase text-text-tertiary'>Goal</div>
<div className='line-clamp-2 text-sm text-text-secondary'>{challenge.goal}</div>
</div>
)}
</div>
</Link>
))}
</div>
)}
</div>
</div>
)
}

View file

@ -0,0 +1,124 @@
'use client'
import { useTranslation } from 'react-i18next'
type LeaderboardEntry = {
rank: number
player_name: string
score: number
elapsed_ms?: number
tokens_total?: number
judge_rating?: number
created_at: string
is_current_user?: boolean
}
type Props = {
entries: LeaderboardEntry[]
strategy?: string
}
export default function Leaderboard({ entries, strategy = 'highest_rating' }: Props) {
const { t } = useTranslation()
if (entries.length === 0) {
return (
<div className='rounded-xl border border-divider-subtle bg-components-panel-bg p-8 text-center'>
<div className='text-text-tertiary'>{t('challenges.leaderboard.empty')}</div>
</div>
)
}
return (
<div className='rounded-xl border border-divider-subtle bg-components-panel-bg shadow-xs'>
<div className='border-b border-divider-subtle px-6 py-4'>
<h2 className='text-lg font-semibold text-text-primary'>{t('challenges.leaderboard.title')}</h2>
</div>
<div className='overflow-x-auto'>
<table className='w-full'>
<thead className='border-b border-divider-subtle bg-components-panel-on-panel-item-bg'>
<tr>
<th className='px-6 py-3 text-left text-xs font-medium uppercase tracking-wider text-text-tertiary'>
{t('challenges.leaderboard.rank')}
</th>
<th className='px-6 py-3 text-left text-xs font-medium uppercase tracking-wider text-text-tertiary'>
{t('challenges.leaderboard.player')}
</th>
<th className='px-6 py-3 text-left text-xs font-medium uppercase tracking-wider text-text-tertiary'>
{t('challenges.leaderboard.score')}
</th>
{strategy === 'fastest' && (
<th className='px-6 py-3 text-left text-xs font-medium uppercase tracking-wider text-text-tertiary'>
{t('challenges.leaderboard.time')}
</th>
)}
{strategy === 'fewest_tokens' && (
<th className='px-6 py-3 text-left text-xs font-medium uppercase tracking-wider text-text-tertiary'>
{t('challenges.leaderboard.tokens')}
</th>
)}
{strategy === 'highest_rating' && (
<th className='px-6 py-3 text-left text-xs font-medium uppercase tracking-wider text-text-tertiary'>
{t('challenges.leaderboard.rating')}
</th>
)}
</tr>
</thead>
<tbody className='divide-y divide-divider-subtle'>
{entries.map((entry, idx) => (
<tr
key={idx}
className={`transition-colors hover:bg-components-panel-on-panel-item-bg ${entry.is_current_user ? 'bg-util-colors-blue-blue-50' : ''}`}
>
<td className='whitespace-nowrap px-6 py-4'>
<div className='flex items-center'>
{entry.rank <= 3 ? (
<span className='text-lg'>
{entry.rank === 1 && '🥇'}
{entry.rank === 2 && '🥈'}
{entry.rank === 3 && '🥉'}
</span>
) : (
<span className='text-sm text-text-tertiary'>#{entry.rank}</span>
)}
</div>
</td>
<td className='whitespace-nowrap px-6 py-4'>
<div className='flex items-center'>
<div className='text-sm font-medium text-text-primary'>
{entry.player_name}
{entry.is_current_user && (
<span className='ml-2 rounded bg-util-colors-blue-blue-100 px-1.5 py-0.5 text-xs text-util-colors-blue-blue-700'>
{t('challenges.leaderboard.yourBest')}
</span>
)}
</div>
</div>
</td>
<td className='whitespace-nowrap px-6 py-4 text-sm text-text-secondary'>
{entry.score.toFixed(1)}
</td>
{strategy === 'fastest' && entry.elapsed_ms !== undefined && (
<td className='whitespace-nowrap px-6 py-4 text-sm text-text-secondary'>
{(entry.elapsed_ms / 1000).toFixed(2)}s
</td>
)}
{strategy === 'fewest_tokens' && entry.tokens_total !== undefined && (
<td className='whitespace-nowrap px-6 py-4 text-sm text-text-secondary'>
{entry.tokens_total}
</td>
)}
{strategy === 'highest_rating' && entry.judge_rating !== undefined && (
<td className='whitespace-nowrap px-6 py-4'>
<div className='flex items-center'>
<span className='text-sm font-medium text-text-primary'>{entry.judge_rating}/10</span>
</div>
</td>
)}
</tr>
))}
</tbody>
</table>
</div>
</div>
)
}

View file

@ -65,6 +65,8 @@ export const useAvailableNodesMetaData = () => {
nodesMap: {
...availableNodesMetaDataMap,
[BlockEnum.VariableAssigner]: availableNodesMetaDataMap?.[BlockEnum.VariableAggregator],
// Legacy alias for renamed node
'prompt-challenge': availableNodesMetaDataMap?.[BlockEnum.ChallengeEvaluator],
},
}
}, [availableNodesMetaData, availableNodesMetaDataMap])

View file

@ -62,6 +62,8 @@ export const useAvailableNodesMetaData = () => {
nodesMap: {
...availableNodesMetaDataMap,
[BlockEnum.VariableAssigner]: availableNodesMetaDataMap?.[BlockEnum.VariableAggregator],
// Legacy alias for renamed node
'prompt-challenge': availableNodesMetaDataMap?.[BlockEnum.ChallengeEvaluator],
},
}
}, [availableNodesMetaData, availableNodesMetaDataMap])

View file

@ -66,6 +66,9 @@ const getIcon = (type: BlockEnum, className: string) => {
[BlockEnum.KnowledgeBase]: <KnowledgeBase className={className} />,
[BlockEnum.DataSource]: <Datasource className={className} />,
[BlockEnum.DataSourceEmpty]: <></>,
[BlockEnum.ChallengeEvaluator]: <IfElse className={className} />,
[BlockEnum.JudgingLLM]: <Llm className={className} />,
[BlockEnum.TeamChallenge]: <Agent className={className} />,
}[type]
}
const ICON_CONTAINER_BG_COLOR_MAP: Record<string, string> = {
@ -92,6 +95,9 @@ const ICON_CONTAINER_BG_COLOR_MAP: Record<string, string> = {
[BlockEnum.Agent]: 'bg-util-colors-indigo-indigo-500',
[BlockEnum.KnowledgeBase]: 'bg-util-colors-warning-warning-500',
[BlockEnum.DataSource]: 'bg-components-icon-bg-midnight-solid',
[BlockEnum.ChallengeEvaluator]: 'bg-util-colors-blue-blue-500',
[BlockEnum.JudgingLLM]: 'bg-util-colors-indigo-indigo-500',
[BlockEnum.TeamChallenge]: 'bg-util-colors-green-green-500',
}
const BlockIcon: FC<BlockIconProps> = ({
type,

View file

@ -61,6 +61,7 @@ export const SUPPORT_OUTPUT_VARS_NODE = [
BlockEnum.ParameterExtractor, BlockEnum.Iteration, BlockEnum.Loop,
BlockEnum.DocExtractor, BlockEnum.ListFilter,
BlockEnum.Agent, BlockEnum.DataSource,
BlockEnum.ChallengeEvaluator, BlockEnum.JudgingLLM, BlockEnum.TeamChallenge,
]
export const AGENT_OUTPUT_STRUCT: Var[] = [

View file

@ -20,6 +20,9 @@ import httpRequestDefault from '@/app/components/workflow/nodes/http/default'
import parameterExtractorDefault from '@/app/components/workflow/nodes/parameter-extractor/default'
import listOperatorDefault from '@/app/components/workflow/nodes/list-operator/default'
import toolDefault from '@/app/components/workflow/nodes/tool/default'
import challengeEvaluatorDefault from '@/app/components/workflow/nodes/challenge-evaluator/default'
import judgingLLMDefault from '@/app/components/workflow/nodes/judging-llm/default'
import teamChallengeDefault from '@/app/components/workflow/nodes/team-challenge/default'
export const WORKFLOW_COMMON_NODES = [
llmDefault,
@ -41,4 +44,7 @@ export const WORKFLOW_COMMON_NODES = [
httpRequestDefault,
listOperatorDefault,
toolDefault,
challengeEvaluatorDefault,
judgingLLMDefault,
teamChallengeDefault,
]

View file

@ -645,6 +645,41 @@ const formatItem = (
}) as Var[]
break
}
case BlockEnum.JudgingLLM:
case BlockEnum.ChallengeEvaluator:
case BlockEnum.TeamChallenge: {
// Synchronously get outputs if getOutputVars is defined
const nodeType = data.type
if (nodeType === BlockEnum.JudgingLLM) {
res.vars = [
{ variable: 'judge_passed', type: VarType.boolean },
{ variable: 'judge_rating', type: VarType.number },
{ variable: 'judge_feedback', type: VarType.string },
{ variable: 'judge_raw', type: VarType.object },
]
}
else if (nodeType === BlockEnum.ChallengeEvaluator) {
res.vars = [
{ variable: 'challenge_succeeded', type: VarType.boolean },
{ variable: 'judge_rating', type: VarType.number },
{ variable: 'judge_feedback', type: VarType.string },
{ variable: 'message', type: VarType.string },
]
}
else if (nodeType === BlockEnum.TeamChallenge) {
res.vars = [
{ variable: 'team', type: VarType.string },
{ variable: 'judge_passed', type: VarType.boolean },
{ variable: 'judge_rating', type: VarType.number },
{ variable: 'judge_feedback', type: VarType.string },
{ variable: 'categories', type: VarType.object },
{ variable: 'team_points', type: VarType.number },
{ variable: 'total_points', type: VarType.number },
]
}
break
}
}
const { error_strategy } = data

View file

@ -0,0 +1,42 @@
import type { NodeDefault } from '../../types'
import { genNodeMetaData } from '@/app/components/workflow/utils'
import { BlockEnum, VarType } from '@/app/components/workflow/types'
import { BlockClassificationEnum } from '@/app/components/workflow/block-selector/types'
import type { ChallengeEvaluatorNodeType } from './types'
const metaData = genNodeMetaData({
classification: BlockClassificationEnum.Utilities,
sort: 3,
type: BlockEnum.ChallengeEvaluator,
helpLinkUri: 'challenge-evaluator',
})
const nodeDefault: NodeDefault<ChallengeEvaluatorNodeType> = {
metaData,
defaultValue: {
evaluation_mode: 'rules',
success_type: 'contains',
success_pattern: '',
scoring_strategy: 'highest_rating',
mask_variables: [],
inputs: {
response: [],
},
},
getOutputVars() {
return [
{ variable: 'challenge_succeeded', type: VarType.boolean },
{ variable: 'judge_rating', type: VarType.number },
{ variable: 'judge_feedback', type: VarType.string },
{ variable: 'message', type: VarType.string },
]
},
checkValid(payload: ChallengeEvaluatorNodeType, t: any) {
let errorMessages = ''
if (payload.evaluation_mode === 'rules' && !payload.success_pattern)
errorMessages = t('workflow.errorMsg.fieldRequired', { field: 'success_pattern' })
return { isValid: !errorMessages, errorMessage: errorMessages }
},
}
export default nodeDefault

View file

@ -0,0 +1,28 @@
import type { FC } from 'react'
import React from 'react'
import type { NodeProps } from '@/app/components/workflow/types'
import type { ChallengeEvaluatorNodeType } from './types'
const Node: FC<NodeProps<ChallengeEvaluatorNodeType>> = ({ data }) => {
const { evaluation_mode, success_type, success_pattern, challenge_id } = data
return (
<div className='mb-1 px-3 py-1'>
{challenge_id ? (
<div className='flex items-center gap-2'>
<div className='rounded bg-components-badge-white-to-dark px-1 py-0.5 text-[10px] font-semibold uppercase text-text-tertiary'>Challenge</div>
<div className='truncate text-xs text-text-secondary' title={challenge_id}>{challenge_id}</div>
</div>
) : (
<div className='flex items-center gap-2'>
<div className='rounded bg-components-badge-white-to-dark px-1 py-0.5 text-[10px] font-semibold uppercase text-text-tertiary'>{evaluation_mode}</div>
<div className='rounded bg-components-badge-white-to-dark px-1 py-0.5 text-[10px] font-semibold uppercase text-text-tertiary'>{success_type}</div>
{success_pattern && (
<div className='min-w-0 truncate text-xs text-text-secondary' title={success_pattern}>"{success_pattern}"</div>
)}
</div>
)}
</div>
)
}
export default React.memo(Node)

View file

@ -0,0 +1,121 @@
import type { FC } from 'react'
import { memo, useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import type { NodePanelProps } from '@/app/components/workflow/types'
import Field from '@/app/components/workflow/nodes/_base/components/field'
import Split from '@/app/components/workflow/nodes/_base/components/split'
import OutputVars, { VarItem } from '@/app/components/workflow/nodes/_base/components/output-vars'
import VarReferencePicker from '@/app/components/workflow/nodes/_base/components/variable/var-reference-picker'
import type { ChallengeEvaluatorNodeType } from './types'
import useNodeCrud from '@/app/components/workflow/nodes/_base/hooks/use-node-crud'
import produce from 'immer'
import useSWR from 'swr'
import { fetchChallenges } from '@/service/challenges'
import Editor from '@/app/components/workflow/nodes/_base/components/prompt/editor'
import useAvailableVarList from '@/app/components/workflow/nodes/_base/hooks/use-available-var-list'
import Select from '@/app/components/base/select'
const i18nPrefix = 'workflow.nodes.challengeEvaluator'
const Panel: FC<NodePanelProps<ChallengeEvaluatorNodeType>> = ({ id, data }) => {
const { t } = useTranslation()
const { inputs, setInputs } = useNodeCrud<ChallengeEvaluatorNodeType>(id, data)
const { data: challenges } = useSWR('challenges:list', fetchChallenges)
const filterVar = useMemo(() => (_: any) => true, [])
const { availableVars, availableNodesWithParent } = useAvailableVarList(id, { onlyLeafNodeVar: false, filterVar })
return (
<div className='pt-2'>
<div className='space-y-4 px-4 pb-4'>
<Field title={t(`${i18nPrefix}.selectedChallenge`)} tooltip={t(`${i18nPrefix}.selectedChallengeTip`)}>
<Select
items={(challenges || []).map((c: any) => ({ value: c.id, name: c.name }))}
defaultValue={data.challenge_id || ''}
onSelect={item => setInputs(produce(inputs, (draft) => { (draft as any).challenge_id = (item?.value as string) || undefined }))}
allowSearch={false}
/>
</Field>
<Field title={t(`${i18nPrefix}.evaluationMode`)} tooltip={t(`${i18nPrefix}.evaluationModeTip`)}>
<Select
items={[
{ value: 'rules', name: 'Rules' },
{ value: 'llm-judge', name: 'Judging LLM' },
{ value: 'custom', name: 'Custom' },
]}
defaultValue={data.evaluation_mode || 'rules'}
onSelect={item => setInputs(produce(inputs, (draft) => { (draft as any).evaluation_mode = item.value as string }))}
allowSearch={false}
/>
</Field>
{!data.challenge_id && data.evaluation_mode === 'rules' && (
<>
<Field title={t(`${i18nPrefix}.successType`)} tooltip={t(`${i18nPrefix}.successTypeTip`)}>
<Select
items={[
{ value: 'contains', name: 'Contains' },
{ value: 'regex', name: 'Regex' },
]}
defaultValue={data.success_type || 'contains'}
onSelect={item => setInputs(produce(inputs, (draft) => { (draft as any).success_type = item.value as string }))}
allowSearch={false}
/>
</Field>
<Field title={t(`${i18nPrefix}.successPattern`)} tooltip={t(`${i18nPrefix}.successPatternTip`)} required>
<Editor
title={<div className='text-xs font-semibold uppercase text-text-secondary'>pattern</div>}
value={data.success_pattern || ''}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).success_pattern = v }))}
readOnly={false}
isShowContext={false}
isChatApp
isChatModel
hasSetBlockStatus={{ history: false, query: false, context: false }}
nodesOutputVars={availableVars}
availableNodes={availableNodesWithParent}
isSupportFileVar
/>
</Field>
</>
)}
<Field title={t(`${i18nPrefix}.responseVar`)} tooltip={t(`${i18nPrefix}.responseVarTip`)}>
<VarReferencePicker
nodeId={id}
readonly={false}
isShowNodeName
value={data.inputs?.response || []}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).inputs = { ...(draft as any).inputs, response: v } }))}
filterVar={filterVar}
/>
</Field>
<Field title={t(`${i18nPrefix}.scoringStrategy`)} tooltip={t(`${i18nPrefix}.scoringStrategyTip`)}>
<Select
items={[
{ value: 'first', name: t(`${i18nPrefix}.scoringFirst`) },
{ value: 'fastest', name: t(`${i18nPrefix}.scoringFastest`) },
{ value: 'fewest_tokens', name: t(`${i18nPrefix}.scoringFewestTokens`) },
{ value: 'highest_rating', name: t(`${i18nPrefix}.scoringHighestRating`) },
{ value: 'custom', name: t(`${i18nPrefix}.scoringCustom`) },
]}
defaultValue={data.scoring_strategy || 'highest_rating'}
onSelect={item => setInputs(produce(inputs, (draft) => { (draft as any).scoring_strategy = item.value as string }))}
allowSearch={false}
/>
</Field>
</div>
<Split />
<div>
<OutputVars>
<>
<VarItem name='challenge_succeeded' type='boolean' description='Challenge succeeded' />
<VarItem name='judge_rating' type='number' description='Judge rating' />
<VarItem name='judge_feedback' type='string' description='Judge feedback' />
<VarItem name='message' type='string' description='Message' />
</>
</OutputVars>
</div>
</div>
)
}
export default memo(Panel)

View file

@ -0,0 +1,20 @@
import type { CommonNodeType, ValueSelector } from '@/app/components/workflow/types'
import { BlockEnum } from '@/app/components/workflow/types'
export type ChallengeEvaluatorNodeType = CommonNodeType<{
challenge_id?: string
evaluation_mode?: 'rules' | 'llm-judge' | 'custom'
success_type?: 'regex' | 'contains' | 'custom'
success_pattern?: string
scoring_strategy?: 'first' | 'fastest' | 'fewest_tokens' | 'highest_rating' | 'custom'
mask_variables?: string[]
inputs?: {
response?: ValueSelector
}
}>
export const DEFAULT_CHALLENGE_EVALUATOR_INPUTS: ChallengeEvaluatorNodeType['inputs'] = {
response: [],
}
export const CHALLENGE_EVALUATOR_BLOCK_TYPE = BlockEnum.ChallengeEvaluator

View file

@ -43,6 +43,12 @@ import DataSourcePanel from './data-source/panel'
import KnowledgeBaseNode from './knowledge-base/node'
import KnowledgeBasePanel from './knowledge-base/panel'
import { TransferMethod } from '@/types/app'
import ChallengeEvaluatorNode from './challenge-evaluator/node'
import ChallengeEvaluatorPanel from './challenge-evaluator/panel'
import JudgingLLMNode from './judging-llm/node'
import JudgingLLMPanel from './judging-llm/panel'
import TeamChallengeNode from './team-challenge/node'
import TeamChallengePanel from './team-challenge/panel'
export const NodeComponentMap: Record<string, ComponentType<any>> = {
[BlockEnum.Start]: StartNode,
@ -67,6 +73,11 @@ export const NodeComponentMap: Record<string, ComponentType<any>> = {
[BlockEnum.Agent]: AgentNode,
[BlockEnum.DataSource]: DataSourceNode,
[BlockEnum.KnowledgeBase]: KnowledgeBaseNode,
[BlockEnum.ChallengeEvaluator]: ChallengeEvaluatorNode,
[BlockEnum.JudgingLLM]: JudgingLLMNode,
[BlockEnum.TeamChallenge]: TeamChallengeNode,
// Legacy alias for renamed node
'prompt-challenge': ChallengeEvaluatorNode,
}
export const PanelComponentMap: Record<string, ComponentType<any>> = {
@ -92,6 +103,11 @@ export const PanelComponentMap: Record<string, ComponentType<any>> = {
[BlockEnum.Agent]: AgentPanel,
[BlockEnum.DataSource]: DataSourcePanel,
[BlockEnum.KnowledgeBase]: KnowledgeBasePanel,
[BlockEnum.ChallengeEvaluator]: ChallengeEvaluatorPanel,
[BlockEnum.JudgingLLM]: JudgingLLMPanel,
[BlockEnum.TeamChallenge]: TeamChallengePanel,
// Legacy alias for renamed node
'prompt-challenge': ChallengeEvaluatorPanel,
}
export const CUSTOM_NODE_TYPE = 'custom'

View file

@ -0,0 +1,45 @@
import type { NodeDefault } from '../../types'
import { genNodeMetaData } from '@/app/components/workflow/utils'
import { BlockEnum, VarType } from '@/app/components/workflow/types'
import { BlockClassificationEnum } from '@/app/components/workflow/block-selector/types'
import type { JudgingLLMNodeType } from './types'
import { DEFAULT_JUDGE_MODEL } from './types'
const metaData = genNodeMetaData({
classification: BlockClassificationEnum.Utilities,
sort: 4,
type: BlockEnum.JudgingLLM,
helpLinkUri: 'judging-llm',
})
const nodeDefault: NodeDefault<JudgingLLMNodeType> = {
metaData,
defaultValue: {
judge_model: DEFAULT_JUDGE_MODEL,
rubric_prompt_template: '',
rating_scale: 10,
pass_threshold: 7,
inputs: {
goal: [],
response: [],
},
},
getOutputVars() {
return [
{ variable: 'judge_passed', type: VarType.boolean },
{ variable: 'judge_rating', type: VarType.number },
{ variable: 'judge_feedback', type: VarType.string },
{ variable: 'judge_raw', type: VarType.object },
]
},
checkValid(payload: JudgingLLMNodeType, t: any) {
let errorMessages = ''
if (!payload.judge_model?.provider)
errorMessages = t('workflow.errorMsg.fieldRequired', { field: t('workflow.common.model') })
if (!errorMessages && !payload.rubric_prompt_template)
errorMessages = t('workflow.errorMsg.fieldRequired', { field: 'rubric_prompt_template' })
return { isValid: !errorMessages, errorMessage: errorMessages }
},
}
export default nodeDefault

View file

@ -0,0 +1,33 @@
import type { FC } from 'react'
import React from 'react'
import type { NodeProps } from '@/app/components/workflow/types'
import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
import { useTextGenerationCurrentProviderAndModelAndModelList } from '@/app/components/header/account-setting/model-provider-page/hooks'
import type { JudgingLLMNodeType } from './types'
const Node: FC<NodeProps<JudgingLLMNodeType>> = ({ data }) => {
const { judge_model, pass_threshold } = data
const hasSetModel = !!(judge_model?.provider && judge_model?.name)
const { textGenerationModelList } = useTextGenerationCurrentProviderAndModelAndModelList()
if (!hasSetModel)
return null
return (
<div className='mb-1 px-3 py-1'>
<div className='flex items-center gap-2'>
<ModelSelector
defaultModel={{ provider: judge_model!.provider, model: judge_model!.name }}
modelList={textGenerationModelList}
triggerClassName='!h-6 !rounded-md'
readonly
/>
<div className='rounded bg-components-badge-white-to-dark px-1 py-0.5 text-[10px] font-semibold uppercase text-text-tertiary'>
Pass {pass_threshold ?? 7}
</div>
</div>
</div>
)
}
export default React.memo(Node)

View file

@ -0,0 +1,143 @@
import type { FC } from 'react'
import { memo, useMemo } from 'react'
import { useTranslation } from 'react-i18next'
import type { NodePanelProps } from '@/app/components/workflow/types'
import Field from '@/app/components/workflow/nodes/_base/components/field'
import Split from '@/app/components/workflow/nodes/_base/components/split'
import VarReferencePicker from '@/app/components/workflow/nodes/_base/components/variable/var-reference-picker'
import OutputVars, { VarItem } from '@/app/components/workflow/nodes/_base/components/output-vars'
import ModelParameterModal from '@/app/components/header/account-setting/model-provider-page/model-parameter-modal'
import { fetchAndMergeValidCompletionParams } from '@/utils/completion-params'
import Toast from '@/app/components/base/toast'
import AddButton2 from '@/app/components/base/button/add-button'
import Editor from '@/app/components/workflow/nodes/_base/components/prompt/editor'
import useAvailableVarList from '@/app/components/workflow/nodes/_base/hooks/use-available-var-list'
import Input from '@/app/components/base/input'
import type { JudgingLLMNodeType } from './types'
import useNodeCrud from '@/app/components/workflow/nodes/_base/hooks/use-node-crud'
import produce from 'immer'
const Panel: FC<NodePanelProps<JudgingLLMNodeType>> = ({ id, data }) => {
const { t } = useTranslation()
const { inputs, setInputs } = useNodeCrud<JudgingLLMNodeType>(id, data)
const filterVar = useMemo(() => (_: any) => true, [])
const { availableVars, availableNodesWithParent } = useAvailableVarList(id, { onlyLeafNodeVar: false, filterVar })
return (
<div className='pt-2'>
<div className='space-y-4 px-4 pb-4'>
<Field title={t('workflow.common.model')} required>
<ModelParameterModal
popupClassName='!w-[387px]'
isInWorkflow
isAdvancedMode={true}
mode={data.judge_model?.mode}
provider={data.judge_model?.provider}
completionParams={data.judge_model?.completion_params}
modelId={data.judge_model?.name}
setModel={async (model: { provider: string; modelId: string; mode?: string }) => {
try {
const { params } = await fetchAndMergeValidCompletionParams(
model.provider,
model.modelId,
data.judge_model?.completion_params || {},
true,
)
setInputs(produce(inputs, (draft) => {
(draft as any).judge_model = {
provider: model.provider,
name: model.modelId,
mode: model.mode || 'chat',
completion_params: params,
}
}))
}
catch {
Toast.notify({ type: 'error', message: t('common.error') })
setInputs(produce(inputs, (draft) => {
(draft as any).judge_model = {
provider: model.provider,
name: model.modelId,
mode: model.mode || 'chat',
completion_params: {},
}
}))
}
}}
onCompletionParamsChange={newParams => setInputs(produce(inputs, (draft) => { (draft as any).judge_model.completion_params = newParams }))}
hideDebugWithMultipleModel
debugWithMultipleModel={false}
readonly={false}
/>
</Field>
<Field title='Rubric Template' required>
<div className='space-y-2'>
<Editor
title={<div className='text-xs font-semibold uppercase text-text-secondary'>system</div>}
value={data.rubric_prompt_template || ''}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).rubric_prompt_template = v }))}
readOnly={false}
isShowContext={false}
isChatApp
isChatModel
hasSetBlockStatus={{ history: false, query: false, context: false }}
nodesOutputVars={availableVars}
availableNodes={availableNodesWithParent}
isSupportFileVar
/>
<div className='flex items-center gap-2'>
<AddButton2 onClick={() => setInputs(produce(inputs, (draft) => { (draft as any).rubric_prompt_template = 'You are a strict evaluator. Given a goal and a model response, decide pass/fail, give a rating 0-10, and provide concise feedback.\\n\\nGoal:\\n{goal}\\n\\nResponse:\\n{response}\\n\\nReturn JSON: {"passed": boolean, "rating": number, "feedback": string}.' }))} />
<div className='system-xs-medium-uppercase text-text-tertiary'>Insert default rubric</div>
</div>
</div>
</Field>
<Field title='Pass Threshold'>
<Input
type='number'
wrapperClassName='w-full'
min={0}
max={data.rating_scale || 10}
value={data.pass_threshold ?? 7}
onChange={e => setInputs(produce(inputs, (draft) => { (draft as any).pass_threshold = Number(e.target.value) }))}
/>
</Field>
<Field title='Inputs'>
<div className='space-y-2'>
<div>
<div className='system-xs-medium-uppercase mb-1 text-text-tertiary'>Goal</div>
<VarReferencePicker
nodeId={id}
isShowNodeName
readonly={false}
value={data.inputs?.goal || []}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).inputs = { ...(draft as any).inputs, goal: v } }))}
/>
</div>
<div>
<div className='system-xs-medium-uppercase mb-1 text-text-tertiary'>Response</div>
<VarReferencePicker
nodeId={id}
isShowNodeName
readonly={false}
value={data.inputs?.response || []}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).inputs = { ...(draft as any).inputs, response: v } }))}
/>
</div>
</div>
</Field>
</div>
<Split />
<div>
<OutputVars>
<>
<VarItem name='judge_passed' type='boolean' description={t('workflow.nodes.judgingLLM.outputVars.judgePassed')} />
<VarItem name='judge_rating' type='number' description={t('workflow.nodes.judgingLLM.outputVars.judgeRating')} />
<VarItem name='judge_feedback' type='string' description={t('workflow.nodes.judgingLLM.outputVars.judgeFeedback')} />
</>
</OutputVars>
</div>
</div>
)
}
export default memo(Panel)

View file

@ -0,0 +1,24 @@
import type { CommonNodeType, ModelConfig, ValueSelector } from '@/app/components/workflow/types'
import { BlockEnum } from '@/app/components/workflow/types'
export type JudgingLLMNodeType = CommonNodeType<{
judge_model: ModelConfig
rubric_prompt_template: string
rating_scale?: number
pass_threshold?: number
inputs?: {
goal?: ValueSelector
response?: ValueSelector
}
}>
export const DEFAULT_JUDGE_MODEL: ModelConfig = {
provider: '',
name: '',
mode: 'chat',
completion_params: {
temperature: 0.3,
},
}
export const JUDGING_LLM_BLOCK_TYPE = BlockEnum.JudgingLLM

View file

@ -0,0 +1,42 @@
import type { NodeDefault } from '../../types'
import { genNodeMetaData } from '@/app/components/workflow/utils'
import { BlockEnum, VarType } from '@/app/components/workflow/types'
import { BlockClassificationEnum } from '@/app/components/workflow/block-selector/types'
import type { TeamChallengeNodeType } from './types'
const metaData = genNodeMetaData({
classification: BlockClassificationEnum.Utilities,
sort: 5,
type: BlockEnum.TeamChallenge,
helpLinkUri: 'team-challenge',
})
const nodeDefault: NodeDefault<TeamChallengeNodeType> = {
metaData,
defaultValue: {
defense_selection_policy: 'latest_best',
attack_selection_policy: 'latest_best',
scoring_strategy: 'red_blue_ratio',
inputs: {
team_choice: [],
attack_prompt: [],
defense_prompt: [],
},
},
getOutputVars() {
return [
{ variable: 'team', type: VarType.string },
{ variable: 'judge_passed', type: VarType.boolean },
{ variable: 'judge_rating', type: VarType.number },
{ variable: 'judge_feedback', type: VarType.string },
{ variable: 'categories', type: VarType.object },
{ variable: 'team_points', type: VarType.number },
{ variable: 'total_points', type: VarType.number },
]
},
checkValid(_payload: TeamChallengeNodeType) {
return { isValid: true }
},
}
export default nodeDefault

View file

@ -0,0 +1,25 @@
import type { FC } from 'react'
import React from 'react'
import type { NodeProps } from '@/app/components/workflow/types'
import type { TeamChallengeNodeType } from './types'
const Node: FC<NodeProps<TeamChallengeNodeType>> = ({ data }) => {
const { red_blue_challenge_id, defense_selection_policy, attack_selection_policy } = data
return (
<div className='mb-1 px-3 py-1'>
{red_blue_challenge_id ? (
<div className='flex items-center gap-2'>
<div className='rounded bg-components-badge-white-to-dark px-1 py-0.5 text-[10px] font-semibold uppercase text-text-tertiary'>Red/Blue</div>
<div className='truncate text-xs text-text-secondary' title={red_blue_challenge_id}>{red_blue_challenge_id}</div>
</div>
) : (
<div className='flex items-center gap-2'>
<div className='rounded bg-components-badge-white-to-dark px-1 py-0.5 text-[10px] font-semibold uppercase text-text-tertiary'>Defense: {defense_selection_policy}</div>
<div className='rounded bg-components-badge-white-to-dark px-1 py-0.5 text-[10px] font-semibold uppercase text-text-tertiary'>Attack: {attack_selection_policy}</div>
</div>
)}
</div>
)
}
export default React.memo(Node)

View file

@ -0,0 +1,106 @@
import type { FC } from 'react'
import { memo } from 'react'
import { useTranslation } from 'react-i18next'
import type { NodePanelProps } from '@/app/components/workflow/types'
import Field from '@/app/components/workflow/nodes/_base/components/field'
import Split from '@/app/components/workflow/nodes/_base/components/split'
import VarReferencePicker from '@/app/components/workflow/nodes/_base/components/variable/var-reference-picker'
import OutputVars, { VarItem } from '@/app/components/workflow/nodes/_base/components/output-vars'
import type { TeamChallengeNodeType } from './types'
import useNodeCrud from '@/app/components/workflow/nodes/_base/hooks/use-node-crud'
import produce from 'immer'
import useSWR from 'swr'
import { fetchRedBlueChallenges } from '@/service/redBlueChallenges'
import Select from '@/app/components/base/select'
const i18nPrefix = 'workflow.nodes.teamChallenge'
const Panel: FC<NodePanelProps<TeamChallengeNodeType>> = ({ id, data }) => {
const { t } = useTranslation()
const { inputs, setInputs } = useNodeCrud<TeamChallengeNodeType>(id, data)
const { data: redBlue } = useSWR('redBlue:list', fetchRedBlueChallenges)
return (
<div className='pt-2'>
<div className='space-y-4 px-4 pb-4'>
<Field title={t(`${i18nPrefix}.selectedChallenge`)} tooltip={t(`${i18nPrefix}.selectedChallengeTip`)}>
<Select
items={(redBlue || []).map((c: any) => ({ value: c.id, name: c.name }))}
defaultValue={data.red_blue_challenge_id || ''}
onSelect={item => setInputs(produce(inputs, (draft) => { (draft as any).red_blue_challenge_id = (item?.value as string) || undefined }))}
allowSearch={false}
/>
</Field>
<Field title={t(`${i18nPrefix}.defenseSelectionPolicy`)} tooltip={t(`${i18nPrefix}.defenseSelectionPolicyTip`)}>
<Select
items={[
{ value: 'latest_best', name: 'latest_best' },
{ value: 'random_active', name: 'random_active' },
{ value: 'round_robin', name: 'round_robin' },
{ value: 'request_new_if_none', name: 'request_new_if_none' },
]}
defaultValue={data.defense_selection_policy || 'latest_best'}
onSelect={item => setInputs(produce(inputs, (draft) => { (draft as any).defense_selection_policy = item.value as string }))}
allowSearch={false}
/>
</Field>
<Field title={t(`${i18nPrefix}.attackSelectionPolicy`)} tooltip={t(`${i18nPrefix}.attackSelectionPolicyTip`)}>
<Select
items={[
{ value: 'latest_best', name: 'latest_best' },
{ value: 'random_active', name: 'random_active' },
{ value: 'round_robin', name: 'round_robin' },
{ value: 'request_new_if_none', name: 'request_new_if_none' },
]}
defaultValue={data.attack_selection_policy || 'latest_best'}
onSelect={item => setInputs(produce(inputs, (draft) => { (draft as any).attack_selection_policy = item.value as string }))}
allowSearch={false}
/>
</Field>
<Field title={t(`${i18nPrefix}.teamChoiceVar`)} tooltip={t(`${i18nPrefix}.teamChoiceVarTip`)}>
<VarReferencePicker
nodeId={id}
isShowNodeName
readonly={false}
value={data.inputs?.team_choice || []}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).inputs = { ...(draft as any).inputs, team_choice: v } }))}
/>
</Field>
<Field title={t(`${i18nPrefix}.attackPromptVar`)} tooltip={t(`${i18nPrefix}.attackPromptVarTip`)}>
<VarReferencePicker
nodeId={id}
isShowNodeName
readonly={false}
value={data.inputs?.attack_prompt || []}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).inputs = { ...(draft as any).inputs, attack_prompt: v } }))}
/>
</Field>
<Field title={t(`${i18nPrefix}.defensePromptVar`)} tooltip={t(`${i18nPrefix}.defensePromptVarTip`)}>
<VarReferencePicker
nodeId={id}
isShowNodeName
readonly={false}
value={data.inputs?.defense_prompt || []}
onChange={v => setInputs(produce(inputs, (draft) => { (draft as any).inputs = { ...(draft as any).inputs, defense_prompt: v } }))}
/>
</Field>
</div>
<Split />
<div>
<OutputVars>
<>
<VarItem name='team' type='string' description='Team' />
<VarItem name='judge_passed' type='boolean' description='Judge passed' />
<VarItem name='judge_rating' type='number' description='Judge rating' />
<VarItem name='judge_feedback' type='string' description='Judge feedback' />
<VarItem name='categories' type='object' description='Category outcomes' />
<VarItem name='team_points' type='number' description='Team points' />
<VarItem name='total_points' type='number' description='Total points' />
</>
</OutputVars>
</div>
</div>
)
}
export default memo(Panel)

View file

@ -0,0 +1,16 @@
import type { CommonNodeType, ValueSelector } from '@/app/components/workflow/types'
import { BlockEnum } from '@/app/components/workflow/types'
export type TeamChallengeNodeType = CommonNodeType<{
red_blue_challenge_id?: string
defense_selection_policy?: 'latest_best' | 'random_active' | 'round_robin' | 'request_new_if_none'
attack_selection_policy?: 'latest_best' | 'random_active' | 'round_robin' | 'request_new_if_none'
scoring_strategy?: 'red_blue_ratio' | 'custom'
inputs?: {
team_choice?: ValueSelector
attack_prompt?: ValueSelector
defense_prompt?: ValueSelector
}
}>
export const TEAM_CHALLENGE_BLOCK_TYPE = BlockEnum.TeamChallenge

View file

@ -50,6 +50,9 @@ export enum BlockEnum {
DataSource = 'datasource',
DataSourceEmpty = 'datasource-empty',
KnowledgeBase = 'knowledge-index',
ChallengeEvaluator = 'challenge-evaluator',
JudgingLLM = 'judging-llm',
TeamChallenge = 'team-challenge',
}
export enum ControlMode {

View file

@ -0,0 +1,191 @@
'use client'
import { useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useParams } from 'next/navigation'
import { RiLoader4Line, RiShieldLine, RiSwordLine } from '@remixicon/react'
import { fetchRedBlueLeaderboard, submitRedBluePrompt } from '@/service/redBlueChallenges'
import Button from '@/app/components/base/button'
import Textarea from '@/app/components/base/textarea'
import Toast from '@/app/components/base/toast'
export default function RedBlueChallengeDetailPage() {
const { t } = useTranslation()
const params = useParams()
const id = params?.id as string
const [team, setTeam] = useState<'red' | 'blue' | null>(null)
const [prompt, setPrompt] = useState('')
const [submitting, setSubmitting] = useState(false)
const [lastResult, setLastResult] = useState<any>(null)
const [leaderboard, setLeaderboard] = useState<any>(null)
useEffect(() => {
const load = async () => {
try {
const leaders = await fetchRedBlueLeaderboard(id)
setLeaderboard(leaders)
}
catch (e: any) {
console.error('Failed to load leaderboard:', e)
}
}
if (id)
load()
}, [id])
const handleSubmit = async () => {
if (!team || !prompt.trim()) {
Toast.notify({ type: 'error', message: 'Please choose a team and enter a prompt' })
return
}
setSubmitting(true)
setLastResult(null)
try {
const result = await submitRedBluePrompt(id, team, prompt)
setLastResult(result)
Toast.notify({ type: 'success', message: 'Prompt submitted!' })
setPrompt('')
// Refresh leaderboard
const leaders = await fetchRedBlueLeaderboard(id)
setLeaderboard(leaders)
}
catch (e: any) {
Toast.notify({ type: 'error', message: e.message || 'Submission failed' })
}
finally {
setSubmitting(false)
}
}
return (
<div className='min-h-screen bg-components-panel-bg'>
<div className='mx-auto max-w-5xl px-4 py-12 sm:px-6 lg:px-8'>
<div className='mb-8 text-center'>
<h1 className='mb-2 text-3xl font-bold text-text-primary'>{t('challenges.redBlue.title')}</h1>
<p className='text-lg text-text-secondary'>Join the Red or Blue team and compete</p>
</div>
{!team ? (
<div className='grid gap-6 sm:grid-cols-2'>
<button
onClick={() => setTeam('red')}
className='group rounded-xl border-2 border-util-colors-red-red-300 bg-util-colors-red-red-50 p-8 shadow-xs transition-all hover:border-util-colors-red-red-500 hover:shadow-md'
>
<RiSwordLine className='mx-auto mb-4 h-16 w-16 text-util-colors-red-red-600' />
<h2 className='mb-2 text-2xl font-bold text-util-colors-red-red-700'>
{t('challenges.redBlue.red')}
</h2>
<p className='text-util-colors-red-red-600'>{t('challenges.redBlue.redDesc')}</p>
</button>
<button
onClick={() => setTeam('blue')}
className='group rounded-xl border-2 border-util-colors-blue-blue-300 bg-util-colors-blue-blue-50 p-8 shadow-xs transition-all hover:border-util-colors-blue-blue-500 hover:shadow-md'
>
<RiShieldLine className='mx-auto mb-4 h-16 w-16 text-util-colors-blue-blue-600' />
<h2 className='mb-2 text-2xl font-bold text-util-colors-blue-blue-700'>
{t('challenges.redBlue.blue')}
</h2>
<p className='text-util-colors-blue-blue-600'>{t('challenges.redBlue.blueDesc')}</p>
</button>
</div>
) : (
<div className='grid gap-6 lg:grid-cols-3'>
<div className='lg:col-span-2'>
<div className={`rounded-xl border-2 p-6 shadow-xs ${team === 'red' ? 'border-util-colors-red-red-300 bg-util-colors-red-red-50' : 'border-util-colors-blue-blue-300 bg-util-colors-blue-blue-50'}`}>
<div className='mb-4 flex items-center justify-between'>
<h2 className={`text-xl font-bold ${team === 'red' ? 'text-util-colors-red-red-700' : 'text-util-colors-blue-blue-700'}`}>
{team === 'red' ? t('challenges.redBlue.submitAttack') : t('challenges.redBlue.submitDefense')}
</h2>
<Button
size='small'
onClick={() => {
setTeam(null)
setPrompt('')
setLastResult(null)
}}
>
Switch Team
</Button>
</div>
<Textarea
value={prompt}
onChange={e => setPrompt(e.target.value)}
placeholder={team === 'red' ? t('challenges.redBlue.attackPrompt') : t('challenges.redBlue.defensePrompt')}
rows={8}
className='mb-4 w-full'
/>
<Button
type='primary'
onClick={handleSubmit}
loading={submitting}
disabled={!prompt.trim()}
className='w-full'
>
{submitting ? (
<>
<RiLoader4Line className='mr-2 h-4 w-4 animate-spin' />
{t('common.operation.processing')}
</>
) : (
t('challenges.player.submit')
)}
</Button>
{lastResult && (
<div className='mt-4 rounded-lg border border-divider-subtle bg-components-panel-bg p-4'>
<h3 className='mb-2 font-medium text-text-primary'>{t('challenges.redBlue.results')}</h3>
{lastResult.judge_rating !== undefined && (
<div className='text-sm text-text-secondary'>
{t('challenges.leaderboard.rating')}: {lastResult.judge_rating}/10
</div>
)}
{lastResult.team_points !== undefined && (
<div className='mt-1 text-sm text-text-secondary'>
Points earned: {lastResult.team_points}
</div>
)}
{lastResult.judge_feedback && (
<div className='mt-2 text-sm text-text-tertiary'>{lastResult.judge_feedback}</div>
)}
</div>
)}
</div>
</div>
<div className='lg:col-span-1'>
{leaderboard && (
<div className='rounded-xl border border-divider-subtle bg-components-panel-bg p-6 shadow-xs'>
<h3 className='mb-4 text-lg font-semibold text-text-primary'>Standings</h3>
<div className='space-y-3'>
<div className='flex items-center justify-between rounded-lg bg-util-colors-red-red-50 p-3'>
<span className='font-medium text-util-colors-red-red-700'>
{t('challenges.redBlue.redPoints')}
</span>
<span className='text-xl font-bold text-util-colors-red-red-700'>
{leaderboard.red_points || 0}
</span>
</div>
<div className='flex items-center justify-between rounded-lg bg-util-colors-blue-blue-50 p-3'>
<span className='font-medium text-util-colors-blue-blue-700'>
{t('challenges.redBlue.bluePoints')}
</span>
<span className='text-xl font-bold text-util-colors-blue-blue-700'>
{leaderboard.blue_points || 0}
</span>
</div>
</div>
</div>
)}
</div>
</div>
)}
</div>
</div>
)
}

View file

@ -0,0 +1,23 @@
import Link from 'next/link'
import { fetchRedBlueChallenges } from '@/service/redBlueChallenges'
export default async function RedBlueChallengesPage() {
const items = await fetchRedBlueChallenges()
return (
<div className="px-6 py-8">
<h1 className="mb-4 text-xl font-semibold">Red / Blue Challenges</h1>
<ul className="space-y-2">
{items.map(i => (
<li key={i.id} className="rounded border p-3">
<Link href={`/red-blue-challenges/${i.id}`} className="text-primary hover:underline">
{i.name}
</Link>
{i.description && (
<p className="mt-1 text-sm text-gray-500">{i.description}</p>
)}
</li>
))}
</ul>
</div>
)
}

View file

@ -23,6 +23,7 @@ const NAMESPACES = [
'app-overview',
'app',
'billing',
'challenges',
'common',
'custom',
'dataset-creation',

View file

@ -0,0 +1,80 @@
export default {
console: {
title: 'Challenges',
create: 'Create Challenge',
createRedBlue: 'Create Red/Blue Challenge',
edit: 'Edit Challenge',
empty: 'No challenges yet',
emptyDesc: 'Create your first challenge to get started',
form: {
name: 'Name',
namePlaceholder: 'Enter challenge name',
description: 'Description',
descriptionPlaceholder: 'Describe the challenge',
goal: 'Goal',
goalPlaceholder: 'What should players achieve?',
appId: 'App ID',
workflowId: 'Workflow ID',
evaluatorType: 'Evaluator Type',
successType: 'Success Type',
successPattern: 'Success Pattern',
scoringStrategy: 'Scoring Strategy',
isActive: 'Active',
judgeSuite: 'Judge Suite',
defensePolicy: 'Defense Selection Policy',
attackPolicy: 'Attack Selection Policy',
},
actions: {
activate: 'Activate',
deactivate: 'Deactivate',
edit: 'Edit',
delete: 'Delete',
deleteConfirm: 'Are you sure you want to delete this challenge?',
},
status: {
active: 'Active',
inactive: 'Inactive',
},
},
player: {
title: 'Challenges',
browse: 'Browse Challenges',
play: 'Play Challenge',
submit: 'Submit',
tryAgain: 'Try Again',
viewLeaderboard: 'View Leaderboard',
yourAttempt: 'Your Attempt',
goal: 'Goal',
status: {
success: 'Success!',
failed: 'Failed',
pending: 'Pending',
},
},
leaderboard: {
title: 'Leaderboard',
rank: 'Rank',
player: 'Player',
score: 'Score',
time: 'Time',
tokens: 'Tokens',
rating: 'Rating',
empty: 'No attempts yet',
yourBest: 'Your Best',
},
redBlue: {
title: 'Red vs Blue',
chooseTeam: 'Choose Your Team',
red: 'Red Team',
blue: 'Blue Team',
redDesc: 'Attack: Try to bypass defenses',
blueDesc: 'Defense: Prevent attacks',
submitAttack: 'Submit Attack',
submitDefense: 'Submit Defense',
attackPrompt: 'Attack Prompt',
defensePrompt: 'Defense Prompt',
results: 'Results',
redPoints: 'Red Points',
bluePoints: 'Blue Points',
},
}

View file

@ -270,6 +270,9 @@ const translation = {
'loop-end': 'Exit Loop',
'knowledge-index': 'Knowledge Base',
'datasource': 'Data Source',
'challenge-evaluator': 'Challenge Evaluator',
'judging-llm': 'Judging LLM',
'team-challenge': 'Team Challenge',
},
blocksAbout: {
'start': 'Define the initial parameters for launching a workflow',
@ -294,6 +297,9 @@ const translation = {
'agent': 'Invoking large language models to answer questions or process natural language',
'knowledge-index': 'Knowledge Base About',
'datasource': 'Data Source About',
'challenge-evaluator': 'Evaluate a model response against success rules or an LLM judge and record attempts.',
'judging-llm': 'Judge a response with a rubric using an LLM and output pass/rating/feedback.',
'team-challenge': 'Coordinate Red/Blue prompts, run judging, and emit team scores.',
},
operator: {
zoomIn: 'Zoom In',
@ -864,6 +870,69 @@ const translation = {
last_record: 'Last record',
},
},
challengeEvaluator: {
evaluationMode: 'Evaluation Mode',
evaluationModeTip: 'Choose how the response is evaluated: rules, LLM judge, or custom.',
successType: 'Success Type',
successTypeTip: 'Contains: substring match (case-insensitive). Regex: JavaScript regular expression.',
successPattern: 'Success Pattern',
successPatternPlaceholder: 'Enter regex or substring',
successPatternTip: 'Supports variables. For regex, do not add surrounding slashes.',
responseVar: 'Response Variable',
responseVarTip: 'Pick the upstream response to evaluate.',
selectedChallenge: 'Selected Challenge',
selectedChallengeTip: 'Select an existing challenge to use its stored rules and settings.',
scoringStrategy: 'Scoring Strategy',
scoringStrategyTip: 'How to rank successful attempts on the leaderboard.',
scoringFirst: 'First (earliest success)',
scoringFastest: 'Fastest (lowest time)',
scoringFewestTokens: 'Fewest Tokens',
scoringHighestRating: 'Highest Rating',
scoringCustom: 'Custom',
outputVars: {
challengeSucceeded: 'Challenge Succeeded',
judgeRating: 'Judge Rating',
judgeFeedback: 'Judge Feedback',
message: 'Result Message',
},
},
judgingLLM: {
rubricTemplate: 'Rubric Template',
rubricTemplatePlaceholder: 'Define your evaluation criteria',
rubricTemplateTip: 'Template for the judge. Use {goal} and {response} placeholders.',
passThreshold: 'Pass Threshold',
passThresholdTip: 'Minimum rating (0-10) required to pass.',
insertDefaultRubric: 'Insert default rubric',
outputVars: {
judgePassed: 'Judge Passed',
judgeRating: 'Judge Rating',
judgeFeedback: 'Judge Feedback',
judgeRaw: 'Judge Raw Output',
},
},
teamChallenge: {
defenseSelectionPolicy: 'Defense Selection Policy',
defenseSelectionPolicyTip: 'Choose how a defense prompt is selected to pair against incoming attacks.',
attackSelectionPolicy: 'Attack Selection Policy',
attackSelectionPolicyTip: 'Choose how an attack prompt is selected to test your defense.',
teamChoiceVar: 'Team Choice Variable',
teamChoiceVarTip: 'Variable that yields "red" or "blue" to select the role.',
attackPromptVar: 'Attack Prompt Variable',
attackPromptVarTip: 'Variable that provides the attacker\'s prompt.',
defensePromptVar: 'Defense Prompt Variable',
defensePromptVarTip: 'Variable that provides the defender\'s prompt (system prompt).',
selectedChallenge: 'Selected Challenge',
selectedChallengeTip: 'Pick a Red/Blue challenge definition to orchestrate evaluations.',
outputVars: {
team: 'Team',
judgePassed: 'Judge Passed',
judgeRating: 'Judge Rating',
judgeFeedback: 'Judge Feedback',
categories: 'Categories',
teamPoints: 'Team Points',
totalPoints: 'Total Points',
},
},
agent: {
strategy: {
label: 'Agentic Strategy',

View file

@ -19,7 +19,7 @@
"and_qq >= 14.9"
],
"scripts": {
"dev": "cross-env NODE_OPTIONS='--inspect' next dev --turbopack",
"dev": "cross-env NODE_OPTIONS='' next dev --turbopack",
"build": "next build",
"build:docker": "next build && node scripts/optimize-standalone.js",
"start": "cp -r .next/static .next/standalone/.next/static && cp -r public .next/standalone/public && cross-env PORT=$npm_config_port HOSTNAME=$npm_config_host node .next/standalone/server.js",

1
web/run.sh Normal file
View file

@ -0,0 +1 @@
pnpm run dev

104
web/service/challenges.ts Normal file
View file

@ -0,0 +1,104 @@
import { getPublic, postPublic } from './base'
import { PUBLIC_API_PREFIX } from '@/config'
import { getInitialTokenV2, isTokenV1 } from '@/app/components/share/utils'
import { CONVERSATION_ID_INFO } from '@/app/components/base/chat/constants'
export type ChallengeListItem = {
id: string
name: string
description?: string
goal?: string
app_id?: string
workflow_id?: string
app_mode?: string
app_site_code?: string
}
export async function fetchChallenges(): Promise<ChallengeListItem[]> {
const res = await getPublic<{ result: string; data: ChallengeListItem[] }>('/challenges')
return res.data ?? []
}
export async function fetchChallengeDetail(id: string) {
const res = await getPublic<{ result: string; data: any }>(`/challenges/${id}`)
return res.data
}
export async function fetchChallengeLeaderboard(id: string) {
const res = await getPublic<{ result: string; data: any[] }>(`/challenges/${id}/leaderboard`)
return res.data ?? []
}
export async function submitChallengeAttempt(
challengeId: string,
appId: string,
appSiteCode: string | undefined,
appMode: string,
userInput: string,
) {
if (!appSiteCode)
throw new Error('Challenge app is not published. Please enable the app site for this challenge.')
const passportRes = await fetch(`${PUBLIC_API_PREFIX}/passport`, {
method: 'GET',
headers: {
'X-App-Code': appSiteCode,
},
credentials: 'include',
})
if (!passportRes.ok) {
let message = 'Unable to start challenge. Please try again.'
try {
const data = await passportRes.json()
message = data?.message || message
}
catch { /* ignore json parse errors */ }
throw new Error(message)
}
const passportData = await passportRes.json() as { access_token?: string }
const accessToken = passportData?.access_token
if (!accessToken)
throw new Error('Challenge authorization failed. Please refresh and try again.')
// Persist token using the same structure expected by getAccessToken(true)
const storageKey = 'token'
const userKey = 'DEFAULT'
const rawTokenStore = localStorage.getItem(storageKey) || JSON.stringify(getInitialTokenV2())
let tokenStore: Record<string, any>
try {
const parsed = JSON.parse(rawTokenStore)
tokenStore = isTokenV1(parsed) ? getInitialTokenV2() : parsed
}
catch {
tokenStore = getInitialTokenV2()
}
tokenStore[challengeId] = {
...(tokenStore[challengeId] || {}),
[userKey]: accessToken,
}
localStorage.setItem(storageKey, JSON.stringify(tokenStore))
localStorage.removeItem(CONVERSATION_ID_INFO)
if (appMode === 'chat' || appMode === 'advanced-chat' || appMode === 'agent-chat') {
return await postPublic<any>('/chat-messages', {
body: {
query: userInput,
inputs: {},
response_mode: 'blocking',
conversation_id: '',
},
})
}
return await postPublic<any>('/workflows/run', {
body: {
inputs: {
user_prompt: userInput,
},
response_mode: 'blocking',
},
})
}

View file

@ -0,0 +1,99 @@
import { request } from '@/service/base'
export type ConsoleChallenge = {
id: string
name: string
description?: string
goal?: string
is_active?: boolean
success_type?: string
success_pattern?: string
scoring_strategy?: string
app_id?: string
workflow_id?: string
}
export async function listConsoleChallenges() {
const resp = await request<{ data: ConsoleChallenge[] }>('/challenges', {}, {})
return resp.data
}
export async function createConsoleChallenge(payload: {
app_id: string
workflow_id?: string
name: string
description?: string
goal?: string
success_type?: string
success_pattern?: string
scoring_strategy?: string
is_active?: boolean
}) {
const resp = await request<{ data: { id: string } }>('/challenges', {
method: 'POST',
body: payload,
}, {})
return resp.data
}
export async function updateConsoleChallenge(id: string, payload: Partial<ConsoleChallenge>) {
const resp = await request<{ data: ConsoleChallenge }>(`/challenges/${id}`, {
method: 'PATCH',
body: payload,
}, {})
return resp.data
}
export async function deleteConsoleChallenge(id: string) {
await request(`/challenges/${id}`, {
method: 'DELETE',
}, {})
}
export type RedBlueChallenge = {
id: string
name: string
description?: string
judge_suite?: string[]
defense_selection_policy?: string
attack_selection_policy?: string
scoring_strategy?: string
is_active?: boolean
}
export async function listRedBlueChallenges() {
const resp = await request<{ data: RedBlueChallenge[] }>('/red-blue-challenges', {}, {})
return resp.data
}
export async function createRedBlueChallenge(payload: {
app_id: string
workflow_id?: string
name: string
description?: string
judge_suite?: string[]
defense_selection_policy?: string
attack_selection_policy?: string
scoring_strategy?: string
is_active?: boolean
}) {
const resp = await request<{ data: { id: string } }>('/red-blue-challenges', {
method: 'POST',
body: payload,
}, {})
return resp.data
}
export async function updateRedBlueChallenge(id: string, payload: Partial<RedBlueChallenge>) {
const resp = await request<{ data: RedBlueChallenge }>(`/red-blue-challenges/${id}`, {
method: 'PATCH',
body: payload,
}, {})
return resp.data
}
export async function deleteRedBlueChallenge(id: string) {
await request(`/red-blue-challenges/${id}`, {
method: 'DELETE',
}, {})
}

View file

@ -0,0 +1,27 @@
import { request } from '@/service/base'
export type ConsoleRedBlueChallenge = {
id: string
name: string
description?: string
is_active?: boolean
}
export async function listConsoleRedBlueChallenges() {
const resp = await request<{ data: ConsoleRedBlueChallenge[] }>('/red-blue-challenges', {}, {})
return resp.data
}
export async function createConsoleRedBlueChallenge(payload: {
tenant_id: string
app_id: string
name: string
description?: string
judge_suite: Record<string, any>
}) {
const resp = await request<{ data: { id: string } }>('/red-blue-challenges', {
method: 'POST',
body: JSON.stringify(payload),
}, {})
return resp.data
}

View file

@ -0,0 +1,24 @@
import { getPublic, postPublic } from './base'
export type RedBlueListItem = {
id: string
name: string
description?: string
}
export async function fetchRedBlueChallenges(): Promise<RedBlueListItem[]> {
const res = await getPublic<{ result: string; data: RedBlueListItem[] }>('/red-blue-challenges')
return res.data ?? []
}
export async function fetchRedBlueLeaderboard(id: string) {
const res = await getPublic<{ result: string; data: any }>(`/red-blue-challenges/${id}/leaderboard`)
return res.data
}
export async function submitRedBluePrompt(id: string, team: 'red' | 'blue', prompt: string) {
const res = await postPublic<{ result: string; data: any }>(`/red-blue-challenges/${id}/submit`, {
body: { team, prompt },
})
return res.data
}