diff --git a/api/controllers/console/__init__.py b/api/controllers/console/__init__.py
index 621f5066e..e44091d17 100644
--- a/api/controllers/console/__init__.py
+++ b/api/controllers/console/__init__.py
@@ -129,6 +129,10 @@ from .workspace import (
     workspace,
 )
 
+# Import custom challenge controllers
+from . import challenges as challenges
+from . import red_blue_challenges as red_blue_challenges
+
 api.add_namespace(console_ns)
 
 __all__ = [
@@ -204,4 +208,6 @@ __all__ = [
     "workflow_run",
     "workflow_statistic",
     "workspace",
+    "challenges",
+    "red_blue_challenges",
 ]
diff --git a/api/controllers/console/app/app.py b/api/controllers/console/app/app.py
index 23b8e2c5a..e1e161635 100644
--- a/api/controllers/console/app/app.py
+++ b/api/controllers/console/app/app.py
@@ -12,7 +12,6 @@ from controllers.console.app.wraps import get_app_model
 from controllers.console.wraps import (
     account_initialization_required,
     cloud_edition_billing_resource_check,
-    enterprise_license_required,
     setup_required,
 )
 from core.ops.ops_trace_manager import OpsTraceManager
@@ -53,7 +52,6 @@ class AppListApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def get(self):
         """Get app list"""
 
@@ -166,7 +164,6 @@ class AppApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     @get_app_model
     @marshal_with(app_detail_fields_with_site)
     def get(self, app_model):
diff --git a/api/controllers/console/challenges.py b/api/controllers/console/challenges.py
new file mode 100644
index 000000000..3d3dcff22
--- /dev/null
+++ b/api/controllers/console/challenges.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+from flask_restx import Resource, reqparse
+
+from controllers.console import console_ns as api
+from controllers.console.wraps import (
+    account_initialization_required,
+    setup_required,
+)
+from extensions.ext_database import db
+from libs.login import current_user, login_required
+from models.challenge import Challenge
+
+
+@api.route("/challenges")
+class ChallengeListCreateApi(Resource):
+    @api.doc("list_challenges")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self):
+        tenant_id = current_user.current_tenant_id
+        if not tenant_id:
+            # no active workspace selected; return empty list to avoid leaking data
+            return {"result": "success", "data": []}
+        rows = (
+            db.session.query(Challenge)
+            .filter(Challenge.tenant_id == tenant_id)
+            .order_by(Challenge.created_at.desc())
+            .all()
+        )
+        return {
+            "result": "success",
+            "data": [
+                {
+                    "id": r.id,
+                    "name": r.name,
+                    "description": r.description,
+                    "goal": r.goal,
+                    "is_active": r.is_active,
+                    "success_type": r.success_type,
+                    "success_pattern": r.success_pattern,
+                    "scoring_strategy": r.scoring_strategy,
+                    "app_id": r.app_id,
+                    "workflow_id": r.workflow_id,
+                }
+                for r in rows
+            ],
+        }
+
+    @api.doc("create_challenge")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def post(self):
+        parser = reqparse.RequestParser()
+        parser.add_argument("tenant_id", type=str, required=False, location="json")
+        parser.add_argument("app_id", type=str, required=True, location="json")
+        parser.add_argument("workflow_id", type=str, required=False, location="json")
+        parser.add_argument("name", type=str, required=True, location="json")
+        parser.add_argument("description", type=str, required=False, location="json")
+        parser.add_argument("goal", type=str, required=False, location="json")
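+        # Illustrative create payload (values assumed; the strategy names are
+        # the ones the web leaderboard understands):
+        #   {"app_id": "<app uuid>", "name": "Find the flag",
+        #    "success_type": "regex", "success_pattern": "FLAG\\{.+\\}",
+        #    "scoring_strategy": "fastest"}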
parser.add_argument("success_type", type=str, required=False, location="json") + parser.add_argument("success_pattern", type=str, required=False, location="json") + parser.add_argument("scoring_strategy", type=str, required=False, location="json") + parser.add_argument("is_active", type=bool, required=False, location="json") + args = parser.parse_args() + + c = Challenge() + c.tenant_id = args.get("tenant_id") or current_user.current_tenant_id + c.app_id = args["app_id"] + # Convert empty string to None for UUID field + workflow_id = args.get("workflow_id") + c.workflow_id = workflow_id if workflow_id else None + c.name = args["name"] + c.description = args.get("description") + c.goal = args.get("goal") + if args.get("success_type"): + c.success_type = args["success_type"] + c.success_pattern = args.get("success_pattern") + if args.get("scoring_strategy"): + c.scoring_strategy = args["scoring_strategy"] + if args.get("is_active") is not None: + c.is_active = args["is_active"] + db.session.add(c) + db.session.commit() + return {"result": "success", "data": {"id": c.id}}, 201 + + +@api.route("/challenges/") +class ChallengeDetailApi(Resource): + @api.doc("get_challenge") + @setup_required + @login_required + @account_initialization_required + def get(self, challenge_id): + c = db.session.get(Challenge, str(challenge_id)) + if not c: + return {"result": "not_found"}, 404 + return { + "result": "success", + "data": { + "id": c.id, + "name": c.name, + "description": c.description, + "goal": c.goal, + "is_active": c.is_active, + "success_type": c.success_type, + "success_pattern": c.success_pattern, + "scoring_strategy": c.scoring_strategy, + "app_id": c.app_id, + "workflow_id": c.workflow_id, + }, + } + + @api.doc("update_challenge") + @setup_required + @login_required + @account_initialization_required + def patch(self, challenge_id): + c = db.session.get(Challenge, str(challenge_id)) + if not c: + return {"result": "not_found"}, 404 + parser = reqparse.RequestParser() + parser.add_argument("name", type=str, required=False, location="json") + parser.add_argument("description", type=str, required=False, location="json") + parser.add_argument("goal", type=str, required=False, location="json") + parser.add_argument("is_active", type=bool, required=False, location="json") + args = parser.parse_args() + if args.get("name"): + c.name = args["name"] + if args.get("description") is not None: + c.description = args["description"] + if args.get("goal") is not None: + c.goal = args["goal"] + if args.get("is_active") is not None: + c.is_active = bool(args["is_active"]) + db.session.commit() + return {"result": "success"} + + @api.doc("delete_challenge") + @setup_required + @login_required + @account_initialization_required + def delete(self, challenge_id): + c = db.session.get(Challenge, str(challenge_id)) + if not c: + return {"result": "not_found"}, 404 + db.session.delete(c) + db.session.commit() + return {"result": "success"}, 204 + + diff --git a/api/controllers/console/datasets/metadata.py b/api/controllers/console/datasets/metadata.py index dc3cd3fce..1a96cadc6 100644 --- a/api/controllers/console/datasets/metadata.py +++ b/api/controllers/console/datasets/metadata.py @@ -5,7 +5,7 @@ from flask_restx import Resource, marshal_with, reqparse from werkzeug.exceptions import NotFound from controllers.console import console_ns -from controllers.console.wraps import account_initialization_required, enterprise_license_required, setup_required +from controllers.console.wraps import 
 from fields.dataset_fields import dataset_metadata_fields
 from libs.login import login_required
 from services.dataset_service import DatasetService
@@ -21,7 +21,6 @@ class DatasetMetadataCreateApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     @marshal_with(dataset_metadata_fields)
     def post(self, dataset_id):
         parser = reqparse.RequestParser()
@@ -42,7 +41,6 @@ class DatasetMetadataCreateApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def get(self, dataset_id):
         dataset_id_str = str(dataset_id)
         dataset = DatasetService.get_dataset(dataset_id_str)
@@ -56,7 +54,6 @@ class DatasetMetadataApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     @marshal_with(dataset_metadata_fields)
     def patch(self, dataset_id, metadata_id):
         parser = reqparse.RequestParser()
@@ -77,7 +74,6 @@ class DatasetMetadataApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def delete(self, dataset_id, metadata_id):
         dataset_id_str = str(dataset_id)
         metadata_id_str = str(metadata_id)
@@ -95,7 +91,6 @@ class DatasetMetadataBuiltInFieldApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def get(self):
         built_in_fields = MetadataService.get_built_in_fields()
         return {"fields": built_in_fields}, 200
@@ -106,7 +101,6 @@ class DatasetMetadataBuiltInFieldActionApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def post(self, dataset_id, action: Literal["enable", "disable"]):
         dataset_id_str = str(dataset_id)
         dataset = DatasetService.get_dataset(dataset_id_str)
@@ -126,7 +120,6 @@ class DocumentMetadataEditApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def post(self, dataset_id):
         dataset_id_str = str(dataset_id)
         dataset = DatasetService.get_dataset(dataset_id_str)
diff --git a/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py b/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py
index 3af590afc..9bb4d4d93 100644
--- a/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py
+++ b/api/controllers/console/datasets/rag_pipeline/rag_pipeline.py
@@ -7,7 +7,6 @@ from sqlalchemy.orm import Session
 from controllers.console import console_ns
 from controllers.console.wraps import (
     account_initialization_required,
-    enterprise_license_required,
     knowledge_pipeline_publish_enabled,
     setup_required,
 )
@@ -37,7 +36,6 @@ class PipelineTemplateListApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def get(self):
         type = request.args.get("type", default="built-in", type=str)
         language = request.args.get("language", default="en-US", type=str)
@@ -51,7 +49,6 @@ class PipelineTemplateDetailApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def get(self, template_id: str):
         type = request.args.get("type", default="built-in", type=str)
         rag_pipeline_service = RagPipelineService()
@@ -64,7 +61,6 @@ class CustomizedPipelineTemplateApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def patch(self, template_id: str):
         parser = reqparse.RequestParser()
         parser.add_argument(
@@ -95,7 +91,6 @@ class CustomizedPipelineTemplateApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def delete(self, template_id: str):
         RagPipelineService.delete_customized_pipeline_template(template_id)
         return 200
@@ -103,7 +98,6 @@ class CustomizedPipelineTemplateApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def post(self, template_id: str):
         with Session(db.engine) as session:
             template = (
@@ -120,7 +114,6 @@ class PublishCustomizedPipelineTemplateApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     @knowledge_pipeline_publish_enabled
     def post(self, pipeline_id: str):
         parser = reqparse.RequestParser()
diff --git a/api/controllers/console/red_blue_challenges.py b/api/controllers/console/red_blue_challenges.py
new file mode 100644
index 000000000..7be061ee5
--- /dev/null
+++ b/api/controllers/console/red_blue_challenges.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from flask_restx import Resource, reqparse
+
+from controllers.console import console_ns as api
+from controllers.console.wraps import (
+    account_initialization_required,
+    setup_required,
+)
+from extensions.ext_database import db
+from libs.login import current_user, login_required
+from models.red_blue import RedBlueChallenge, TeamPairing
+
+
+@api.route("/red-blue-challenges")
+class RedBlueListCreateApi(Resource):
+    @api.doc("list_red_blue_challenges")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self):
+        tenant_id = current_user.current_tenant_id
+        if not tenant_id:
+            return {"result": "success", "data": []}
+        rows = (
+            db.session.query(RedBlueChallenge)
+            .filter(RedBlueChallenge.tenant_id == tenant_id)
+            .order_by(RedBlueChallenge.created_at.desc())
+            .all()
+        )
+        return {
+            "result": "success",
+            "data": [
+                {
+                    "id": r.id,
+                    "name": r.name,
+                    "description": r.description,
+                    "is_active": r.is_active,
+                }
+                for r in rows
+            ],
+        }
+
+    @api.doc("create_red_blue_challenge")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def post(self):
+        parser = reqparse.RequestParser()
+        parser.add_argument("tenant_id", type=str, required=True, location="json")
+        parser.add_argument("app_id", type=str, required=True, location="json")
+        parser.add_argument("name", type=str, required=True, location="json")
+        parser.add_argument("description", type=str, required=False, location="json")
+        parser.add_argument("judge_suite", type=dict, required=True, location="json")
+        args = parser.parse_args()
+
+        c = RedBlueChallenge()
+        c.tenant_id = args.get("tenant_id") or current_user.current_tenant_id
+        c.app_id = args["app_id"]
+        c.name = args["name"]
+        c.description = args.get("description")
+        c.judge_suite = args["judge_suite"]
+        db.session.add(c)
+        db.session.commit()
+        return {"result": "success", "data": {"id": c.id}}, 201
+
+
+@api.route("/red-blue-challenges/<uuid:challenge_id>")
+class RedBlueDetailApi(Resource):
+    @api.doc("get_red_blue_challenge")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, challenge_id):
+        c = db.session.get(RedBlueChallenge, str(challenge_id))
+        if not c:
+            return {"result": "not_found"}, 404
+        return {
+            "result": "success",
+            "data": {"id": c.id, "name": c.name, "description": c.description, "is_active": c.is_active},
+        }
+
+    @api.doc("update_red_blue_challenge")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def patch(self, challenge_id):
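+        # Illustrative PATCH body (all fields optional; values assumed):
+        #   {"name": "Prompt Wars", "is_active": false}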
+        c = db.session.get(RedBlueChallenge, str(challenge_id))
+        if not c:
+            return {"result": "not_found"}, 404
+        parser = reqparse.RequestParser()
+        parser.add_argument("name", type=str, required=False, location="json")
+        parser.add_argument("description", type=str, required=False, location="json")
+        parser.add_argument("is_active", type=bool, required=False, location="json")
+        args = parser.parse_args()
+        if args.get("name"):
+            c.name = args["name"]
+        if args.get("description") is not None:
+            c.description = args["description"]
+        if args.get("is_active") is not None:
+            c.is_active = bool(args["is_active"])
+        db.session.commit()
+        return {"result": "success"}
+
+    @api.doc("delete_red_blue_challenge")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def delete(self, challenge_id):
+        c = db.session.get(RedBlueChallenge, str(challenge_id))
+        if not c:
+            return {"result": "not_found"}, 404
+        db.session.delete(c)
+        db.session.commit()
+        return {"result": "success"}, 204
+
+
+@api.route("/red-blue-challenges/<uuid:challenge_id>/pairings")
+class RedBluePairingsApi(Resource):
+    @api.doc("list_red_blue_pairings")
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self, challenge_id):
+        rows = (
+            db.session.query(TeamPairing)
+            .filter(TeamPairing.red_blue_challenge_id == str(challenge_id))
+            .order_by(TeamPairing.created_at.desc())
+            .limit(100)
+            .all()
+        )
+        return {
+            "result": "success",
+            "data": [
+                {
+                    "id": r.id,
+                    "red_points": r.red_points,
+                    "blue_points": r.blue_points,
+                    "judge_rating": r.judge_rating,
+                    "created_at": r.created_at.isoformat() if hasattr(r.created_at, "isoformat") else None,
+                }
+                for r in rows
+            ],
+        }
+
diff --git a/api/controllers/console/workspace/account.py b/api/controllers/console/workspace/account.py
index 7a41a8a5c..1f701010f 100644
--- a/api/controllers/console/workspace/account.py
+++ b/api/controllers/console/workspace/account.py
@@ -29,7 +29,6 @@ from controllers.console.wraps import (
     account_initialization_required,
     cloud_edition_billing_enabled,
     enable_change_email,
-    enterprise_license_required,
     only_edition_cloud,
     setup_required,
 )
@@ -102,7 +101,6 @@ class AccountProfileApi(Resource):
     @login_required
     @account_initialization_required
     @marshal_with(account_fields)
-    @enterprise_license_required
     def get(self):
         if not isinstance(current_user, Account):
             raise ValueError("Invalid user account")
diff --git a/api/controllers/console/workspace/tool_providers.py b/api/controllers/console/workspace/tool_providers.py
index 8693d99e2..0a72b3ead 100644
--- a/api/controllers/console/workspace/tool_providers.py
+++ b/api/controllers/console/workspace/tool_providers.py
@@ -13,7 +13,6 @@ from configs import dify_config
 from controllers.console import api
 from controllers.console.wraps import (
     account_initialization_required,
-    enterprise_license_required,
     setup_required,
 )
 from core.mcp.auth.auth_flow import auth, handle_callback
@@ -667,7 +666,6 @@ class ToolLabelsApi(Resource):
     @setup_required
     @login_required
     @account_initialization_required
-    @enterprise_license_required
     def get(self):
         return jsonable_encoder(ToolLabelsService.list_tool_labels())
 
diff --git a/api/controllers/web/__init__.py b/api/controllers/web/__init__.py
index 1d2295430..9e257fc70 100644
--- a/api/controllers/web/__init__.py
+++ b/api/controllers/web/__init__.py
@@ -24,12 +24,15 @@ from . import (
     files,
     forgot_password,
     login,
+    register,
     message,
     passport,
     remote_files,
     saved_message,
     site,
     workflow,
+    challenges,
+    red_blue_challenges,
 )
 
 api.add_namespace(web_ns)
@@ -45,6 +48,7 @@ __all__ = [
     "files",
     "forgot_password",
     "login",
+    "register",
     "message",
     "passport",
     "remote_files",
@@ -52,4 +56,6 @@ __all__ = [
     "site",
     "web_ns",
     "workflow",
+    "challenges",
+    "red_blue_challenges",
 ]
diff --git a/api/controllers/web/challenges.py b/api/controllers/web/challenges.py
new file mode 100644
index 000000000..7ca9ee2a6
--- /dev/null
+++ b/api/controllers/web/challenges.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+from flask_restx import Resource
+from sqlalchemy import select
+
+from controllers.web import web_ns
+from extensions.ext_database import db
+from models.challenge import Challenge, ChallengeAttempt
+from models.model import App, Site
+
+
+@web_ns.route("/challenges")
+class ChallengeListApi(Resource):
+    def get(self):
+        q = db.session.query(Challenge).filter(Challenge.is_active.is_(True)).order_by(Challenge.created_at.desc())
+        items = []
+        for c in q.all():
+            app = db.session.get(App, c.app_id) if c.app_id else None
+            site_code = None
+            if c.app_id:
+                site = db.session.execute(
+                    select(Site).where(Site.app_id == c.app_id, Site.status == "normal")
+                ).scalar_one_or_none()
+                site_code = site.code if site else None
+            items.append({
+                "id": c.id,
+                "name": c.name,
+                "description": c.description,
+                "goal": c.goal,
+                "app_id": c.app_id,
+                "workflow_id": c.workflow_id,
+                "app_mode": app.mode if app else None,
+                "app_site_code": site_code,
+            })
+        return {"result": "success", "data": items}
+
+
+@web_ns.route("/challenges/<uuid:challenge_id>")
+class ChallengeDetailApi(Resource):
+    def get(self, challenge_id):
+        c = db.session.get(Challenge, str(challenge_id))
+        if not c:
+            return {"result": "not_found"}, 404
+
+        app = db.session.get(App, c.app_id) if c.app_id else None
+        site_code = None
+        if c.app_id:
+            site = db.session.execute(
+                select(Site).where(Site.app_id == c.app_id, Site.status == "normal")
+            ).scalar_one_or_none()
+            site_code = site.code if site else None
+        data = {
+            "id": c.id,
+            "name": c.name,
+            "description": c.description,
+            "goal": c.goal,
+            "is_active": c.is_active,
+            "app_id": c.app_id,
+            "workflow_id": c.workflow_id,
+            "app_mode": app.mode if app else None,
+            "app_site_code": site_code,
+        }
+        return {"result": "success", "data": data}
+
+
+@web_ns.route("/challenges/<uuid:challenge_id>/leaderboard")
+class ChallengeLeaderboardApi(Resource):
+    def get(self, challenge_id):
+        limit = 20
+
+        # Get the challenge to determine scoring strategy
+        challenge = db.session.get(Challenge, str(challenge_id))
+        if not challenge:
+            return {"result": "not_found"}, 404
+
+        scoring_strategy = challenge.scoring_strategy or 'highest_rating'
+
+        # Build query based on scoring strategy
+        q = db.session.query(ChallengeAttempt).filter(
+            ChallengeAttempt.challenge_id == str(challenge_id),
+            ChallengeAttempt.succeeded.is_(True)
+        )
+
+        # Apply sorting based on strategy
+        if scoring_strategy == 'first':
+            # Earliest successful attempt wins
+            q = q.order_by(ChallengeAttempt.created_at.asc())
+        elif scoring_strategy == 'fastest':
+            # Lowest elapsed_ms wins
+            q = q.order_by(ChallengeAttempt.elapsed_ms.asc().nullslast(), ChallengeAttempt.created_at.asc())
+        elif scoring_strategy == 'fewest_tokens':
+            # Lowest tokens_total wins
+            q = q.order_by(ChallengeAttempt.tokens_total.asc().nullslast(), ChallengeAttempt.created_at.asc())
+        elif scoring_strategy == 'highest_rating':
+            # Highest judge_rating wins, ties broken by earliest
+            q = q.order_by(ChallengeAttempt.judge_rating.desc().nullslast(), ChallengeAttempt.created_at.asc())
+        elif scoring_strategy == 'custom':
+            # Custom score field (computed by plugin)
+            q = q.order_by(ChallengeAttempt.score.desc().nullslast(), ChallengeAttempt.created_at.asc())
+        else:
+            # Default to highest_rating
+            q = q.order_by(ChallengeAttempt.judge_rating.desc().nullslast(), ChallengeAttempt.created_at.asc())
+
+        rows = q.limit(limit).all()
+        data = [
+            {
+                "attempt_id": r.id,
+                "account_id": r.account_id,
+                "end_user_id": r.end_user_id,
+                "score": r.score,
+                "judge_rating": r.judge_rating,
+                "tokens_total": r.tokens_total,
+                "elapsed_ms": r.elapsed_ms,
+                "created_at": r.created_at.isoformat() if hasattr(r.created_at, "isoformat") else None,
+            }
+            for r in rows
+        ]
+        return {"result": "success", "data": data}
+
+
diff --git a/api/controllers/web/red_blue_challenges.py b/api/controllers/web/red_blue_challenges.py
new file mode 100644
index 000000000..54be5cdba
--- /dev/null
+++ b/api/controllers/web/red_blue_challenges.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+from flask import request
+from flask_restx import Resource
+
+from controllers.web import web_ns
+from extensions.ext_database import db
+from models.red_blue import RedBlueChallenge, TeamPairing
+from services.red_blue_service import RedBlueService
+
+
+@web_ns.route("/red-blue-challenges")
+class RedBlueListApi(Resource):
+    def get(self):
+        q = db.session.query(RedBlueChallenge).filter(RedBlueChallenge.is_active.is_(True))
+        items = [
+            {
+                "id": c.id,
+                "name": c.name,
+                "description": c.description,
+            }
+            for c in q.all()
+        ]
+        return {"result": "success", "data": items}
+
+
+@web_ns.route("/red-blue-challenges/<uuid:challenge_id>")
+class RedBlueDetailApi(Resource):
+    def get(self, challenge_id):
+        c = db.session.get(RedBlueChallenge, str(challenge_id))
+        if not c:
+            return {"result": "not_found"}, 404
+        data = {
+            "id": c.id,
+            "name": c.name,
+            "description": c.description,
+        }
+        return {"result": "success", "data": data}
+
+
+@web_ns.route("/red-blue-challenges/<uuid:challenge_id>/submit")
+class RedBlueSubmitApi(Resource):
+    def post(self, challenge_id):
+        payload = request.get_json(force=True) or {}
+        team = payload.get("team")
+        prompt = payload.get("prompt")
+        if team not in ("red", "blue") or not prompt:
+            return {"result": "bad_request"}, 400
+        c = db.session.get(RedBlueChallenge, str(challenge_id))
+        if not c:
+            return {"result": "not_found"}, 404
+        sub = RedBlueService.submit_prompt(
+            challenge_id=str(challenge_id),
+            tenant_id=c.tenant_id,
+            team=team,
+            prompt=prompt,
+            account_id=None,
+            end_user_id=None,
+        )
+        return {"result": "success", "data": {"id": sub.id}}, 201
+
+
+@web_ns.route("/red-blue-challenges/<uuid:challenge_id>/leaderboard")
+class RedBlueLeaderboardApi(Resource):
+    def get(self, challenge_id):
+        # aggregate simple totals
+        red = (
+            db.session.query(db.func.coalesce(db.func.sum(TeamPairing.red_points), 0.0))
+            .filter(TeamPairing.red_blue_challenge_id == str(challenge_id))
+            .scalar()
+        )
+        blue = (
+            db.session.query(db.func.coalesce(db.func.sum(TeamPairing.blue_points), 0.0))
+            .filter(TeamPairing.red_blue_challenge_id == str(challenge_id))
+            .scalar()
+        )
+        total = (red or 0.0) + (blue or 0.0)
+        data = {
+            "red_points": float(red or 0.0),
+            "blue_points": float(blue or 0.0),
+            "red_ratio": (float(red or 0.0) / total) if total else 0.0,
+            "blue_ratio": (float(blue or 0.0) / total) if total else 0.0,
+        }
+        return {"result": "success", "data": data}
+
diff --git a/api/controllers/web/register.py b/api/controllers/web/register.py
new file mode 100644
index 000000000..d678498d5
--- /dev/null
+++ b/api/controllers/web/register.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from flask import request
+from flask_restx import Resource
+
+from controllers.web import web_ns
+from extensions.ext_database import db
+from services.account_service import RegisterService
+
+
+@web_ns.route('/register')
+class WebRegisterApi(Resource):
+    def post(self):
+        payload = request.get_json(force=True) or {}
+        email = payload.get('email')
+        name = payload.get('name') or 'Player'
+        password = payload.get('password')
+        if not email or not password:
+            return {'result': 'bad_request'}, 400
+        account = RegisterService.register(
+            email=email,
+            name=name,
+            password=password,
+            is_setup=False,
+            create_workspace_required=False,
+        )
+        db.session.commit()
+        return {'result': 'success', 'data': {'account_id': account.id}}, 201
+
+
diff --git a/api/core/workflow/enums.py b/api/core/workflow/enums.py
index 00a125660..a27cc3c1b 100644
--- a/api/core/workflow/enums.py
+++ b/api/core/workflow/enums.py
@@ -58,6 +58,9 @@ class NodeType(StrEnum):
     DOCUMENT_EXTRACTOR = "document-extractor"
     LIST_OPERATOR = "list-operator"
    AGENT = "agent"
+    CHALLENGE_EVALUATOR = "challenge-evaluator"
+    JUDGING_LLM = "judging-llm"
+    TEAM_CHALLENGE = "team-challenge"
 
 
 class NodeExecutionType(StrEnum):
diff --git a/api/core/workflow/nodes/challenge_evaluator/__init__.py b/api/core/workflow/nodes/challenge_evaluator/__init__.py
new file mode 100644
index 000000000..5e12446aa
--- /dev/null
+++ b/api/core/workflow/nodes/challenge_evaluator/__init__.py
@@ -0,0 +1,3 @@
+from .node import ChallengeEvaluatorNode
+
+__all__ = ['ChallengeEvaluatorNode']
diff --git a/api/core/workflow/nodes/challenge_evaluator/node.py b/api/core/workflow/nodes/challenge_evaluator/node.py
new file mode 100644
index 000000000..e06d856ee
--- /dev/null
+++ b/api/core/workflow/nodes/challenge_evaluator/node.py
@@ -0,0 +1,258 @@
+# pyright: reportImplicitRelativeImport=none
+
+from __future__ import annotations
+
+import logging
+import time
+from collections.abc import Mapping
+from typing import Any
+
+from core.variables.segments import Segment
+from core.workflow.enums import ErrorStrategy, NodeExecutionType, NodeType, WorkflowNodeExecutionStatus
+from core.workflow.node_events import NodeRunResult
+from core.workflow.nodes.base.entities import BaseNodeData, RetryConfig
+from core.workflow.nodes.base.node import Node
+from extensions.ext_database import db
+from models.challenge import Challenge
+from services.challenge_scorer_service import ChallengeScorerService
+from services.challenge_service import ChallengeService
+
+logger = logging.getLogger(__name__)
+
+
+class ChallengeEvaluatorNode(Node):
+    node_type = NodeType.CHALLENGE_EVALUATOR
+    execution_type = NodeExecutionType.EXECUTABLE
+
+    _node_data: BaseNodeData
+
+    def init_node_data(self, data: Mapping[str, Any]):
+        # Using BaseNodeData to carry title/desc; node data is accessed directly
+        self._node_data = BaseNodeData.model_validate(data)
+        self._config: dict[str, Any] = dict(data)
+
+    def _get_error_strategy(self) -> ErrorStrategy | None:
+        return getattr(self._node_data, 'error_strategy', None)
+
+    def _get_retry_config(self) -> RetryConfig:
+        return getattr(self._node_data, 'retry_config', RetryConfig())
+
+    def _get_title(self) -> str:
+        return getattr(self._node_data, 'title', 'Challenge Evaluator')
+
+    def _get_description(self) -> str | None:
+        return getattr(self._node_data, 'desc', None)
+
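+    # Illustrative node config consumed by _run below (shape assumed from the
+    # frontend schema it references):
+    #   {"title": "Challenge Evaluator", "evaluation_mode": "llm-judge",
+    #    "inputs": {"response": ["<judging_llm_node_id>", "text"]},
+    #    "challenge_id": "<challenge uuid>"}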
+    def _get_default_value_dict(self) -> dict[str, Any]:
+        return getattr(self._node_data, 'default_value_dict', {})
+
+    def get_base_node_data(self) -> BaseNodeData:
+        return self._node_data
+
+    @classmethod
+    def version(cls) -> str:
+        return "1"
+
+    def _run(self) -> NodeRunResult:
+        # Resolve response text from selector in config.inputs.response (frontend schema)
+        output_text = ''
+        source_selector = None
+        inputs_cfg = self._config.get('inputs') or {}
+        if isinstance(inputs_cfg, dict):
+            source_selector = inputs_cfg.get('response')
+        # fallback to older key if any
+        source_selector = source_selector or self._config.get('value_selector')
+
+        # Check evaluation mode from config
+        evaluation_mode = self._config.get('evaluation_mode', 'rules')
+
+        logger.info("ChallengeEvaluator - evaluation_mode: %s, source_selector: %s", evaluation_mode, source_selector)
+
+        # Initialize judge variables
+        is_judge_input = False
+        judge_passed = False
+        judge_rating = 0
+        judge_feedback_from_input = ''
+
+        def _segment_to_value(segment: Segment | None) -> Any:
+            if segment is None:
+                return None
+            if hasattr(segment, "to_object"):
+                try:
+                    return segment.to_object()
+                except Exception:  # pragma: no cover - defensive
+                    pass
+            return getattr(segment, "value", segment)
+
+        # If evaluation_mode is 'llm-judge', try to read from upstream Judging LLM node
+        if evaluation_mode == 'llm-judge' and source_selector and len(source_selector) >= 1:
+            try:
+                node_id = source_selector[0]
+                # Retrieve judge outputs as Segments and convert to primitive values
+                passed_segment = self.graph_runtime_state.variable_pool.get([node_id, 'judge_passed'])
+                rating_segment = self.graph_runtime_state.variable_pool.get([node_id, 'judge_rating'])
+                feedback_segment = self.graph_runtime_state.variable_pool.get([node_id, 'judge_feedback'])
+
+                potential_judge_passed = _segment_to_value(passed_segment)
+                potential_judge_rating = _segment_to_value(rating_segment)
+                potential_judge_feedback = _segment_to_value(feedback_segment)
+
+                logger.info(
+                    "ChallengeEvaluator - Reading judge outputs: passed=%s, rating=%s, feedback=%s",
+                    potential_judge_passed,
+                    potential_judge_rating,
+                    potential_judge_feedback,
+                )
+
+                # If judge_passed exists, we successfully read from a Judging LLM node
+                if potential_judge_passed is not None:
+                    is_judge_input = True
+                    judge_passed = bool(potential_judge_passed)
+                    judge_rating = int(potential_judge_rating or 0)
+                    judge_feedback_from_input = str(potential_judge_feedback or '')
+                    logger.info(
+                        "ChallengeEvaluator - Judge input successfully read! passed=%s, rating=%s, feedback=%s",
+                        judge_passed,
+                        judge_rating,
+                        judge_feedback_from_input,
+                    )
+            except Exception as e:
+                logger.error("ChallengeEvaluator - Error reading judge outputs: %s", e, exc_info=True)
+                is_judge_input = False
+
+        # If not using judge input, get text output for rules-based evaluation
+        if not is_judge_input and source_selector:
+            try:
+                segment = self.graph_runtime_state.variable_pool.get(source_selector)
+                if segment is None:
+                    output_text = ''
+                elif hasattr(segment, 'text'):
+                    output_text = segment.text
+                else:
+                    output_text = str(_segment_to_value(segment) or '')
+            except Exception:
+                output_text = ''
+
+        # Evaluate based on mode
+        if is_judge_input:
+            ok = judge_passed
+            details = {
+                'mode': 'llm-judge',
+                'rating': judge_rating,
+                'feedback': judge_feedback_from_input,
+            }
+        else:
+            # Rules-based evaluation (only if not using judge input)
+            ok, details = ChallengeService.evaluate_outcome(output_text, self._config)
+
+        # optional persistence if config carries challenge_id
+        challenge_id = self._config.get('challenge_id')
+        if challenge_id:
+            try:
+                # Calculate elapsed time in milliseconds
+                elapsed_ms = int((time.time() - self.graph_runtime_state.start_at) * 1000)
+
+                # Get total tokens used in the workflow so far
+                tokens_total = self.graph_runtime_state.total_tokens
+
+                # Extract judge_rating from details if available (for highest_rating strategy)
+                judge_rating = None
+                judge_feedback = None
+                if isinstance(details, dict):
+                    judge_rating = details.get('rating')
+                    judge_feedback = details.get('feedback')
+
+                # Load challenge to check scoring strategy
+                challenge = db.session.get(Challenge, str(challenge_id))
+
+                # Score field is reserved for custom scoring plugins.
+                # For built-in strategies (first, fastest, fewest_tokens, highest_rating),
+                # the leaderboard sorts by specific columns (created_at, elapsed_ms, tokens_total, judge_rating).
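+                # e.g. with scoring_strategy="fastest" the attempt only needs
+                # elapsed_ms recorded; score stays None and the web leaderboard
+                # orders by elapsed_ms ascending (see controllers/web/challenges.py).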
+                score = None
+
+                # If custom scoring is configured, compute score using plugin
+                if challenge and challenge.scoring_strategy == 'custom':
+                    try:
+                        metrics = {
+                            'succeeded': ok,
+                            'tokens_total': tokens_total,
+                            'elapsed_ms': elapsed_ms,
+                            'rating': judge_rating,
+                            'created_at': int(time.time() * 1000),
+                        }
+
+                        ctx = {
+                            'tenant_id': self.tenant_id,
+                            'app_id': self.app_id,
+                            'workflow_id': self.workflow_id,
+                            'challenge_id': str(challenge_id),
+                            'end_user_id': None,
+                            'timeout_ms': 5000,
+                        }
+
+                        result = ChallengeScorerService.score_with_plugin(
+                            scorer_plugin_id=challenge.scoring_plugin_id,
+                            scorer_entrypoint=challenge.scoring_entrypoint,
+                            metrics=metrics,
+                            config=challenge.scoring_config or {},
+                            ctx=ctx,
+                        )
+
+                        score = result.get('score')
+                        logger.info(
+                            "Custom scorer computed score: %s (details: %s)",
+                            score,
+                            result.get('details'),
+                        )
+                    except Exception as e:
+                        logger.error("Custom scorer failed: %s", e, exc_info=True)
+                        # Continue with score=None on error
+
+                ChallengeService.record_attempt(
+                    tenant_id=self.tenant_id,
+                    challenge_id=challenge_id,
+                    end_user_id=None,
+                    account_id=None,
+                    workflow_run_id=None,
+                    succeeded=ok,
+                    score=score,
+                    judge_rating=judge_rating,
+                    judge_feedback=judge_feedback,
+                    tokens_total=tokens_total,
+                    elapsed_ms=elapsed_ms,
+                    session=db.session,
+                )
+            except Exception:
+                # do not crash the workflow if recording fails
+                pass
+
+        # Always provide all output variables to match frontend getOutputVars
+        outputs: dict[str, Any] = {
+            'challenge_succeeded': ok,
+            'judge_rating': 0,
+            'judge_feedback': '',
+            'message': '',
+        }
+
+        # Override with actual values if evaluator provides them
+        if isinstance(details, dict):
+            logger.debug("ChallengeEvaluator - details: %s", details)
+            if 'rating' in details:
+                outputs['judge_rating'] = details.get('rating')
+            if 'feedback' in details:
+                outputs['judge_feedback'] = details.get('feedback')
+            if 'message' in details:
+                outputs['message'] = details.get('message')
+            # If no explicit message, create one from evaluation details
+            if not outputs['message']:
+                if ok:
+                    outputs['message'] = f"Success: {details.get('mode', 'evaluation')} matched"
+                else:
+                    outputs['message'] = f"Failed: {details.get('mode', 'evaluation')} did not match"
+
+        return NodeRunResult(
+            status=WorkflowNodeExecutionStatus.SUCCEEDED,
+            outputs=outputs,
+        )
+
diff --git a/api/core/workflow/nodes/judging_llm/node.py b/api/core/workflow/nodes/judging_llm/node.py
new file mode 100644
index 000000000..1b1ff1db1
--- /dev/null
+++ b/api/core/workflow/nodes/judging_llm/node.py
@@ -0,0 +1,188 @@
+from __future__ import annotations
+
+import json
+import logging
+import re
+from collections.abc import Mapping
+from typing import Any
+
+from core.model_manager import ModelManager
+from core.model_runtime.entities.llm_entities import LLMResult
+from core.model_runtime.entities.message_entities import (
+    PromptMessageContentType,
+    SystemPromptMessage,
+    UserPromptMessage,
+)
+from core.model_runtime.entities.model_entities import ModelType
+from core.workflow.enums import ErrorStrategy, NodeExecutionType, NodeType, WorkflowNodeExecutionStatus
+from core.workflow.node_events import NodeRunResult
+from core.workflow.nodes.base.entities import BaseNodeData, RetryConfig
+from core.workflow.nodes.base.node import Node
+from services.challenge_service import ChallengeService
+
+logger = logging.getLogger(__name__)
+
+
+class JudgingLLMNode(Node):
+    node_type = NodeType.JUDGING_LLM
+    execution_type = NodeExecutionType.EXECUTABLE
+
+    _node_data: BaseNodeData
+
+    def init_node_data(self, data: Mapping[str, Any]):
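+        # Keys expected in `data`, as read in _run below: judge_model
+        # {"provider", "name", "completion_params"}, rubric_prompt_template,
+        # inputs {"goal", "response"}, and optional success_type /
+        # success_pattern used as a rules fallback.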
+        self._node_data = BaseNodeData.model_validate(data)
+        # Access data directly from node_data, not from a 'config' key
+        self._config: dict[str, Any] = dict(data)
+
+    def _get_error_strategy(self) -> ErrorStrategy | None:
+        return getattr(self._node_data, 'error_strategy', None)
+
+    def _get_retry_config(self) -> RetryConfig:
+        return getattr(self._node_data, 'retry_config', RetryConfig())
+
+    def _get_title(self) -> str:
+        return getattr(self._node_data, 'title', 'Judging LLM')
+
+    def _get_description(self) -> str | None:
+        return getattr(self._node_data, 'desc', None)
+
+    def _get_default_value_dict(self) -> dict[str, Any]:
+        return getattr(self._node_data, 'default_value_dict', {})
+
+    def get_base_node_data(self) -> BaseNodeData:
+        return self._node_data
+
+    @classmethod
+    def version(cls) -> str:
+        return "1"
+
+    def _run(self) -> NodeRunResult:
+        # Placeholder with FE-compatible keys. Extract inputs for future wiring.
+        inputs_cfg = self._config.get('inputs') or {}
+        goal_selector = None
+        response_selector = None
+        if isinstance(inputs_cfg, dict):
+            goal_selector = inputs_cfg.get('goal')
+            response_selector = inputs_cfg.get('response')
+
+        # Attempt to read variables (not used in placeholder decision)
+        _ = None
+        try:
+            if goal_selector:
+                _ = self.graph_runtime_state.variable_pool.get(goal_selector)
+            if response_selector:
+                _ = self.graph_runtime_state.variable_pool.get(response_selector)
+        except Exception:
+            pass
+
+        outputs = {
+            'judge_passed': False,
+            'judge_rating': 0,
+            'judge_feedback': '',
+        }
+
+        # If model config and rubric provided, invoke LLM synchronously to judge
+        judge_model = self._config.get('judge_model') or {}
+        rubric = self._config.get('rubric_prompt_template') or ''
+        provider = (judge_model or {}).get('provider')
+        model_name = (judge_model or {}).get('name')
+        completion_params = (judge_model or {}).get('completion_params') or {}
+
+        def _segment_to_text(seg: Any) -> str:
+            try:
+                # Many variable types expose .text
+                if hasattr(seg, 'text'):
+                    return str(seg.text)
+                if isinstance(seg, (dict, list)):
+                    return json.dumps(seg, ensure_ascii=False)
+                return str(seg)
+            except Exception:
+                return ''
+
+        logger.info(
+            "JudgingLLM check - provider: %s, model: %s, rubric_len: %s, response_selector: %s",
+            provider,
+            model_name,
+            len(rubric) if rubric else 0,
+            response_selector,
+        )
+
+        if provider and model_name and rubric and response_selector:
+            logger.info("JudgingLLM: All conditions met, invoking LLM...")
+            try:
+                goal_val = self.graph_runtime_state.variable_pool.get(goal_selector) if goal_selector else None
+                response_val = self.graph_runtime_state.variable_pool.get(response_selector)
+                goal_text = _segment_to_text(goal_val)
+                response_text = _segment_to_text(response_val)
+                json_template = '{"passed": boolean, "rating": number (0-10), "feedback": string}'
+
+                prompt_body = (
+                    f"Goal:\n{goal_text}\n\n"
+                    f"Response:\n{response_text}\n\n"
+                    f"Return JSON with rating 0-10: {json_template}"
+                )
+
+                prompt_messages = [
+                    SystemPromptMessage(content=rubric),
+                    UserPromptMessage(content=prompt_body),
+                ]
+
+                model_instance = ModelManager().get_model_instance(
+                    tenant_id=self.tenant_id,
+                    model_type=ModelType.LLM,
+                    provider=provider,
+                    model=model_name,
+                )
+                result: LLMResult = model_instance.invoke_llm(
+                    prompt_messages=prompt_messages,
+                    model_parameters=completion_params,
+                    stop=[],
+                    stream=False,
+                    user=self.user_id,
+                )  # type: ignore
+                # Extract text from result
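+                # A well-formed judge reply (per json_template, parsed below)
+                # would look like:
+                #   {"passed": true, "rating": 8, "feedback": "goal met"}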
+                text_out = ''
+                content = getattr(result.message, 'content', '')
+                if isinstance(content, str):
+                    text_out = content
+                elif isinstance(content, list):
+                    for item in content:
+                        if getattr(item, 'type', None) == PromptMessageContentType.TEXT:
+                            text_out += str(getattr(item, 'data', ''))
+                else:
+                    text_out = str(content)
+
+                # Parse last JSON object in output
+                verdict: dict[str, Any] | None = None
+                try:
+                    matches = re.findall(r"\{[\s\S]*\}", text_out)
+                    if matches:
+                        verdict = json.loads(matches[-1])
+                except Exception:
+                    verdict = None
+
+                if isinstance(verdict, dict):
+                    outputs['judge_passed'] = bool(verdict.get('passed'))
+                    outputs['judge_rating'] = int(verdict.get('rating') or 0)
+                    outputs['judge_feedback'] = str(verdict.get('feedback') or '')
+                    outputs['judge_raw'] = json.dumps(verdict)
+                else:
+                    # Fallback to simple rules if configured
+                    success_type = self._config.get('success_type')
+                    success_pattern = self._config.get('success_pattern')
+                    if success_type and success_pattern:
+                        ok, _ = ChallengeService.evaluate_outcome(response_text, {
+                            'success_type': success_type,
+                            'success_pattern': success_pattern,
+                        })
+                        outputs['judge_passed'] = ok
+                        outputs['judge_rating'] = 10 if ok else 0
+                        outputs['judge_feedback'] = 'passed by rules' if ok else 'failed by rules'
+            except Exception as e:
+                # keep default outputs on error
+                logger.error("JudgingLLM error: %s", e, exc_info=True)
+        else:
+            logger.warning("JudgingLLM skipped - missing required fields")
+        return NodeRunResult(status=WorkflowNodeExecutionStatus.SUCCEEDED, outputs=outputs)
+
diff --git a/api/core/workflow/nodes/node_mapping.py b/api/core/workflow/nodes/node_mapping.py
index 3d3a1bec9..7fb2820cd 100644
--- a/api/core/workflow/nodes/node_mapping.py
+++ b/api/core/workflow/nodes/node_mapping.py
@@ -24,6 +24,9 @@ from core.workflow.nodes.tool import ToolNode
 from core.workflow.nodes.variable_aggregator import VariableAggregatorNode
 from core.workflow.nodes.variable_assigner.v1 import VariableAssignerNode as VariableAssignerNodeV1
 from core.workflow.nodes.variable_assigner.v2 import VariableAssignerNode as VariableAssignerNodeV2
+from core.workflow.nodes.challenge_evaluator.node import ChallengeEvaluatorNode
+from core.workflow.nodes.judging_llm.node import JudgingLLMNode
+from core.workflow.nodes.team_challenge.node import TeamChallengeNode
 
 LATEST_VERSION = "latest"
 
@@ -142,4 +145,16 @@ NODE_TYPE_CLASSES_MAPPING: Mapping[NodeType, Mapping[str, type[Node]]] = {
         LATEST_VERSION: KnowledgeIndexNode,
         "1": KnowledgeIndexNode,
     },
+    NodeType.CHALLENGE_EVALUATOR: {
+        LATEST_VERSION: ChallengeEvaluatorNode,
+        "1": ChallengeEvaluatorNode,
+    },
+    NodeType.JUDGING_LLM: {
+        LATEST_VERSION: JudgingLLMNode,
+        "1": JudgingLLMNode,
+    },
+    NodeType.TEAM_CHALLENGE: {
+        LATEST_VERSION: TeamChallengeNode,
+        "1": TeamChallengeNode,
+    },
 }
diff --git a/api/core/workflow/nodes/team_challenge/node.py b/api/core/workflow/nodes/team_challenge/node.py
new file mode 100644
index 000000000..337158ad1
--- /dev/null
+++ b/api/core/workflow/nodes/team_challenge/node.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+
+from core.workflow.enums import ErrorStrategy, NodeExecutionType, NodeType, WorkflowNodeExecutionStatus
+from core.workflow.node_events import NodeRunResult
+from core.workflow.nodes.base.entities import BaseNodeData, RetryConfig
+from core.workflow.nodes.base.node import Node
+
+
+class TeamChallengeNode(Node):
+    node_type = NodeType.TEAM_CHALLENGE
+    execution_type = NodeExecutionType.EXECUTABLE
+
+    _node_data: BaseNodeData
+
+    def init_node_data(self, data: Mapping[str, Any]):
+        self._node_data = BaseNodeData.model_validate(data)
+        self._config: dict[str, Any] = dict(data)
+
+    def _get_error_strategy(self) -> ErrorStrategy | None:
+        return getattr(self._node_data, 'error_strategy', None)
+
+    def _get_retry_config(self) -> RetryConfig:
+        return getattr(self._node_data, 'retry_config', RetryConfig())
+
+    def _get_title(self) -> str:
+        return getattr(self._node_data, 'title', 'Team Challenge')
+
+    def _get_description(self) -> str | None:
+        return getattr(self._node_data, 'desc', None)
+
+    def _get_default_value_dict(self) -> dict[str, Any]:
+        return getattr(self._node_data, 'default_value_dict', {})
+
+    def get_base_node_data(self) -> BaseNodeData:
+        return self._node_data
+
+    @classmethod
+    def version(cls) -> str:
+        return "1"
+
+    def _run(self) -> NodeRunResult:
+        # Read inputs.team_choice for consistency with FE
+        inputs_cfg = self._config.get('inputs') or {}
+        team_choice = ''
+        if isinstance(inputs_cfg, dict):
+            team_choice_selector = inputs_cfg.get('team_choice')
+            if team_choice_selector:
+                try:
+                    # Read via the variable pool, mirroring the other nodes
+                    segment = self.graph_runtime_state.variable_pool.get(team_choice_selector)
+                    team_choice = str(getattr(segment, 'text', None) or getattr(segment, 'value', '') or '')
+                except Exception:
+                    team_choice = ''
+
+        outputs = {
+            'team': team_choice,
+            'judge_passed': False,
+            'judge_rating': 0,
+            'judge_feedback': '',
+            'categories': {},
+            'team_points': 0.0,
+            'total_points': 0.0,
+        }
+        return NodeRunResult(status=WorkflowNodeExecutionStatus.SUCCEEDED, outputs=outputs)
+
+
diff --git a/api/migrations/versions/2025_09_30_0822-183e2d30fb4e_add_challenge_red_blue_tables.py b/api/migrations/versions/2025_09_30_0822-183e2d30fb4e_add_challenge_red_blue_tables.py
new file mode 100644
index 000000000..a3fbc8f17
--- /dev/null
+++ b/api/migrations/versions/2025_09_30_0822-183e2d30fb4e_add_challenge_red_blue_tables.py
@@ -0,0 +1,170 @@
+"""add challenge & red/blue tables
+
+Revision ID: 183e2d30fb4e
+Revises: 68519ad5cd18
+Create Date: 2025-09-30 08:22:31.223257
+
+"""
+from alembic import op
+import models as models
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '183e2d30fb4e'
+down_revision = '68519ad5cd18'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('challenge_attempts',
+    sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
+    sa.Column('challenge_id', models.types.StringUUID(), nullable=False),
+    sa.Column('end_user_id', models.types.StringUUID(), nullable=True),
+    sa.Column('account_id', models.types.StringUUID(), nullable=True),
+    sa.Column('workflow_run_id', models.types.StringUUID(), nullable=True),
+    sa.Column('succeeded', sa.Boolean(), server_default=sa.text('false'), nullable=False),
+    sa.Column('score', sa.Float(), nullable=True),
+    sa.Column('judge_rating', sa.Integer(), nullable=True),
+    sa.Column('judge_feedback', sa.Text(), nullable=True),
+    sa.Column('judge_output_raw', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+    sa.Column('tokens_total', sa.Integer(), nullable=True),
+    sa.Column('elapsed_ms', sa.Integer(), nullable=True),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
+    sa.PrimaryKeyConstraint('id', name='challenge_attempts_pkey')
+    )
+    with op.batch_alter_table('challenge_attempts', schema=None) as batch_op:
+        batch_op.create_index('challenge_attempts_challenge_id_idx', ['challenge_id'], unique=False)
+        batch_op.create_index('challenge_attempts_tenant_id_idx', ['tenant_id'], unique=False)
+
+    op.create_table('challenges',
+    sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
+    sa.Column('app_id', models.types.StringUUID(), nullable=False),
+    sa.Column('workflow_id', models.types.StringUUID(), nullable=True),
+    sa.Column('name', sa.Text(), nullable=False),
+    sa.Column('description', sa.Text(), nullable=True),
+    sa.Column('goal', sa.Text(), nullable=True),
+    sa.Column('success_type', sa.String(length=64), server_default=sa.text("'regex'"), nullable=False),
+    sa.Column('success_pattern', sa.Text(), nullable=True),
+    sa.Column('secret_ref', sa.Text(), nullable=True),
+    sa.Column('evaluator_type', sa.String(length=32), server_default=sa.text("'rules'"), nullable=False),
+    sa.Column('evaluator_plugin_id', sa.Text(), nullable=True),
+    sa.Column('evaluator_entrypoint', sa.Text(), nullable=True),
+    sa.Column('evaluator_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+    sa.Column('scoring_strategy', sa.String(length=64), server_default=sa.text("'first'"), nullable=False),
+    sa.Column('scoring_plugin_id', sa.Text(), nullable=True),
+    sa.Column('scoring_entrypoint', sa.Text(), nullable=True),
+    sa.Column('scoring_config', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+    sa.Column('is_active', sa.Boolean(), server_default=sa.text('true'), nullable=False),
+    sa.Column('created_by', models.types.StringUUID(), nullable=True),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
+    sa.Column('updated_by', models.types.StringUUID(), nullable=True),
+    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
+    sa.PrimaryKeyConstraint('id', name='challenges_pkey')
+    )
+    with op.batch_alter_table('challenges', schema=None) as batch_op:
+        batch_op.create_index('challenges_app_id_idx', ['app_id'], unique=False)
+        batch_op.create_index('challenges_tenant_id_idx', ['tenant_id'], unique=False)
+
+    op.create_table('red_blue_challenges',
+    sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
+    sa.Column('app_id', models.types.StringUUID(), nullable=False),
+    sa.Column('workflow_id', models.types.StringUUID(), nullable=True),
+    sa.Column('name', sa.Text(), nullable=False),
+    sa.Column('description', sa.Text(), nullable=True),
+    sa.Column('judge_suite', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
+    sa.Column('defense_selection_policy', sa.String(length=64), server_default=sa.text("'latest_best'"), nullable=False),
+    sa.Column('attack_selection_policy', sa.String(length=64), server_default=sa.text("'latest_best'"), nullable=False),
+    sa.Column('scoring_strategy', sa.String(length=64), server_default=sa.text("'red_blue_ratio'"), nullable=False),
+    sa.Column('theme', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+    sa.Column('instructions_md', sa.Text(), nullable=True),
+    sa.Column('is_active', sa.Boolean(), server_default=sa.text('true'), nullable=False),
+    sa.Column('created_by', models.types.StringUUID(), nullable=True),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
+    sa.Column('updated_by', models.types.StringUUID(), nullable=True),
+    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
+    sa.PrimaryKeyConstraint('id', name='red_blue_challenges_pkey')
+    )
+    with op.batch_alter_table('red_blue_challenges', schema=None) as batch_op:
+        batch_op.create_index('red_blue_challenges_app_id_idx', ['app_id'], unique=False)
+        batch_op.create_index('red_blue_challenges_tenant_id_idx', ['tenant_id'], unique=False)
+
+    op.create_table('team_pairings',
+    sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('red_blue_challenge_id', models.types.StringUUID(), nullable=False),
+    sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
+    sa.Column('attack_submission_id', models.types.StringUUID(), nullable=True),
+    sa.Column('defense_submission_id', models.types.StringUUID(), nullable=True),
+    sa.Column('judge_output_raw', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+    sa.Column('categories', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+    sa.Column('judge_rating', sa.Integer(), nullable=True),
+    sa.Column('judge_feedback', sa.Text(), nullable=True),
+    sa.Column('red_points', sa.Float(), server_default=sa.text('0'), nullable=False),
+    sa.Column('blue_points', sa.Float(), server_default=sa.text('0'), nullable=False),
+    sa.Column('tokens_total', sa.Integer(), nullable=True),
+    sa.Column('elapsed_ms', sa.Integer(), nullable=True),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
+    sa.PrimaryKeyConstraint('id', name='team_pairings_pkey')
+    )
+    with op.batch_alter_table('team_pairings', schema=None) as batch_op:
+        batch_op.create_index('team_pairings_challenge_id_idx', ['red_blue_challenge_id'], unique=False)
+        batch_op.create_index('team_pairings_tenant_id_idx', ['tenant_id'], unique=False)
+
+    op.create_table('team_submissions',
+    sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('red_blue_challenge_id', models.types.StringUUID(), nullable=False),
+    sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
+    sa.Column('account_id', models.types.StringUUID(), nullable=True),
+    sa.Column('end_user_id', models.types.StringUUID(), nullable=True),
+    sa.Column('team', sa.String(length=16), nullable=False),
+    sa.Column('prompt', sa.Text(), nullable=False),
+    sa.Column('active', sa.Boolean(), server_default=sa.text('true'), nullable=False),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP'), nullable=False),
+    sa.PrimaryKeyConstraint('id', name='team_submissions_pkey')
+    )
+    with op.batch_alter_table('team_submissions', schema=None) as batch_op:
+        batch_op.create_index('team_submissions_challenge_id_idx', ['red_blue_challenge_id'], unique=False)
+        batch_op.create_index('team_submissions_tenant_id_idx', ['tenant_id'], unique=False)
+
+    with op.batch_alter_table('providers', schema=None) as batch_op:
+        batch_op.drop_column('credential_status')
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('providers', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('credential_status', sa.VARCHAR(length=20), server_default=sa.text("'active'::character varying"), autoincrement=False, nullable=True))
+
+    with op.batch_alter_table('team_submissions', schema=None) as batch_op:
+        batch_op.drop_index('team_submissions_tenant_id_idx')
+        batch_op.drop_index('team_submissions_challenge_id_idx')
+
+    op.drop_table('team_submissions')
+    with op.batch_alter_table('team_pairings', schema=None) as batch_op:
+        batch_op.drop_index('team_pairings_tenant_id_idx')
+        batch_op.drop_index('team_pairings_challenge_id_idx')
+
+    op.drop_table('team_pairings')
+    with op.batch_alter_table('red_blue_challenges', schema=None) as batch_op:
+        batch_op.drop_index('red_blue_challenges_tenant_id_idx')
+        batch_op.drop_index('red_blue_challenges_app_id_idx')
+
+    op.drop_table('red_blue_challenges')
+    with op.batch_alter_table('challenges', schema=None) as batch_op:
+        batch_op.drop_index('challenges_tenant_id_idx')
+        batch_op.drop_index('challenges_app_id_idx')
+
+    op.drop_table('challenges')
+    with op.batch_alter_table('challenge_attempts', schema=None) as batch_op:
+        batch_op.drop_index('challenge_attempts_tenant_id_idx')
+        batch_op.drop_index('challenge_attempts_challenge_id_idx')
+
+    op.drop_table('challenge_attempts')
+    # ### end Alembic commands ###
diff --git a/api/models/__init__.py b/api/models/__init__.py
index 779484283..7f28a3603 100644
--- a/api/models/__init__.py
+++ b/api/models/__init__.py
@@ -91,6 +91,8 @@ from .workflow import (
     WorkflowRun,
     WorkflowType,
 )
+from .challenge import Challenge, ChallengeAttempt
+from .red_blue import RedBlueChallenge, TeamSubmission, TeamPairing
 
 __all__ = [
     "APIBasedExtension",
@@ -181,4 +183,9 @@ __all__ = [
     "WorkflowRunTriggeredFrom",
     "WorkflowToolProvider",
     "WorkflowType",
+    "Challenge",
+    "ChallengeAttempt",
+    "RedBlueChallenge",
+    "TeamSubmission",
+    "TeamPairing",
 ]
diff --git a/api/models/challenge.py b/api/models/challenge.py
new file mode 100644
index 000000000..3bb7fa003
--- /dev/null
+++ b/api/models/challenge.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from datetime import datetime
+
+import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.orm import Mapped, mapped_column
+
+from .base import Base
+from .types import StringUUID
+
+
+class Challenge(Base):
+    __tablename__ = "challenges"
+    __table_args__ = (
+        sa.PrimaryKeyConstraint("id", name="challenges_pkey"),
+        sa.Index("challenges_tenant_id_idx", "tenant_id"),
+        sa.Index("challenges_app_id_idx", "app_id"),
+    )
+
+    id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
+    tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    app_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    workflow_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
+
+    name: Mapped[str] = mapped_column(sa.Text, nullable=False)
+    description: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+    goal: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+
+    success_type: Mapped[str] = mapped_column(sa.String(64), nullable=False, server_default=sa.text("'regex'"))
+    success_pattern: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+    secret_ref: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+
+    evaluator_type: Mapped[str] = mapped_column(sa.String(32), nullable=False, server_default=sa.text("'rules'"))
+    evaluator_plugin_id: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+    evaluator_entrypoint: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+    evaluator_config = mapped_column(JSONB, nullable=True)
+
+    scoring_strategy: Mapped[str] = mapped_column(sa.String(64), nullable=False, server_default=sa.text("'first'"))
+    scoring_plugin_id: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+    scoring_entrypoint: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+    scoring_config = mapped_column(JSONB, nullable=True)
+
+    is_active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true"))
+
+    created_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
+    created_at: Mapped[datetime] = mapped_column(
+        sa.DateTime,
+        nullable=False,
+        server_default=sa.func.current_timestamp(),
+    )
+    updated_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
+    updated_at: Mapped[datetime] = mapped_column(
+        sa.DateTime,
+        nullable=False,
+        server_default=sa.func.current_timestamp(),
+    )
+
+
+class ChallengeAttempt(Base):
+    __tablename__ = "challenge_attempts"
+    __table_args__ = (
+        sa.PrimaryKeyConstraint("id", name="challenge_attempts_pkey"),
+        sa.Index("challenge_attempts_tenant_id_idx", "tenant_id"),
+        sa.Index("challenge_attempts_challenge_id_idx", "challenge_id"),
+    )
+
+    id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()"))
+    tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+    challenge_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
+
+    end_user_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
+    account_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
+    workflow_run_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
+
+    succeeded: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("false"))
+    score: Mapped[float | None] = mapped_column(sa.Float, nullable=True)
+
+    judge_rating: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
+    judge_feedback: Mapped[str | None] = mapped_column(sa.Text, nullable=True)
+    judge_output_raw = mapped_column(JSONB, nullable=True)
+
+    tokens_total: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
+    elapsed_ms: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)
+
+    created_at: Mapped[datetime] = mapped_column(
+        sa.DateTime,
+        nullable=False,
+        server_default=sa.func.current_timestamp(),
+    )
+
diff --git a/api/models/red_blue.py b/api/models/red_blue.py
new file mode 100644
index 000000000..382c2fa9a
--- /dev/null
+++ b/api/models/red_blue.py
@@ -0,0 +1,114 @@
+from __future__ import annotations + +from datetime import datetime + +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import Mapped, mapped_column + +from .base import Base +from .types import StringUUID + + +class RedBlueChallenge(Base): + __tablename__ = "red_blue_challenges" + __table_args__ = ( + sa.PrimaryKeyConstraint("id", name="red_blue_challenges_pkey"), + sa.Index("red_blue_challenges_tenant_id_idx", "tenant_id"), + sa.Index("red_blue_challenges_app_id_idx", "app_id"), + ) + + id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()")) + tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + app_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + workflow_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True) + + name: Mapped[str] = mapped_column(sa.Text, nullable=False) + description: Mapped[str | None] = mapped_column(sa.Text, nullable=True) + + judge_suite = mapped_column(JSONB, nullable=False) + defense_selection_policy: Mapped[str] = mapped_column( + sa.String(64), nullable=False, server_default=sa.text("'latest_best'") + ) + attack_selection_policy: Mapped[str] = mapped_column( + sa.String(64), nullable=False, server_default=sa.text("'latest_best'") + ) + scoring_strategy: Mapped[str] = mapped_column( + sa.String(64), nullable=False, server_default=sa.text("'red_blue_ratio'") + ) + + theme = mapped_column(JSONB, nullable=True) + instructions_md: Mapped[str | None] = mapped_column(sa.Text, nullable=True) + + is_active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true")) + + created_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True) + created_at: Mapped[datetime] = mapped_column( + sa.DateTime, + nullable=False, + server_default=sa.func.current_timestamp(), + ) + updated_by: Mapped[str | None] = mapped_column(StringUUID, nullable=True) + updated_at: Mapped[datetime] = mapped_column( + sa.DateTime, + nullable=False, + server_default=sa.func.current_timestamp(), + ) + + +class TeamSubmission(Base): + __tablename__ = "team_submissions" + __table_args__ = ( + sa.PrimaryKeyConstraint("id", name="team_submissions_pkey"), + sa.Index("team_submissions_challenge_id_idx", "red_blue_challenge_id"), + sa.Index("team_submissions_tenant_id_idx", "tenant_id"), + ) + + id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()")) + red_blue_challenge_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + account_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True) + end_user_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True) + + team: Mapped[str] = mapped_column(sa.String(16), nullable=False) # 'red' | 'blue' + prompt: Mapped[str] = mapped_column(sa.Text, nullable=False) + active: Mapped[bool] = mapped_column(sa.Boolean, nullable=False, server_default=sa.text("true")) + + created_at: Mapped[datetime] = mapped_column( + sa.DateTime, + nullable=False, + server_default=sa.func.current_timestamp(), + ) + + +class TeamPairing(Base): + __tablename__ = "team_pairings" + __table_args__ = ( + sa.PrimaryKeyConstraint("id", name="team_pairings_pkey"), + sa.Index("team_pairings_challenge_id_idx", "red_blue_challenge_id"), + sa.Index("team_pairings_tenant_id_idx", "tenant_id"), + ) + + id: Mapped[str] = mapped_column(StringUUID, server_default=sa.text("uuid_generate_v4()")) + red_blue_challenge_id: Mapped[str] = 
mapped_column(StringUUID, nullable=False) + tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False) + + attack_submission_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True) + defense_submission_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True) + + judge_output_raw = mapped_column(JSONB, nullable=True) + categories = mapped_column(JSONB, nullable=True) + judge_rating: Mapped[int | None] = mapped_column(sa.Integer, nullable=True) + judge_feedback: Mapped[str | None] = mapped_column(sa.Text, nullable=True) + red_points: Mapped[float] = mapped_column(sa.Float, nullable=False, server_default=sa.text("0")) + blue_points: Mapped[float] = mapped_column(sa.Float, nullable=False, server_default=sa.text("0")) + + tokens_total: Mapped[int | None] = mapped_column(sa.Integer, nullable=True) + elapsed_ms: Mapped[int | None] = mapped_column(sa.Integer, nullable=True) + + created_at: Mapped[datetime] = mapped_column( + sa.DateTime, + nullable=False, + server_default=sa.func.current_timestamp(), + ) + diff --git a/api/pyproject.toml b/api/pyproject.toml index 85fa0beaa..5d05864a2 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -223,3 +223,6 @@ vdb = [ "xinference-client~=1.2.2", "mo-vector~=0.1.13", ] +[tool.pyright] +typeCheckingMode = "basic" +reportImplicitRelativeImport = "none" diff --git a/api/pyrightconfig.json b/api/pyrightconfig.json index 00dda8b08..daf3994c8 100644 --- a/api/pyrightconfig.json +++ b/api/pyrightconfig.json @@ -8,7 +8,8 @@ "extensions", "core/app/app_config/easy_ui_based_app/dataset" ], - "typeCheckingMode": "strict", + "typeCheckingMode": "basic", + "reportImplicitRelativeImport": "none", "allowedUntypedLibraries": [ "flask_restx", "flask_login", diff --git a/api/run.sh b/api/run.sh new file mode 100644 index 000000000..415b40b00 --- /dev/null +++ b/api/run.sh @@ -0,0 +1 @@ +uv run --dev flask --app app run --host 0.0.0.0 --port 5001 --debug diff --git a/api/services/challenge_scorer_protocol.py b/api/services/challenge_scorer_protocol.py new file mode 100644 index 000000000..43c1c8b4d --- /dev/null +++ b/api/services/challenge_scorer_protocol.py @@ -0,0 +1,56 @@ +""" +Challenge scorer protocol and type definitions. + +Defines the interface for custom scoring plugins that compute +numeric scores from attempt metrics for leaderboard ranking. +""" + +from __future__ import annotations + +from typing import Any, Protocol, TypedDict + + +class ScoringContext(TypedDict, total=False): + """Context provided to scorer plugins.""" + + tenant_id: str + app_id: str + workflow_id: str + challenge_id: str + end_user_id: str | None + timeout_ms: int + + +class AttemptMetrics(TypedDict, total=False): + """Metrics from a challenge attempt.""" + + succeeded: bool + tokens_total: int | None + elapsed_ms: int | None + rating: int | None + created_at: int | None # epoch ms + + +class ScoringResult(TypedDict, total=False): + """Result returned by scorer plugin.""" + + score: float + details: dict[str, Any] | None + + +class ScorerProtocol(Protocol): + """Protocol that all scorer plugins must implement.""" + + def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult: + """ + Compute a numeric score from attempt metrics. + + Args: + metrics: Attempt metrics (tokens, time, rating, etc.) + config: Plugin-specific configuration (from challenge.scoring_config) + ctx: Context with tenant_id, app_id, etc. + + Returns: + ScoringResult with computed score and optional details + """ + ... 
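# Illustration: a minimal scorer conforming to ScorerProtocol. The class name
# and the 'points' config key are hypothetical; the shipped example is
# services.scorers.weighted:WeightedScorer, which appears later in this change.
from typing import Any

from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult


class FlatBonusScorer:
    def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult:
        # award a flat, configurable number of points on success, else zero
        points = float(config.get("points", 1.0))
        return {"score": points if metrics.get("succeeded") else 0.0}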
diff --git a/api/services/challenge_scorer_service.py b/api/services/challenge_scorer_service.py new file mode 100644 index 000000000..73b7fcdea --- /dev/null +++ b/api/services/challenge_scorer_service.py @@ -0,0 +1,112 @@ +""" +Challenge scorer service. + +Loads and invokes custom scorer plugins to compute scores from attempt metrics. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult + +logger = logging.getLogger(__name__) + + +class ChallengeScorerService: + """Service for loading and invoking custom scorer plugins.""" + + _plugin_cache: dict[str, Any] = {} + + @classmethod + def score_with_plugin( + cls, + *, + scorer_plugin_id: str | None, + scorer_entrypoint: str | None, + metrics: AttemptMetrics, + config: dict[str, Any] | None, + ctx: ScoringContext, + ) -> ScoringResult: + """ + Compute score using a custom scorer plugin. + + Args: + scorer_plugin_id: Plugin identifier (e.g., 'builtin.weighted_scorer') + scorer_entrypoint: Entrypoint path (e.g., 'services.scorers.weighted:WeightedScorer') + metrics: Attempt metrics to score + config: Plugin-specific configuration + ctx: Scoring context + + Returns: + ScoringResult with computed score + + Raises: + ValueError: If plugin cannot be loaded or scoring fails + """ + if not scorer_plugin_id or not scorer_entrypoint: + raise ValueError("scorer_plugin_id and scorer_entrypoint are required for custom scoring") + + # Load plugin + scorer = cls._load_plugin(scorer_plugin_id, scorer_entrypoint) + if not scorer: + raise ValueError(f"Failed to load scorer plugin: {scorer_plugin_id}:{scorer_entrypoint}") + + # Invoke scorer with timeout protection + timeout_ms = ctx.get("timeout_ms", 5000) + try: + # TODO: Add timeout enforcement using threading.Timer or signal.alarm + result = scorer.score(metrics, config or {}, ctx) + if not isinstance(result, dict) or "score" not in result: + raise ValueError("Scorer must return a dict with 'score' key") + return result + except Exception as e: + logger.error(f"Scorer plugin {scorer_plugin_id} failed: {e}", exc_info=True) + raise ValueError(f"Scorer plugin execution failed: {e}") + + @classmethod + def _load_plugin(cls, plugin_id: str, entrypoint: str) -> Any: + """ + Load a scorer plugin by entrypoint. + + Args: + plugin_id: Plugin identifier for caching + entrypoint: Python path like 'pkg.module:ClassName' + + Returns: + Scorer instance or None if loading fails + """ + cache_key = f"{plugin_id}:{entrypoint}" + if cache_key in cls._plugin_cache: + return cls._plugin_cache[cache_key] + + try: + # Parse entrypoint: 'pkg.module:ClassName' + if ":" not in entrypoint: + raise ValueError(f"Invalid entrypoint format: {entrypoint}. 
Expected 'module:ClassName'") + + module_path, class_name = entrypoint.split(":", 1) + + # Dynamic import + import importlib + + module = importlib.import_module(module_path) + scorer_class = getattr(module, class_name) + + # Instantiate + scorer = scorer_class() + + # Cache it + cls._plugin_cache[cache_key] = scorer + logger.info(f"Loaded scorer plugin: {plugin_id} from {entrypoint}") + return scorer + + except Exception as e: + logger.error(f"Failed to load scorer plugin {plugin_id}:{entrypoint}: {e}", exc_info=True) + return None + + @classmethod + def clear_cache(cls) -> None: + """Clear the plugin cache (useful for testing).""" + cls._plugin_cache.clear() diff --git a/api/services/challenge_service.py b/api/services/challenge_service.py new file mode 100644 index 000000000..e2a484647 --- /dev/null +++ b/api/services/challenge_service.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import re +from collections.abc import Mapping +from typing import Any + +from sqlalchemy.orm import Session + +from extensions.ext_database import db +from models.challenge import Challenge, ChallengeAttempt + + +class ChallengeService: + @staticmethod + def evaluate_outcome(output_text: str, cfg: Mapping[str, Any]) -> tuple[bool, dict[str, Any]]: + success_type = cfg.get("success_type", "regex") + pattern = cfg.get("success_pattern") + if success_type == "regex" and pattern: + try: + if re.search(pattern, output_text, flags=re.IGNORECASE | re.MULTILINE): + return True, {"mode": "regex", "matched": True} + return False, {"mode": "regex", "matched": False} + except re.error as e: + return False, {"mode": "regex", "error": f"invalid_regex: {e}"} + if success_type == "contains" and pattern: + return (pattern.lower() in output_text.lower()), {"mode": "contains"} + return False, {"mode": success_type, "info": "no_pattern_or_unsupported"} + + @staticmethod + def record_attempt( + *, + tenant_id: str, + challenge_id: str, + end_user_id: str | None, + account_id: str | None, + workflow_run_id: str | None, + succeeded: bool, + score: float | None = None, + judge_rating: int | None = None, + judge_feedback: str | None = None, + judge_output_raw: dict[str, Any] | None = None, + tokens_total: int | None = None, + elapsed_ms: int | None = None, + session: Session | None = None, + ) -> ChallengeAttempt: + sess = session or db.session + attempt = ChallengeAttempt() + attempt.tenant_id = tenant_id + attempt.challenge_id = challenge_id + attempt.end_user_id = end_user_id + attempt.account_id = account_id + attempt.workflow_run_id = workflow_run_id + attempt.succeeded = succeeded + attempt.score = score + attempt.judge_rating = judge_rating + attempt.judge_feedback = judge_feedback + attempt.judge_output_raw = judge_output_raw + attempt.tokens_total = tokens_total + attempt.elapsed_ms = elapsed_ms + sess.add(attempt) + sess.commit() + return attempt + + diff --git a/api/services/red_blue_service.py b/api/services/red_blue_service.py new file mode 100644 index 000000000..27822a552 --- /dev/null +++ b/api/services/red_blue_service.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from typing import Any + +from sqlalchemy.orm import Session + +from extensions.ext_database import db +from models.red_blue import RedBlueChallenge, TeamPairing, TeamSubmission + + +class RedBlueService: + @staticmethod + def submit_prompt( + *, + challenge_id: str, + tenant_id: str, + team: str, + prompt: str, + account_id: str | None, + end_user_id: str | None, + session: Session | None = None, + ) -> TeamSubmission: + sess = 
session or db.session + sub = TeamSubmission() + sub.red_blue_challenge_id = challenge_id + sub.tenant_id = tenant_id + sub.team = team + sub.prompt = prompt + sub.account_id = account_id + sub.end_user_id = end_user_id + sess.add(sub) + sess.commit() + return sub + + @staticmethod + def select_counterparty_submission( + *, + challenge: RedBlueChallenge, + team: str, + session: Session | None = None, + ) -> TeamSubmission | None: + sess = session or db.session + opposite = "blue" if team == "red" else "red" + # Simplest policy: latest active from opposite team + return ( + sess.query(TeamSubmission) + .filter( + TeamSubmission.red_blue_challenge_id == challenge.id, + TeamSubmission.team == opposite, + TeamSubmission.active.is_(True), + ) + .order_by(TeamSubmission.created_at.desc()) + .first() + ) + + @staticmethod + def record_pairing( + *, + challenge_id: str, + tenant_id: str, + attack_submission_id: str | None, + defense_submission_id: str | None, + judge_output_raw: dict[str, Any] | None, + categories: dict[str, Any] | None, + judge_rating: int | None, + judge_feedback: str | None, + red_points: float, + blue_points: float, + tokens_total: int | None, + elapsed_ms: int | None, + session: Session | None = None, + ) -> TeamPairing: + sess = session or db.session + pairing = TeamPairing() + pairing.red_blue_challenge_id = challenge_id + pairing.tenant_id = tenant_id + pairing.attack_submission_id = attack_submission_id + pairing.defense_submission_id = defense_submission_id + pairing.judge_output_raw = judge_output_raw + pairing.categories = categories + pairing.judge_rating = judge_rating + pairing.judge_feedback = judge_feedback + pairing.red_points = red_points + pairing.blue_points = blue_points + pairing.tokens_total = tokens_total + pairing.elapsed_ms = elapsed_ms + sess.add(pairing) + sess.commit() + return pairing + + diff --git a/api/services/scorers/README.md b/api/services/scorers/README.md new file mode 100644 index 000000000..2ec3e75c6 --- /dev/null +++ b/api/services/scorers/README.md @@ -0,0 +1,144 @@ +# Custom Scorer Plugins + +This directory contains custom scorer plugins for challenge leaderboards. + +## Overview + +Scorers compute numeric scores from challenge attempt metrics (tokens, time, rating, success) for ranking on leaderboards when `scoring_strategy = 'custom'`. + +## Built-in Scorers + +### WeightedScorer + +**Entrypoint:** `services.scorers.weighted:WeightedScorer` + +Computes a weighted score combining multiple metrics with configurable bonuses and penalties. + +**Formula:** +``` +score = success_bonus + + (rating × rating_weight) + - (elapsed_seconds × time_penalty) + - (tokens × token_penalty) +``` + +**Configuration:** +- `success_bonus` (float, default: 100): Base points for successful attempts +- `rating_weight` (float, default: 10): Multiplier for judge rating (0-10) +- `time_penalty` (float, default: 1.0): Penalty per second elapsed +- `token_penalty` (float, default: 0.01): Penalty per token used + +**Example Configuration:** +```json +{ + "success_bonus": 100.0, + "rating_weight": 10.0, + "time_penalty": 1.0, + "token_penalty": 0.01 +} +``` + +**Example Challenge Setup (via API):** +```python +{ + "name": "Advanced Prompt Challenge", + "scoring_strategy": "custom", + "scoring_plugin_id": "builtin.weighted_scorer", + "scoring_entrypoint": "services.scorers.weighted:WeightedScorer", + "scoring_config": { + "success_bonus": 100.0, + "rating_weight": 15.0, + "time_penalty": 0.5, + "token_penalty": 0.02 + } +} +``` + +## Creating Custom Scorers + +### 1. 
Implement the ScorerProtocol + +Create a new file in this directory (e.g., `custom.py`): + +```python +from typing import Any +from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult + +class MyCustomScorer: + def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult: + # Access metrics + succeeded = metrics.get('succeeded', False) + tokens = metrics.get('tokens_total', 0) + elapsed_ms = metrics.get('elapsed_ms', 0) + rating = metrics.get('rating', 0) + + # Access configuration + multiplier = config.get('multiplier', 1.0) + + # Compute score + score = (rating * multiplier) if succeeded else 0.0 + + return { + 'score': score, + 'details': { # optional + 'multiplier_used': multiplier + } + } +``` + +### 2. Register in Challenge + +Set the challenge's scoring fields: + +```python +challenge.scoring_strategy = 'custom' +challenge.scoring_plugin_id = 'my_custom_scorer' +challenge.scoring_entrypoint = 'services.scorers.custom:MyCustomScorer' +challenge.scoring_config = { + 'multiplier': 2.0 +} +``` + +### 3. Testing + +Create tests in `api/tests/unit_tests/services/` following the pattern in `test_challenge_scorer_service.py`. + +## Protocol Reference + +### Input Types + +**AttemptMetrics:** +- `succeeded` (bool): Whether the challenge was passed +- `tokens_total` (int | None): Total tokens used +- `elapsed_ms` (int | None): Time taken in milliseconds +- `rating` (int | None): Judge rating (0-10) +- `created_at` (int | None): Timestamp in epoch milliseconds + +**ScoringContext:** +- `tenant_id` (str): Tenant identifier +- `app_id` (str): Application identifier +- `workflow_id` (str): Workflow identifier +- `challenge_id` (str): Challenge identifier +- `end_user_id` (str | None): End user identifier (if available) +- `timeout_ms` (int): Maximum execution time + +### Output Type + +**ScoringResult:** +- `score` (float, required): Computed numeric score +- `details` (dict[str, Any] | None, optional): Additional scoring details + +## Error Handling + +- Scorers must return a dict with a `score` key +- Exceptions are caught and logged; the attempt is recorded with `score=None` +- Scorers are executed with a timeout (default: 5s) +- Scorers should never return negative scores; use `max(score, 0.0)` to clamp + +## Best Practices + +1. **Keep it simple**: Scoring should be fast and deterministic +2. **Validate config**: Check configuration values and provide defaults +3. **Clamp scores**: Ensure scores are non-negative +4. **Document formula**: Clearly explain how your scorer works +5. **Test edge cases**: Test with missing metrics, zeros, nulls diff --git a/api/services/scorers/__init__.py b/api/services/scorers/__init__.py new file mode 100644 index 000000000..1c78f4779 --- /dev/null +++ b/api/services/scorers/__init__.py @@ -0,0 +1 @@ +"""Built-in scorer plugins.""" diff --git a/api/services/scorers/weighted.py b/api/services/scorers/weighted.py new file mode 100644 index 000000000..f702e54e5 --- /dev/null +++ b/api/services/scorers/weighted.py @@ -0,0 +1,66 @@ +""" +Weighted scorer plugin. + +Computes a weighted score based on success bonus, rating, elapsed time, and token usage. +""" + +from __future__ import annotations + +from typing import Any + +from services.challenge_scorer_protocol import AttemptMetrics, ScoringContext, ScoringResult + + +class WeightedScorer: + """ + Example weighted scorer that combines multiple metrics. 
+ + Configuration options: + - success_bonus (float): Base points for successful attempt (default: 100) + - rating_weight (float): Multiplier for judge rating (default: 10) + - time_penalty (float): Penalty per second elapsed (default: 1.0) + - token_penalty (float): Penalty per token used (default: 0.01) + + Formula: + score = success_bonus + + (rating * rating_weight) + - (elapsed_seconds * time_penalty) + - (tokens * token_penalty) + """ + + def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult: + """Compute weighted score from metrics.""" + # Base score for success + base = 0.0 + if metrics.get("succeeded"): + base += config.get("success_bonus", 100.0) + + # Add rating contribution + rating = metrics.get("rating") or 0 + rating_weight = config.get("rating_weight", 10.0) + rating_score = rating * rating_weight + + # Subtract time penalty + elapsed_ms = metrics.get("elapsed_ms") or 0 + elapsed_seconds = elapsed_ms / 1000.0 + time_penalty = config.get("time_penalty", 1.0) + time_score = elapsed_seconds * time_penalty + + # Subtract token penalty + tokens = metrics.get("tokens_total") or 0 + token_penalty = config.get("token_penalty", 0.01) + token_score = tokens * token_penalty + + # Compute final score (never negative) + final_score = base + rating_score - time_score - token_score + final_score = max(final_score, 0.0) + + return { + "score": final_score, + "details": { + "base": base, + "rating_contribution": rating_score, + "time_penalty": time_score, + "token_penalty": token_score, + }, + } diff --git a/api/tests/test_containers_integration_tests/test_web_challenges.py b/api/tests/test_containers_integration_tests/test_web_challenges.py new file mode 100644 index 000000000..44d61ced5 --- /dev/null +++ b/api/tests/test_containers_integration_tests/test_web_challenges.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +from flask.testing import FlaskClient + + +class TestWebChallenges: + def test_list_and_detail(self, test_client_with_containers: FlaskClient): + # list + resp = test_client_with_containers.get("/api/web/challenges") + assert resp.status_code == 200 + data = resp.get_json() + assert data["result"] == "success" + + diff --git a/api/tests/test_containers_integration_tests/test_web_red_blue_challenges.py b/api/tests/test_containers_integration_tests/test_web_red_blue_challenges.py new file mode 100644 index 000000000..ee01ffac5 --- /dev/null +++ b/api/tests/test_containers_integration_tests/test_web_red_blue_challenges.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from flask.testing import FlaskClient + + +class TestWebRedBlueChallenges: + def test_list(self, test_client_with_containers: FlaskClient): + resp = test_client_with_containers.get("/api/web/red-blue-challenges") + assert resp.status_code == 200 + data = resp.get_json() + assert data["result"] == "success" + + diff --git a/api/tests/unit_tests/services/test_challenge_scorer_service.py b/api/tests/unit_tests/services/test_challenge_scorer_service.py new file mode 100644 index 000000000..4d2746556 --- /dev/null +++ b/api/tests/unit_tests/services/test_challenge_scorer_service.py @@ -0,0 +1,144 @@ +"""Tests for ChallengeScorerService.""" + +from __future__ import annotations + +import pytest + +from services.challenge_scorer_service import ChallengeScorerService + + +class TestChallengeScorerService: + """Test custom scorer plugin loading and execution.""" + + def test_weighted_scorer_success(self): + """Test WeightedScorer with successful attempt.""" 
+        metrics = {
+            "succeeded": True,
+            "tokens_total": 1000,
+            "elapsed_ms": 5000,  # 5 seconds
+            "rating": 8,
+            "created_at": 1730000000000,
+        }
+
+        config = {
+            "success_bonus": 100.0,
+            "rating_weight": 10.0,
+            "time_penalty": 1.0,
+            "token_penalty": 0.01,
+        }
+
+        ctx = {
+            "tenant_id": "test-tenant",
+            "app_id": "test-app",
+            "workflow_id": "test-workflow",
+            "challenge_id": "test-challenge",
+            "timeout_ms": 5000,
+        }
+
+        result = ChallengeScorerService.score_with_plugin(
+            scorer_plugin_id="builtin.weighted_scorer",
+            scorer_entrypoint="services.scorers.weighted:WeightedScorer",
+            metrics=metrics,
+            config=config,
+            ctx=ctx,
+        )
+
+        # Expected: 100 (success) + 80 (8*10 rating) - 5 (5s*1.0) - 10 (1000*0.01) = 165
+        assert result["score"] == 165.0
+        assert "details" in result
+        assert result["details"]["base"] == 100.0
+        assert result["details"]["rating_contribution"] == 80.0
+        assert result["details"]["time_penalty"] == 5.0
+        assert result["details"]["token_penalty"] == 10.0
+
+    def test_weighted_scorer_failure(self):
+        """Test WeightedScorer with failed attempt."""
+        metrics = {
+            "succeeded": False,
+            "tokens_total": 500,
+            "elapsed_ms": 2000,  # 2 seconds
+            "rating": 3,
+            "created_at": 1730000000000,
+        }
+
+        config = {
+            "success_bonus": 100.0,
+            "rating_weight": 10.0,
+            "time_penalty": 1.0,
+            "token_penalty": 0.01,
+        }
+
+        ctx = {
+            "tenant_id": "test-tenant",
+            "app_id": "test-app",
+            "challenge_id": "test-challenge",
+            "timeout_ms": 5000,
+        }
+
+        result = ChallengeScorerService.score_with_plugin(
+            scorer_plugin_id="builtin.weighted_scorer",
+            scorer_entrypoint="services.scorers.weighted:WeightedScorer",
+            metrics=metrics,
+            config=config,
+            ctx=ctx,
+        )
+
+        # Expected: 0 (no success bonus) + 30 (3*10) - 2 (2s*1.0) - 5 (500*0.01) = 23
+        assert result["score"] == 23.0
+
+    def test_weighted_scorer_minimum_zero(self):
+        """Test WeightedScorer never returns negative scores."""
+        metrics = {
+            "succeeded": False,
+            "tokens_total": 10000,  # High token count
+            "elapsed_ms": 30000,  # 30 seconds
+            "rating": 1,
+            "created_at": 1730000000000,
+        }
+
+        config = {
+            "success_bonus": 100.0,
+            "rating_weight": 10.0,
+            "time_penalty": 1.0,
+            "token_penalty": 0.01,
+        }
+
+        ctx = {
+            "tenant_id": "test-tenant",
+            "app_id": "test-app",
+            "challenge_id": "test-challenge",
+            "timeout_ms": 5000,
+        }
+
+        result = ChallengeScorerService.score_with_plugin(
+            scorer_plugin_id="builtin.weighted_scorer",
+            scorer_entrypoint="services.scorers.weighted:WeightedScorer",
+            metrics=metrics,
+            config=config,
+            ctx=ctx,
+        )
+
+        # Expected: 0 + 10 - 30 - 100 = -120, but clamped to 0
+        assert result["score"] == 0.0
+
+    def test_scorer_with_missing_plugin(self):
+        """Test error handling for missing plugin."""
+        with pytest.raises(ValueError, match="Failed to load scorer plugin"):
+            ChallengeScorerService.score_with_plugin(
+                scorer_plugin_id="nonexistent",
+                scorer_entrypoint="nonexistent.module:NonexistentScorer",
+                metrics={},
+                config={},
+                ctx={"timeout_ms": 5000},
+            )
+
+    def test_scorer_with_missing_identifiers(self):
+        """Test error handling when plugin id and entrypoint are not configured."""
+        with pytest.raises(ValueError, match="scorer_plugin_id and scorer_entrypoint are required"):
+            ChallengeScorerService.score_with_plugin(
+                scorer_plugin_id=None,
+                scorer_entrypoint=None,
+                metrics={},
+                config={},
+                ctx={"timeout_ms": 5000},
+            )

diff --git a/api/tests/unit_tests/services/test_challenge_service.py b/api/tests/unit_tests/services/test_challenge_service.py
new file mode 100644
index 000000000..684c0d519
--- /dev/null
+++ b/api/tests/unit_tests/services/test_challenge_service.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from models.challenge import ChallengeAttempt
+from services.challenge_service import ChallengeService
+
+
+def test_evaluate_outcome_regex_match():
+    ok, details = ChallengeService.evaluate_outcome(
+        "Hello SECRET",
+        {"success_type": "regex", "success_pattern": "secret"},
+    )
+    assert ok is True
+    assert details.get("mode") == "regex"
+
+
+def test_evaluate_outcome_contains():
+    ok, _ = ChallengeService.evaluate_outcome(
+        "hello world",
+        {"success_type": "contains", "success_pattern": "world"},
+    )
+    assert ok is True
+
+
+def test_record_attempt_creates_row(mocker):
+    # mock db.session
+    session = mocker.MagicMock()
+    attempt = ChallengeService.record_attempt(
+        tenant_id="t1",
+        challenge_id="c1",
+        end_user_id=None,
+        account_id=None,
+        workflow_run_id=None,
+        succeeded=True,
+        score=10.0,
+        session=session,
+    )
+    assert isinstance(attempt, ChallengeAttempt)
+    session.add.assert_called_once()
+    session.commit.assert_called_once()

diff --git a/api/tests/unit_tests/services/test_red_blue_service.py b/api/tests/unit_tests/services/test_red_blue_service.py
new file mode 100644
index 000000000..a99f980c4
--- /dev/null
+++ b/api/tests/unit_tests/services/test_red_blue_service.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+from services.red_blue_service import RedBlueService
+
+
+def test_submit_prompt_creates_submission(mocker):
+    session = mocker.MagicMock()
+    sub = RedBlueService.submit_prompt(
+        challenge_id="cid",
+        tenant_id="tid",
+        team="red",
+        prompt="attack",
+        account_id="aid",
+        end_user_id="eid",
+        session=session,
+    )
+    assert sub.team == "red"
+    session.add.assert_called_once()
+    session.commit.assert_called_once()
+
+
+def test_select_counterparty_submission_latest_active(mocker):
+    c = SimpleNamespace(id="cid")
+    session = mocker.MagicMock()
+    qs = session.query.return_value.filter.return_value.order_by.return_value
+    qs.first.return_value = SimpleNamespace(id="subid", team="blue")
+    sub = RedBlueService.select_counterparty_submission(challenge=c, team="red", session=session)
+    assert sub.team == "blue"

diff --git a/docs/design/prompt-challenges.md b/docs/design/prompt-challenges.md
new file mode 100644
index 000000000..8cf19b87e
--- /dev/null
+++ b/docs/design/prompt-challenges.md
@@ -0,0 +1,633 @@
+## Prompt Hacking Challenges — Design
+
+### Overview
+
+Enable developer-authored prompt hacking challenges inside Dify’s workflow builder via a new workflow node. Players can register/login using the existing web auth and compete on challenges. Attempts are recorded server-side and leaderboards are exposed via public web APIs.
+
+### Goals
+
+- Add a first-class workflow node that evaluates success/failure against developer-specified criteria.
+- Add a Judging LLM node that compares model outputs to the challenge goal and produces pass/fail, textual feedback, and a 0–10 rating.
+- Persist attempts with metadata for scoring and leaderboards.
+- Reuse existing account/web auth for players.
+- Fit Dify’s DDD/Clean Architecture: models, services, controllers, workflow nodes, and frontend builder integration.
+
+### Non-Goals
+
+- Anti-cheat measures beyond simple rate limiting.
+- Complex custom scoring plugins (design leaves a hook for future work).
+ +## Architecture Summary + +### Backend components + +- Models (SQLAlchemy) + - Challenge + - id, tenant_id, app_id, workflow_id + - name, description, goal (plain text shown to players) + - success_type: one of ['regex', 'contains', 'custom'] + - success_pattern: string (regex or substring depending on type) + - secret_ref: reference to server-side secret (never exposed to clients) + - scoring_strategy: one of ['first', 'fastest', 'fewest_tokens', 'custom'] + - is_active: bool + - created_by, created_at, updated_by, updated_at + - ChallengeAttempt + - id, tenant_id, challenge_id (FK), end_user_id (FK), workflow_run_id (optional FK) + - succeeded: bool + - score: numeric (meaning depends on strategy) + - judge_rating: int (0–10) + - judge_feedback: text + - judge_output_raw: jsonb (optional; structured judgement payload) + - tokens_total: int (when available from run metrics) + - elapsed_ms: int (when available) + - created_at + +- Service layer (e.g., `ChallengeService`, `ChallengeJudgeService`) + - evaluate_outcome(output, cfg) -> (succeeded: bool, details: dict) + - judge_with_llm(goal, response, cfg) -> { passed: bool, rating: int, feedback: str, raw?: dict } + - evaluate_with_plugin(evaluator_ref, goal, response, ctx) -> { passed: bool, rating?: int, feedback?: str, raw?: dict } + - score_with_plugin(scorer_ref, attempt_metrics, ctx) -> { score: number, details?: dict } + - record_attempt(tenant_id, challenge_id, end_user_id, run_meta, succeeded) -> ChallengeAttempt + - get_leaderboard(challenge_id, limit, strategy) -> list + - get_challenge_public(challenge_id) -> dict + +- Controllers + - Console (for creators): CRUD on challenges under the workspace (`/console/api/challenges`) + - Web (for players): public endpoints under `/web/api/challenges` + - List active challenges, fetch details, fetch leaderboard + - Optional auth via existing web login for personalization, otherwise anonymous read + +- Workflow nodes + - NodeType: `challenge-evaluator` + - Config + - `challenge_id`: reference to a stored Challenge (preferred) + - or inline config: `success_type`, `success_pattern`, `scoring_strategy` + - `mask_variables`: string[] — variable names to redact in logs + - Execution + - Consumes upstream content (typically latest assistant output) + - Evaluates success with `ChallengeService.evaluate_outcome` + - If an `EndUser` context exists and a `challenge_id` is present, writes `ChallengeAttempt` + - Outputs `{ challenge_succeeded: boolean, message?: string }`, optionally passes through original output + - NodeType: `judging-llm` + - Purpose: judge a model response against the challenge goal using an LLM rubric. 
+    - Config
+      - `judge_model`: provider/name/version
+      - `temperature`, `max_tokens`, other model params
+      - `rubric_prompt_template`: template with placeholders for {goal}, {response}, optional {hints}
+      - `rating_scale`: default 0–10; configurable upper bound optional
+      - `pass_threshold`: integer (default 5)
+    - Inputs
+      - `goal`: the attacking goal or acceptance criteria
+      - `response`: the model output to evaluate
+    - Execution
+      - Calls `ChallengeJudgeService.judge_with_llm()` to obtain structured judgement
+      - Returns outputs `{ judge_passed: boolean, judge_rating: number (0–10), judge_feedback: string, judge_raw?: object }`
+    - Integration
+      - Downstream `challenge-evaluator` can consume `judge_passed` and `judge_rating` to record an attempt instead of regex/contains
+      - Alternatively, `challenge-evaluator` may support an `evaluation_mode: 'rules' | 'llm-judge'` to invoke judging internally
+  - NodeType: `team-challenge` (Red/Blue orchestrator)
+    - Purpose: orchestrate two-sided challenges where players choose Red (attack) or Blue (defense) and submit prompts accordingly. The node pairs attacks and defenses, configures the LLM, invokes judging, and emits scores.
+    - Config
+      - `red_blue_challenge_id`: reference to a Red/Blue challenge definition
+      - `defense_selection_policy`: 'latest_best' | 'random_active' | 'round_robin' | 'request_new_if_none'
+      - `attack_selection_policy`: same options for the defense side evaluation path
+      - `judge_suite`: list of category tests to run (e.g., CBRNE, SA, SH, RWH, V, M)
+      - `scoring_strategy`: 'red_blue_ratio' | 'custom'
+    - Inputs
+      - `team_choice`: 'red' | 'blue'
+      - `attack_prompt?`: string (when `team_choice = 'red'`)
+      - `defense_prompt?`: string (when `team_choice = 'blue'`)
+    - Execution (high level)
+      - If `team_choice = 'red'`:
+        - Persist `attack_prompt` submission
+        - Load a defense by policy; if none, optionally signal Blue to provide one (async) and fall back to last known
+        - Configure LLM with defense as system prompt, submit attack as user message
+        - Run `judge_suite` via `judging-llm`; compute Red score
+      - If `team_choice = 'blue'`:
+        - Persist `defense_prompt` submission
+        - Load an attack by policy; if none, signal Red to provide one (async) and fall back to last known
+        - Configure LLM with defense as system prompt and submit the loaded attack
+        - Run `judge_suite`; compute Blue score (prevention)
+      - Persist pairing and metrics
+    - Outputs
+      - `{ team: 'red'|'blue', judge_passed: boolean, judge_rating: number, judge_feedback: string, categories: Record<string, boolean | number>, team_points: number, total_points: number }`
+
+### Frontend components
+
+- Workflow builder
+  - Add `Prompt Challenge` to the node palette
+  - Add `Judging LLM` to the node palette
+  - Node editor panel: select existing Challenge or define inline success criteria
+  - Judging panel: choose model, edit rubric prompt, set pass threshold, preview structured outputs
+  - Custom evaluator/scorer panels: choose plugin and configure JSON settings with live schema validation
+  - I18n strings in `web/i18n/en-US/`
+- Challenge display & theming
+  - Author-provided instructions (Markdown) render before/alongside the task input area
+  - Theme tokens (colors, logo, background) applied to challenge pages
+  - Optional hero image/video via existing `UploadFile` and signed URLs
+
+- Optional player UX (phase 2)
+  - `/challenges` list and `/challenges/[id]` details with leaderboard
+  - `/challenge-collections` list and `/challenge-collections/[id]` details with collection leaderboard
+  -
Use existing web login endpoints + +## Data Model + +Minimal table shapes (final columns managed in migration): + +```sql +-- challenges +id (uuid pk) +tenant_id (uuid fk) +app_id (uuid fk) +workflow_id (uuid fk) +name (text) +description (text) +goal (text) +success_type (text) +success_pattern (text) +secret_ref (text) +scoring_strategy (text) +is_active (bool) +created_by (uuid) +created_at (timestamp) +updated_by (uuid) +updated_at (timestamp) + +-- challenge_attempts +id (uuid pk) +tenant_id (uuid fk) +challenge_id (uuid fk) +end_user_id (uuid fk) +workflow_run_id (uuid fk, nullable) +succeeded (bool) +score (numeric) +tokens_total (int) +elapsed_ms (int) +created_at (timestamp) +``` + +Additional columns for judging: + +```sql +ALTER TABLE challenge_attempts + ADD COLUMN judge_rating integer, + ADD COLUMN judge_feedback text, + ADD COLUMN judge_output_raw jsonb; +``` + +Optional columns for custom evaluators/scorers: + +```sql +ALTER TABLE challenges + ADD COLUMN evaluator_type text DEFAULT 'rules', -- one of: rules, llm-judge, custom + ADD COLUMN evaluator_plugin_id text, + ADD COLUMN evaluator_entrypoint text, -- e.g., "pkg.module:Evaluator" + ADD COLUMN evaluator_config jsonb, + ADD COLUMN scoring_plugin_id text, + ADD COLUMN scoring_entrypoint text, -- e.g., "pkg.module:Scorer" + ADD COLUMN scoring_config jsonb; +``` + +Additional tables for Red/Blue team challenges: + +```sql +-- red_blue_challenges (definition) +CREATE TABLE red_blue_challenges ( + id uuid PRIMARY KEY DEFAULT uuid_generate_v4(), + tenant_id uuid NOT NULL REFERENCES tenants(id), + app_id uuid NOT NULL REFERENCES apps(id), + workflow_id uuid REFERENCES workflows(id), + name text NOT NULL, + description text, + judge_suite jsonb NOT NULL, -- list of categories/tests + defense_selection_policy text NOT NULL DEFAULT 'latest_best', + attack_selection_policy text NOT NULL DEFAULT 'latest_best', + scoring_strategy text NOT NULL DEFAULT 'red_blue_ratio', + theme jsonb, + instructions_md text, + is_active boolean NOT NULL DEFAULT true, + created_by uuid, + created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_by uuid, + updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- team_submissions (attack/defense prompts) +CREATE TABLE team_submissions ( + id uuid PRIMARY KEY DEFAULT uuid_generate_v4(), + red_blue_challenge_id uuid NOT NULL REFERENCES red_blue_challenges(id) ON DELETE CASCADE, + tenant_id uuid NOT NULL, + account_id uuid NULL, + end_user_id uuid NULL, + team text NOT NULL CHECK (team in ('red','blue')), + prompt text NOT NULL, + active boolean NOT NULL DEFAULT true, + created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +-- pairings (which attack tested against which defense) +CREATE TABLE team_pairings ( + id uuid PRIMARY KEY DEFAULT uuid_generate_v4(), + red_blue_challenge_id uuid NOT NULL REFERENCES red_blue_challenges(id) ON DELETE CASCADE, + tenant_id uuid NOT NULL, + attack_submission_id uuid REFERENCES team_submissions(id), + defense_submission_id uuid REFERENCES team_submissions(id), + judge_output_raw jsonb, + categories jsonb, -- e.g., per-suite pass/fail or rating + judge_rating integer, + judge_feedback text, + red_points numeric NOT NULL DEFAULT 0, + blue_points numeric NOT NULL DEFAULT 0, + tokens_total int, + elapsed_ms int, + created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP +); +``` + +## API Design + +### Console (creator) + +- `GET /console/api/challenges?app_id=...` — list +- `POST /console/api/challenges` — create +- `GET /console/api/challenges/{id}` 
— retrieve +- `PATCH /console/api/challenges/{id}` — update +- `DELETE /console/api/challenges/{id}` — delete + +All require console `login_required` and membership in tenant. + +### Web (player) + +- `GET /web/api/challenges` — list active challenges (public) +- `GET /web/api/challenges/{id}` — details (public) +- `GET /web/api/challenges/{id}/leaderboard?limit=...` — leaderboard (public) + +Player login uses existing web login endpoints to obtain access token when needed for personalization. + +### Collections + +Console (creator) + +- `GET /console/api/challenge-collections?app_id=...` +- `POST /console/api/challenge-collections` +- `GET /console/api/challenge-collections/{id}` +- `PATCH /console/api/challenge-collections/{id}` +- `DELETE /console/api/challenge-collections/{id}` +- `PUT /console/api/challenge-collections/{id}/challenges` (set membership and order) + +Web (player) + +- `GET /web/api/challenge-collections` — list public collections +- `GET /web/api/challenge-collections/{id}` — collection details (instructions/theme), included challenges +- `GET /web/api/challenge-collections/{id}/leaderboard?limit=...` — collection leaderboard + +### Red/Blue team challenge APIs + +Console (creator) + +- `POST /console/api/red-blue-challenges` — create +- `GET /console/api/red-blue-challenges?app_id=...` — list +- `GET /console/api/red-blue-challenges/{id}` — detail +- `PATCH /console/api/red-blue-challenges/{id}` — update +- `DELETE /console/api/red-blue-challenges/{id}` — delete +- `GET /console/api/red-blue-challenges/{id}/pairings` — view pairings/metrics + +Web (player) + +- `POST /web/api/red-blue-challenges/{id}/join` — join red or blue (payload: { team }) +- `POST /web/api/red-blue-challenges/{id}/submit` — submit attack/defense (payload: { team, prompt }) +- `GET /web/api/red-blue-challenges/{id}` — public info (instructions, theme, leaderboard snapshot) +- `GET /web/api/red-blue-challenges/{id}/leaderboard?limit=...` — red vs blue standings + +## Player Registration & Identity + +### Registration and login + +- Reuse existing web auth service for player accounts: + - Email/password login: `POST /web/api/login` + - Email code login: `POST /web/api/login/email-code/send` + `POST /web/api/login/email-code/verify` (existing patterns) +- Add an explicit web registration endpoint (thin wrapper around `RegisterService.register`): + - `POST /web/api/register` (payload: email, name, password | email-code) + - Behavior: + - `create_workspace_required = False` to avoid auto-creating workspaces for players + - `status = active` + - Set `interface_language` from `Accept-Language` as done in OAuth flow + - On success, also create or associate a per-tenant `EndUser` record so gameplay runs can be attributed consistently. + +### Player identity during runs + +- Each gameplay run already has an `EndUser` context. For registered players: + - When a player is authenticated, resolve (or lazily create) an `EndUser` tied to their `account_id` for the current tenant/app + - Persist `end_user_id` to `ChallengeAttempt` as today; optionally also store `account_id` for simplified leaderboard personalization + +### Optional schema addition + +```sql +ALTER TABLE challenge_attempts + ADD COLUMN account_id uuid NULL; +``` + +This enables direct joins to accounts for notification and profile display without traversing end-user mappings. 
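
A minimal sketch of the lazy `EndUser` resolution described above, assuming the existing `EndUser` model exposes `tenant_id`, `app_id`, `session_id`, and `type` columns and that the account id is reused as the session key (the helper name is illustrative):

```python
from extensions.ext_database import db
from models.model import EndUser


def resolve_end_user_for_account(tenant_id: str, app_id: str, account_id: str) -> EndUser:
    """Find or lazily create the EndUser that attributes runs to this account."""
    end_user = (
        db.session.query(EndUser)
        .filter(
            EndUser.tenant_id == tenant_id,
            EndUser.app_id == app_id,
            EndUser.session_id == account_id,
        )
        .first()
    )
    if end_user is None:
        end_user = EndUser(
            tenant_id=tenant_id,
            app_id=app_id,
            type="browser",
            session_id=account_id,  # assumption: account id doubles as the session key
        )
        db.session.add(end_user)
        db.session.commit()
    return end_user
```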
+ +### Player profile (optional) + +Introduce a lightweight `player_profiles` table for nickname/avatar/notification preferences without touching `account` directly: + +```sql +CREATE TABLE player_profiles ( + account_id uuid PRIMARY KEY REFERENCES accounts(id) ON DELETE CASCADE, + display_name text, + avatar_url text, + notify_on_first_blood boolean DEFAULT true, + notify_on_record_beaten boolean DEFAULT true, + created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP +); +``` + +## Workflow Node Execution + +1. Node receives upstream output (string or structured content). A typical placement is after an LLM node. +2. Node loads Challenge config (stored by `challenge_id` or inline). +3. Node evaluates success by either rules or judging: + - `regex`: test pattern against text output + - `contains`: case-insensitive substring match + - `llm-judge`: call the `judging-llm` node (or internal judge) to obtain `{ judge_passed, judge_rating, judge_feedback }` + - `custom`: call `evaluate_with_plugin` using configured `evaluator_plugin_id`/`evaluator_entrypoint` +4. If `EndUser` and `challenge_id` present, record a `ChallengeAttempt` with run metrics (tokens, elapsed), and when available, `judge_rating`/`judge_feedback`. +5. Node outputs + - Rules mode: `{ challenge_succeeded: boolean, message?: string }` + - Judging mode: `{ challenge_succeeded: boolean, judge_rating: number, judge_feedback: string }` + - Pass through original output for chaining when needed. + +For collections, attempts are recorded per challenge as usual. Collection leaderboard aggregation is computed over a player’s best attempt per challenge, combined using the collection’s `scoring_strategy` (e.g., sum of scores, total elapsed_ms, etc.). + +## Scoring Strategies + +- `first`: first successful attempt time wins (leaderboard sorted by earliest `created_at`). +- `fastest`: success with lowest `elapsed_ms` wins. +- `fewest_tokens`: success with lowest `tokens_total` wins. +- `highest_rating`: success with the highest `judge_rating` wins; ties broken by earliest `created_at`. +- `custom`: compute via `score_with_plugin` using `scoring_plugin_id`/`scoring_entrypoint`. + +Collection strategies: + +- `sum`: sum of per-challenge scores in the collection (uses built-in or custom scoring per challenge) +- `fastest_total`: sum of `elapsed_ms` of successful best attempts (lower is better) +- `fewest_tokens_total`: sum of `tokens_total` of successful best attempts (lower is better) +- `highest_avg_rating`: average of `judge_rating` across completed challenges (higher is better) +- `custom`: plugin-defined; service calls `score_with_plugin` at collection level with a list of per-challenge metrics + +Red/Blue team scoring: + +- Base idea: award Red points for breakthroughs and Blue points for prevented attacks. 
+- Suggested defaults per pairing: + - For each category in the judge suite (e.g., CBRNE, SA, SH, RWH, V, M): + - If the attack bypasses defense (category breach), Red +1 + - If defense prevents (no breach), Blue +1 + - Bonus based on `judge_rating` magnitude for breakthrough severity (e.g., Red +round(rating/3)) + - Time/token penalties can reduce points to encourage efficient strategies +- Ratio-based standings: + - Red ratio = Red points / (Red points + Blue points) + - Blue ratio = Blue points / (Red points + Blue points) +- Custom plugin scoring: + - Provide all pairing metrics to a scorer plugin to compute per-pairing or cumulative standings + +## Custom Evaluators & Scorers + +This section specifies how custom evaluation and scoring plugins integrate with challenges. + +### Concepts + +- Evaluator: decides whether a response meets the goal. May optionally emit a rating (0–10) and textual feedback. +- Scorer: converts an attempt’s metrics (e.g., elapsed time, tokens, rating) into a numeric score for leaderboards. + +### Data model + +- `challenges.evaluator_type`: one of `rules`, `llm-judge`, or `custom`. +- `challenges.evaluator_plugin_id`, `evaluator_entrypoint`, `evaluator_config`: identify and configure the evaluator plugin when `custom` is selected. +- `challenges.scoring_plugin_id`, `scoring_entrypoint`, `scoring_config`: identify and configure the scorer plugin when `scoring_strategy = 'custom'`. + +### Service interfaces + +Evaluator interface (Python): + +```python +class EvaluatorContext(TypedDict, total=False): + tenant_id: str + app_id: str + workflow_id: str + challenge_id: str + end_user_id: str | None + variables: dict[str, Any] # sanitized runtime variables + timeout_ms: int + +class EvaluatorResult(TypedDict, total=False): + passed: bool + rating: int # 0–10 (optional) + feedback: str # textual feedback for player (optional) + raw: dict[str, Any] # internal diagnostics (optional) + +class EvaluatorProtocol(Protocol): + def evaluate(self, goal: str, response: str, config: dict[str, Any], ctx: EvaluatorContext) -> EvaluatorResult: ... +``` + +Scorer interface (Python): + +```python +class ScoringContext(TypedDict, total=False): + tenant_id: str + app_id: str + workflow_id: str + challenge_id: str + end_user_id: str | None + timeout_ms: int + +class AttemptMetrics(TypedDict, total=False): + succeeded: bool + tokens_total: int | None + elapsed_ms: int | None + rating: int | None + created_at: int | None # epoch ms + +class ScoringResult(TypedDict, total=False): + score: float + details: dict[str, Any] | None + +class ScorerProtocol(Protocol): + def score(self, metrics: AttemptMetrics, config: dict[str, Any], ctx: ScoringContext) -> ScoringResult: ... +``` + +### Discovery and loading + +- Plugins are discovered via the existing plugin manager. Each plugin exposes one or more entrypoints (e.g., `pkg.module:Evaluator`). +- `evaluator_plugin_id`/`evaluator_entrypoint` and `scoring_plugin_id`/`scoring_entrypoint` identify the target callables. +- Services load plugins lazily and cache handles with safe import guards. + +### Execution flow + +1) For `evaluator_type = 'custom'`, the `challenge-evaluator` node calls `evaluate_with_plugin` with `(goal, response, evaluator_config, ctx)`. +2) If `EvaluatorResult.passed` is true, set `challenge_succeeded = True` and persist `judge_rating`/`judge_feedback` if provided. +3) For `scoring_strategy = 'custom'`, call `score_with_plugin` with attempt metrics to compute `score`. +4) Persist `ChallengeAttempt` with plugin-derived fields. 
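
The flow above, condensed into a sketch; the evaluator hook is only designed at this stage, so a stub stands in for `evaluate_with_plugin`, while `score_with_plugin` and `record_attempt` follow the service signatures introduced in this change:

```python
from typing import Any

from services.challenge_scorer_service import ChallengeScorerService
from services.challenge_service import ChallengeService


def evaluate_with_plugin_stub(goal: str, response: str) -> dict[str, Any]:
    # stand-in for the designed evaluator hook; a real implementation would
    # load the configured entrypoint much like ChallengeScorerService does
    passed = goal.lower() in response.lower()
    return {"passed": passed, "rating": 8 if passed else 3, "feedback": "stub"}


def run_custom_evaluation(challenge, goal: str, response: str, ctx: dict[str, Any]) -> None:
    # 1)-2) evaluator decides pass/fail plus optional rating/feedback
    verdict = evaluate_with_plugin_stub(goal, response)
    succeeded = bool(verdict.get("passed"))

    # 3) for scoring_strategy == 'custom', convert metrics into a score
    score = None
    if challenge.scoring_strategy == "custom":
        result = ChallengeScorerService.score_with_plugin(
            scorer_plugin_id=challenge.scoring_plugin_id,
            scorer_entrypoint=challenge.scoring_entrypoint,
            metrics={"succeeded": succeeded, "rating": verdict.get("rating")},
            config=challenge.scoring_config,
            ctx=ctx,  # ScoringContext-shaped dict
        )
        score = result["score"]

    # 4) persist the attempt with plugin-derived fields
    ChallengeService.record_attempt(
        tenant_id=ctx["tenant_id"],
        challenge_id=challenge.id,
        end_user_id=ctx.get("end_user_id"),
        account_id=None,
        workflow_run_id=None,
        succeeded=succeeded,
        score=score,
        judge_rating=verdict.get("rating"),
        judge_feedback=verdict.get("feedback"),
    )
```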
+ +### Frontend configuration + +- Prompt Challenge panel + - Evaluation mode: Rules | Judging LLM | Custom Evaluator + - When Custom Evaluator is chosen: + - Plugin selector: lists available evaluator plugins by `plugin_id` and exposed entrypoints + - JSON config editor with schema-based validation (optional `$schema` per plugin) +- Scoring section + - Strategy: First | Fastest | Fewest Tokens | Highest Rating | Custom + - When Custom is chosen: plugin selector + JSON config editor + +### Security & sandboxing + +- Plugins run under server control with: + - Timeouts (default 5s) and memory ceilings; cancellation on overrun + - No network access by default (opt-in allowlist if ever needed) + - Sanitized inputs: secrets removed; only whitelisted variables passed + - Structured error mapping; no stack traces leaked to players + +### Error handling & observability + +- If plugin load or execution fails, treat as non-pass and record a generic failure reason. +- Emit structured logs/events with plugin identifiers and durations (no sensitive content). +- Surface minimal feedback to players; detailed diagnostics remain internal. + +### Examples + +Evaluator (substring with banned terms): + +```python +class SimpleEvaluator: + def evaluate(self, goal, response, config, ctx): + required = config.get('must_contain', []) + banned = set(map(str.lower, config.get('banned', []))) + if any(w.lower() in response.lower() for w in banned): + return {'passed': False, 'feedback': 'Banned content detected', 'rating': 2} + if all(w.lower() in response.lower() for w in required): + return {'passed': True, 'feedback': 'Meets criteria', 'rating': 8} + return {'passed': False, 'feedback': 'Missing required signal', 'rating': 5} +``` + +Scorer (weighted combo): + +```python +class WeightedScorer: + def score(self, metrics, config, ctx): + base = 0.0 + if metrics.get('succeeded'): + base += config.get('success_bonus', 100) + rating = metrics.get('rating') or 0 + elapsed = metrics.get('elapsed_ms') or 0 + tokens = metrics.get('tokens_total') or 0 + score = base + rating * config.get('rating_weight', 10) \ + - (elapsed / 1000.0) * config.get('time_penalty', 1.0) \ + - tokens * config.get('token_penalty', 0.01) + return {'score': max(score, 0.0)} +``` + +## Security & Privacy + +- Never expose `secret_ref` or derived secrets to clients or node outputs. +- Redact configured `mask_variables` in logs and stored attempt details. +- Apply rate limiting using existing helpers to mitigate brute-force attempts. +- Store minimal details on failed attempts to reduce information leakage. + - Sanitize Markdown instructions to prevent XSS; allow a safe subset (links/images) with rel=noopener. + - Theme application is constrained to a whitelist of CSS variables and asset URLs served via signed URLs. 
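
The sandbox timeouts above can be approximated with a worker thread; a minimal sketch, assuming thread-level timeouts are acceptable (true cancellation would need a subprocess sandbox, since Python threads cannot be killed):

```python
from collections.abc import Callable
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import TimeoutError as FutureTimeout
from typing import Any


def call_with_timeout(fn: Callable[..., Any], *args: Any, timeout_ms: int = 5000) -> Any:
    pool = ThreadPoolExecutor(max_workers=1)
    try:
        return pool.submit(fn, *args).result(timeout=timeout_ms / 1000.0)
    except FutureTimeout:
        # map to a generic failure; plugin internals never reach players
        raise ValueError("evaluator/scorer plugin timed out") from None
    finally:
        # don't block on a hung plugin thread
        pool.shutdown(wait=False, cancel_futures=True)
```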
+ +## Testing Plan + +- Service unit tests + - `evaluate_outcome` for regex/contains (edge cases, unicode, multiline) + - `judge_with_llm` deterministic tests with mocked LLM returning structured payloads + - `record_attempt` scoring aggregation and sorting +- Node tests + - Given inputs, assert success/failure and resulting outputs + - Judging node: asserts `{ judge_passed, judge_rating, judge_feedback }` shape and thresholds + - When `challenge_id` present, attempts are written; when not, none are written +- API tests + - Console CRUD happy paths and permissions + - Web endpoints list/details/leaderboard +- Frontend + - Panel validation, serialization/deserialization of node config + - Judging panel: model selection, rubric template binding, threshold validation + - Node palette presence + - Challenge instructions: Markdown renderer sanitization, link and image handling + - Theming: verify CSS variable injection, dark/light modes, and fallback to defaults + - Collections UI: ordering, visibility filtering, collection leaderboard rendering + +## Rollout + +1. DB migrations: create `challenges`, `challenge_attempts` tables; add judging columns. +2. Backend: models, service, console/web controllers, workflow node, `NodeType` and node mapping registration. +3. Frontend: add block enum, node + panel components (Prompt Challenge, Judging LLM), node palette default, i18n entries. +4. QA: run `make lint`, `make type-check`, and unit tests; `pnpm lint` and tests for web. +5. Documentation: link this design from contributor docs as needed. + +## Open Questions / Future Work + +- Anti-cheat signals and anomaly detection. +- Custom evaluator/scoring plugin hooks with sandboxing. +- Team competitions and seasons. +- Per-challenge rate limits and cooldowns. + +## Notifications + +### Events + +- `challenge_first_blood`: emitted when the first successful attempt occurs for a challenge +- `challenge_record_beaten`: emitted when a leaderboard record is surpassed under the active scoring strategy +- `team_pairing_completed`: emitted after each Red/Blue pairing is judged with per-team points + +### Delivery channels + +- In-app (console): add a section in the console UI for challenge events; poll or use server-sent events +- Email (optional): send via existing email task infra (e.g., Celery tasks) +- Webhook (optional): per-tenant webhook endpoint configured in workspace settings to receive challenge events + +### Payloads + +```json +{ + "event": "challenge_record_beaten", + "challenge_id": "...", + "scoring_strategy": "highest_rating", + "previous_record": { "account_id": "...", "score": 95.2 }, + "new_record": { "account_id": "...", "score": 96.8 }, + "occurred_at": 1730000000000 +} +``` + +Red/Blue pairing example: + +```json +{ + "event": "team_pairing_completed", + "red_blue_challenge_id": "...", + "pairing_id": "...", + "attack_submission_id": "...", + "defense_submission_id": "...", + "categories": { "CBRNE": true, "SA": false, "SH": true }, + "judge_rating": 8, + "red_points": 4, + "blue_points": 2, + "occurred_at": 1730000000001 +} +``` + +### Triggers in services + +- After `record_attempt`, re-evaluate leaderboard head for the challenge against the prior head +- If the head changed and meets trigger criteria, enqueue notification tasks +- Respect player profile preferences (`notify_on_first_blood`, `notify_on_record_beaten`) + +### Player-facing feedback + +- Immediate feedback comes from node outputs (e.g., `judge_feedback`, `judge_rating`) +- Aggregated notifications (record beaten, first 
blood) are async and opt-in per player preferences
+

diff --git a/web/__tests__/challenge-submission.test.ts b/web/__tests__/challenge-submission.test.ts
new file mode 100644
index 000000000..6bc6f29e1
--- /dev/null
+++ b/web/__tests__/challenge-submission.test.ts
@@ -0,0 +1,102 @@
+import { submitChallengeAttempt } from '@/service/challenges'
+import { postPublic } from '@/service/base'
+import { PUBLIC_API_PREFIX } from '@/config'
+
+jest.mock('@/service/base', () => ({
+  getPublic: jest.fn(),
+  postPublic: jest.fn(),
+}))
+
+const mockedPostPublic = postPublic as jest.MockedFunction<typeof postPublic>
+const originalFetch = globalThis.fetch
+let fetchMock: jest.Mock
+
+describe('submitChallengeAttempt', () => {
+  beforeEach(() => {
+    fetchMock = jest.fn()
+    globalThis.fetch = fetchMock as unknown as typeof fetch
+
+    mockedPostPublic.mockReset()
+    mockedPostPublic.mockResolvedValue({ result: 'success' } as any)
+
+    localStorage.clear()
+  })
+
+  afterEach(() => {
+    jest.clearAllMocks()
+  })
+
+  afterAll(() => {
+    globalThis.fetch = originalFetch
+  })
+
+  it('throws when challenge web app is not published', async () => {
+    await expect(
+      submitChallengeAttempt('challenge-id', 'app-id', undefined, 'chat', 'hello'),
+    ).rejects.toThrow('Challenge app is not published')
+
+    expect(fetchMock).not.toHaveBeenCalled()
+    expect(mockedPostPublic).not.toHaveBeenCalled()
+  })
+
+  it('requests a passport token and submits chat attempts through /chat-messages', async () => {
+    const passportToken = 'chat-passport-token'
+    fetchMock.mockResolvedValue({
+      ok: true,
+      json: jest.fn().mockResolvedValue({ access_token: passportToken }),
+    })
+
+    await submitChallengeAttempt('challenge-123', 'app-abc', 'site-code-xyz', 'chat', 'solve this')
+
+    expect(fetchMock).toHaveBeenCalledWith(`${PUBLIC_API_PREFIX}/passport`, {
+      method: 'GET',
+      headers: {
+        'X-App-Code': 'site-code-xyz',
+      },
+      credentials: 'include',
+    })
+
+    expect(mockedPostPublic).toHaveBeenCalledWith('/chat-messages', expect.objectContaining({
+      body: {
+        query: 'solve this',
+        inputs: {},
+        response_mode: 'blocking',
+        conversation_id: '',
+      },
+    }))
+
+    const storedToken = JSON.parse(localStorage.getItem('token') || '{}')
+    expect(storedToken.version).toBe(2)
+    expect(storedToken['challenge-123'].DEFAULT).toBe(passportToken)
+  })
+
+  it('requests a passport token and submits workflow attempts through /workflows/run', async () => {
+    const passportToken = 'workflow-passport-token'
+    fetchMock.mockResolvedValue({
+      ok: true,
+      json: jest.fn().mockResolvedValue({ access_token: passportToken }),
+    })
+
+    await submitChallengeAttempt('challenge-456', 'app-def', 'site-code-xyz', 'workflow', 'my answer')
+
+    expect(fetchMock).toHaveBeenCalledWith(`${PUBLIC_API_PREFIX}/passport`, {
+      method: 'GET',
+      headers: {
+        'X-App-Code': 'site-code-xyz',
+      },
+      credentials: 'include',
+    })
+
+    expect(mockedPostPublic).toHaveBeenCalledWith('/workflows/run', expect.objectContaining({
+      body: {
+        inputs: {
+          user_prompt: 'my answer',
+        },
+        response_mode: 'blocking',
+      },
+    }))
+
+    const storedToken = JSON.parse(localStorage.getItem('token') || '{}')
+    expect(storedToken['challenge-456'].DEFAULT).toBe(passportToken)
+  })
+})

diff --git a/web/app/(commonLayout)/console/challenges/create-challenge-modal.tsx b/web/app/(commonLayout)/console/challenges/create-challenge-modal.tsx
new file mode 100644
index 000000000..412ba24dd
--- /dev/null
+++ b/web/app/(commonLayout)/console/challenges/create-challenge-modal.tsx
@@ -0,0 +1,163 @@
+'use client'
+import { useState } from 'react'
+import { useTranslation } from 'react-i18next' +import Modal from '@/app/components/base/modal' +import Button from '@/app/components/base/button' +import Input from '@/app/components/base/input' +import Textarea from '@/app/components/base/textarea' +import Switch from '@/app/components/base/switch' +import Toast from '@/app/components/base/toast' +import Select from '@/app/components/base/select' +import { createConsoleChallenge } from '@/service/console/challenges' +import { useAppFullList } from '@/service/use-apps' +import { useAppWorkflow } from '@/service/use-workflow' + +type Props = { + show: boolean + onHide: () => void + onSuccess: () => void +} + +export default function CreateChallengeModal({ show, onHide, onSuccess }: Props) { + const { t } = useTranslation() + const [form, setForm] = useState({ + app_id: '', + workflow_id: '', + name: '', + description: '', + goal: '', + is_active: true, + }) + const [loading, setLoading] = useState(false) + + const { data: appsData } = useAppFullList() + const apps = appsData?.data || [] + const { data: workflowData } = useAppWorkflow(form.app_id) + const hasWorkflow = !!workflowData?.graph + + const handleSubmit = async () => { + if (!form.app_id || !form.name) { + Toast.notify({ type: 'error', message: 'App ID and Name are required' }) + return + } + + setLoading(true) + try { + await createConsoleChallenge(form) + Toast.notify({ type: 'success', message: 'Challenge created successfully' }) + onSuccess() + } + catch (e: any) { + Toast.notify({ type: 'error', message: e.message || 'Failed to create challenge' }) + } + finally { + setLoading(false) + } + } + + return ( + +
+    <Modal isShow={show} onClose={onHide}>
+      <div className='mb-4'>
+        <div className='mb-1 text-sm font-medium'>{t('challenges.console.form.app')}</div>
+        <Select
+          items={apps.map(app => ({ value: app.id, name: app.name }))}
+          defaultValue={form.app_id}
+          onSelect={item => setForm({ ...form, app_id: String(item.value) })}
+        />
+        {form.app_id && (
+          <div className='mt-1 text-xs'>
+            {workflowData?.id ? `Workflow ID: ${workflowData.id}` : 'No workflow published'}
+          </div>
+        )}
+      </div>
+      <div className='mb-4'>
+        <div className='mb-1 text-sm font-medium'>{t('challenges.console.form.name')}</div>
+        <Input
+          value={form.name}
+          onChange={e => setForm({ ...form, name: e.target.value })}
+          placeholder={t('challenges.console.form.namePlaceholder')}
+        />
+      </div>
+      <div className='mb-4'>
+        <div className='mb-1 text-sm font-medium'>{t('challenges.console.form.description')}</div>
+        <Textarea
+          value={form.description}
+          onChange={e => setForm({ ...form, description: e.target.value })}
+        />
+      </div>
+      <div className='mb-4'>
+        <div className='mb-1 text-sm font-medium'>{t('challenges.console.form.goal')}</div>
+        <Textarea
+          value={form.goal}
+          onChange={e => setForm({ ...form, goal: e.target.value })}
+        />
+      </div>
+      <div className='mb-4 flex items-center gap-2'>
+        <Switch defaultValue={form.is_active} onChange={v => setForm({ ...form, is_active: v })} />
+        <span className='text-sm'>{t('challenges.console.form.isActive')}</span>
+      </div>
+      <div className='flex justify-end gap-2'>
+        <Button onClick={onHide}>{t('common.operation.cancel')}</Button>
+        <Button variant='primary' loading={loading} onClick={handleSubmit}>
+          {t('common.operation.create')}
+        </Button>
+      </div>
+    </Modal>
+  )
+}