From 96634e7cec549a92169a60cfd77e4ff95adee48b Mon Sep 17 00:00:00 2001
From: Joey Yakimowich-Payne <jyapayne@pm.me>
Date: Wed, 1 Oct 2025 16:07:40 -0600
Subject: [PATCH] feat: propagate judge feedback through challenge workflow
 results

---
 api/core/workflow/nodes/answer/answer_node.py |  11 +-
 .../nodes/challenge_evaluator/node.py         |  34 ++--
 web/app/challenges/[id]/page.tsx              |  30 +++-
 web/i18n/en-US/challenges.ts                  |   4 +
 web/i18n/en-US/common.ts                      |   1 +
 web/service/challenges.ts                     | 145 +++++++++++-------
 6 files changed, 158 insertions(+), 67 deletions(-)

diff --git a/api/core/workflow/nodes/answer/answer_node.py b/api/core/workflow/nodes/answer/answer_node.py
index 86174c7ea..e9df26006 100644
--- a/api/core/workflow/nodes/answer/answer_node.py
+++ b/api/core/workflow/nodes/answer/answer_node.py
@@ -45,9 +45,18 @@ class AnswerNode(Node):
     def _run(self) -> NodeRunResult:
         segments = self.graph_runtime_state.variable_pool.convert_template(self._node_data.answer)
         files = self._extract_files_from_segments(segments.value)
+        # Merge answer outputs over the existing workflow outputs ('answer'/'files' take precedence) so upstream
+        # metadata (e.g. judge feedback) is preserved and included in the final workflow_finished payload.
+        existing_outputs = self.graph_runtime_state.outputs
+        merged_outputs: dict[str, Any] = {
+            **existing_outputs,
+            'answer': segments.markdown,
+            'files': ArrayFileSegment(value=files),
+        }
+        self.graph_runtime_state.outputs = merged_outputs
         return NodeRunResult(
             status=WorkflowNodeExecutionStatus.SUCCEEDED,
-            outputs={"answer": segments.markdown, "files": ArrayFileSegment(value=files)},
+            outputs=merged_outputs,
         )
 
     def _extract_files_from_segments(self, segments: Sequence[Segment]):
diff --git a/api/core/workflow/nodes/challenge_evaluator/node.py b/api/core/workflow/nodes/challenge_evaluator/node.py
index e06d856ee..c0062b902 100644
--- a/api/core/workflow/nodes/challenge_evaluator/node.py
+++ b/api/core/workflow/nodes/challenge_evaluator/node.py
@@ -148,6 +148,8 @@ class ChallengeEvaluatorNode(Node):
 
         # optional persistence if config carries challenge_id
         challenge_id = self._config.get('challenge_id')
+        judge_feedback = judge_feedback_from_input if is_judge_input else None
+        judge_rating_value = judge_rating if is_judge_input else None
         if challenge_id:
             try:
                 # Calculate elapsed time in milliseconds
@@ -157,11 +159,17 @@ class ChallengeEvaluatorNode(Node):
                 tokens_total = self.graph_runtime_state.total_tokens
 
                 # Extract judge_rating from details if available (for highest_rating strategy)
-                judge_rating = None
-                judge_feedback = None
                 if isinstance(details, dict):
-                    judge_rating = details.get('rating')
-                    judge_feedback = details.get('feedback')
+                    judge_rating_raw = details.get('rating')
+                    judge_feedback_raw = details.get('feedback')
+                    if judge_rating_raw is not None:
+                        judge_rating_value = int(judge_rating_raw)
+                    if (
+                        judge_feedback_raw is not None
+                        and isinstance(judge_feedback_raw, str)
+                        and judge_feedback_raw.strip()
+                    ):
+                        judge_feedback = str(judge_feedback_raw)
 
                 # Load challenge to check scoring strategy
                 challenge = db.session.get(Challenge, str(challenge_id))
@@ -209,7 +217,7 @@ class ChallengeEvaluatorNode(Node):
                         logger.error("Custom scorer failed: %s", e, exc_info=True)
                         # Continue with score=None on error
 
-                ChallengeService.record_attempt(
+                _ = ChallengeService.record_attempt(
                     tenant_id=self.tenant_id,
                     challenge_id=challenge_id,
                     end_user_id=None,
@@ -217,8 +225,8 @@ class ChallengeEvaluatorNode(Node):
                     workflow_run_id=None,
                     succeeded=ok,
                     score=score,
-                    judge_rating=judge_rating,
-                    judge_feedback=judge_feedback,
+                    judge_rating=judge_rating_value,
+                    judge_feedback=judge_feedback_from_input or judge_feedback,
                     tokens_total=tokens_total,
                     elapsed_ms=elapsed_ms,
                     session=db.session,
@@ -230,8 +238,8 @@ class ChallengeEvaluatorNode(Node):
         # Always provide all output variables to match frontend getOutputVars
         outputs: dict[str, Any] = {
             'challenge_succeeded': ok,
-            'judge_rating': 0,
-            'judge_feedback': '',
+            'judge_rating': judge_rating_value,
+            'judge_feedback': judge_feedback_from_input or judge_feedback or '',
             'message': '',
         }
 
@@ -251,8 +259,14 @@ class ChallengeEvaluatorNode(Node):
                 else:
                     outputs['message'] = f"Failed: {details.get('mode', 'evaluation')} did not match"
 
+        # Also persist judge outputs on runtime state for downstream consumers (empty feedback / None rating skipped)
+        if outputs['judge_feedback']:
+            self.graph_runtime_state.set_output('judge_feedback', outputs['judge_feedback'])
+        if outputs['judge_rating'] is not None:
+            self.graph_runtime_state.set_output('judge_rating', outputs['judge_rating'])
+        self.graph_runtime_state.set_output('challenge_succeeded', outputs['challenge_succeeded'])
+
         return NodeRunResult(
             status=WorkflowNodeExecutionStatus.SUCCEEDED,
             outputs=outputs,
         )
-
diff --git a/web/app/challenges/[id]/page.tsx b/web/app/challenges/[id]/page.tsx
index 538bf5ed5..70e5d9974 100644
--- a/web/app/challenges/[id]/page.tsx
+++ b/web/app/challenges/[id]/page.tsx
@@ -54,6 +54,14 @@ export default function ChallengeDetailPage() {
     setHasStreamingResult(false)
   }, [])
 
+  useEffect(() => {
+    const previousOverflow = document.body.style.overflowY
+    document.body.style.overflowY = 'auto'
+    return () => {
+      document.body.style.overflowY = previousOverflow
+    }
+  }, [])
+
   useEffect(() => () => {
     stopStreaming()
   }, [stopStreaming])
@@ -80,6 +88,7 @@ export default function ChallengeDetailPage() {
         challenge.app_site_code,
         challenge.app_mode || 'workflow',
         userInput,
+        challenge.goal,
         {
           onStreamUpdate: (text) => {
             setStreamingText(text)
@@ -99,9 +108,24 @@ export default function ChallengeDetailPage() {
       setHasStreamingResult(false)
       setStreamingText(result.rawText)
 
+      const judgeFeedback = typeof result.outputs?.judge_feedback === 'string' && result.outputs.judge_feedback.trim().length > 0
+        ? result.outputs.judge_feedback
+        : (typeof result.message === 'string' && result.message.trim().length > 0 ? result.message : undefined)
+      const fallbackExplanation = typeof result.outputs?.message === 'string' && result.outputs.message.trim().length > 0
+        ? result.outputs.message
+        : ''
+      const successFallback = t('challenges.player.defaultSuccessMessage', 'Challenge passed!')
+      const failureFallback = t('challenges.player.defaultFailureMessage', 'Challenge not passed.')
+      const judgeFeedbackLine = judgeFeedback
+        ? t('challenges.player.judgeFeedbackLine', { feedback: judgeFeedback, defaultValue: `${judgeFeedback}` })
+        : ''
+      const combinedMessage = result.success
+        ? [judgeFeedback || fallbackExplanation || successFallback].filter(Boolean).join('\n')
+        : [judgeFeedbackLine || fallbackExplanation || failureFallback].filter(Boolean).join('\n')
+
       setLastResult({
         success: result.success,
-        message: result.message,
+        message: combinedMessage,
         rating: result.rating,
       })
 
@@ -143,7 +167,7 @@ export default function ChallengeDetailPage() {
   }
 
   return (
-    <div className='min-h-screen bg-components-panel-bg'>
+    <div className='min-h-screen overflow-y-auto bg-components-panel-bg'>
       <div className='mx-auto max-w-5xl px-4 py-12 sm:px-6 lg:px-8'>
         <div className='mb-8'>
           <h1 className='mb-2 text-3xl font-bold text-text-primary'>{challenge.name}</h1>
@@ -220,7 +244,7 @@ export default function ChallengeDetailPage() {
                         {lastResult.success ? t('challenges.player.status.success') : t('challenges.player.status.failed')}
                       </div>
                       {lastResult.message && (
-                        <div className='text-sm text-text-secondary'>{lastResult.message}</div>
+                        <div className='whitespace-pre-wrap text-sm text-text-secondary'>{lastResult.message}</div>
                       )}
                       {lastResult.rating !== undefined && (
                         <div className='mt-2 text-sm text-text-tertiary'>
diff --git a/web/i18n/en-US/challenges.ts b/web/i18n/en-US/challenges.ts
index af9dc99ab..2f64ff6de 100644
--- a/web/i18n/en-US/challenges.ts
+++ b/web/i18n/en-US/challenges.ts
@@ -54,6 +54,10 @@ export default {
     awaitingResponse: 'Waiting for the model to respond…',
     processing: 'Processing…',
     submitButton: 'Submit',
+    defaultSuccessMessage: 'Challenge passed!',
+    defaultFailureMessage: 'Challenge not passed.',
+    ratingLine: 'Judge rating: {{rating}}/10',
+    judgeFeedbackLine: 'Judge feedback: {{feedback}}',
   },
   leaderboard: {
     title: 'Leaderboard',
diff --git a/web/i18n/en-US/common.ts b/web/i18n/en-US/common.ts
index b9d315388..55800a75d 100644
--- a/web/i18n/en-US/common.ts
+++ b/web/i18n/en-US/common.ts
@@ -12,6 +12,7 @@ const translation = {
     create: 'Created',
     remove: 'Removed',
   },
+  loading: 'Loading…',
   operation: {
     create: 'Create',
     confirm: 'Confirm',
diff --git a/web/service/challenges.ts b/web/service/challenges.ts
index 5ff5992fe..42ea45c2b 100644
--- a/web/service/challenges.ts
+++ b/web/service/challenges.ts
@@ -11,6 +11,10 @@ type ChatMessageEnd = {
     message?: string
     judge_feedback?: string
     judge_rating?: number
+    judge_outputs?: {
+      feedback?: string
+      message?: string
+    }
   }
 }
 
@@ -60,6 +64,7 @@ export async function submitChallengeAttempt(
   appSiteCode: string | undefined,
   appMode: string,
   userInput: string,
+  challengeGoal?: string,
   callbacks?: ChallengeAttemptCallbacks,
 ): Promise<ChallengeAttemptResult> {
   if (!appSiteCode)
@@ -114,6 +119,7 @@ export async function submitChallengeAttempt(
     let aggregatedText = ''
     let finalOutputs: Record<string, any> | undefined
     let finalMessage: string | undefined
+    let extractedJudgeFeedback: string | undefined
     let isSettled = false
     const releaseAbortController = () => {
       callbacks?.onAbortController?.(null)
@@ -141,8 +147,12 @@ export async function submitChallengeAttempt(
       const outputs = finalOutputs || {}
       const successFlag = Boolean(outputs.challenge_succeeded)
       const rating = outputs.judge_rating ?? outputs.rating
-      const feedback = outputs.judge_feedback || outputs.message || finalMessage || aggregatedText
-      const message = feedback || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
+      const feedback = extractedJudgeFeedback
+        ?? (typeof outputs.judge_feedback === 'string' && outputs.judge_feedback.trim().length > 0
+          ? outputs.judge_feedback
+          : undefined)
+      const fallbackMessage = outputs.message || finalMessage || aggregatedText
+      const message = feedback || fallbackMessage || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
       return {
         success: successFlag,
         rating,
@@ -178,58 +188,75 @@ export async function submitChallengeAttempt(
       },
     }
 
-    if (isChatApp) {
-      ssePost(
-        '/chat-messages',
-        {
-          body: {
-            query: userInput,
-            inputs: {},
-            response_mode: 'streaming',
-            conversation_id: '',
-          },
+    const endpoint = isChatApp ? '/chat-messages' : '/workflows/run'
+    const body = isChatApp
+      ? {
+        query: userInput,
+        inputs: {
+          challenge_goal: challengeGoal,
         },
-        {
-          ...commonOptions,
-          onData: (message: string) => {
-            aggregatedText += message
-            emitStreamUpdate()
-          },
-          onMessageReplace: (messageReplace) => {
-            aggregatedText = messageReplace.answer
-            emitStreamUpdate()
-          },
-          onMessageEnd: (messageEnd) => {
-            const metadata = (messageEnd as ChatMessageEnd).metadata
-            if (metadata?.outputs)
-              finalOutputs = metadata.outputs
-            const endMessage = metadata?.answer || metadata?.message || metadata?.judge_feedback
-            if (endMessage) {
-              aggregatedText = endMessage
-              emitStreamUpdate()
-            }
-            if (metadata?.judge_feedback)
-              finalMessage = metadata.judge_feedback
-            else if (metadata?.answer || metadata?.message)
-              finalMessage = metadata.answer || metadata.message
-          },
+        response_mode: 'streaming',
+        conversation_id: '',
+      }
+      : {
+        inputs: {
+          user_prompt: userInput,
+          challenge_goal: challengeGoal,
         },
-      )
-      return
-    }
+        response_mode: 'streaming',
+      }
 
     ssePost(
-      '/workflows/run',
+      endpoint,
       {
-        body: {
-          inputs: {
-            user_prompt: userInput,
-          },
-          response_mode: 'streaming',
-        },
+        body,
       },
       {
         ...commonOptions,
+        onData: (message: string) => {
+          aggregatedText += message
+          emitStreamUpdate()
+        },
+        onMessageReplace: (messageReplace) => {
+          aggregatedText = messageReplace.answer
+          emitStreamUpdate()
+        },
+        onMessageEnd: (messageEnd) => {
+          const metadata = (messageEnd as ChatMessageEnd).metadata
+          if (metadata?.outputs) {
+            finalOutputs = {
+              ...(finalOutputs || {}),
+              ...metadata.outputs,
+            }
+            const metaFeedback = metadata.outputs?.judge_feedback
+            if (typeof metaFeedback === 'string' && metaFeedback.trim().length > 0)
+              extractedJudgeFeedback = extractedJudgeFeedback || metaFeedback
+            const outputsMessage = metadata.outputs?.message
+            if (typeof outputsMessage === 'string' && outputsMessage.trim().length > 0) {
+              aggregatedText = outputsMessage
+              emitStreamUpdate()
+            }
+          }
+
+          if (!extractedJudgeFeedback && metadata?.judge_feedback && metadata.judge_feedback.trim().length > 0)
+            extractedJudgeFeedback = metadata.judge_feedback
+
+          if (metadata?.judge_outputs) {
+            if (!extractedJudgeFeedback && typeof metadata.judge_outputs.feedback === 'string' && metadata.judge_outputs.feedback.trim().length > 0)
+              extractedJudgeFeedback = metadata.judge_outputs.feedback
+            const judgeMessage = metadata.judge_outputs.message
+            if (typeof judgeMessage === 'string' && judgeMessage.trim().length > 0) {
+              aggregatedText = judgeMessage
+              emitStreamUpdate()
+            }
+          }
+
+          const endMessage = metadata?.answer || metadata?.message
+          if (endMessage) {
+            aggregatedText = endMessage
+            emitStreamUpdate()
+          }
+        },
         onTextChunk: (chunk) => {
           const text = (chunk as any)?.data?.text || ''
           if (text) {
@@ -239,14 +266,26 @@ export async function submitChallengeAttempt(
         },
         onWorkflowFinished: ({ data }) => {
           const resultData = (data as WorkflowFinishedResponse['data']) || {}
-          if (resultData.outputs)
-            finalOutputs = resultData.outputs
-          const message = resultData.outputs?.judge_feedback || resultData.outputs?.message
-          if (message) {
-            aggregatedText = message
-            emitStreamUpdate()
+          if (resultData.outputs) {
+            finalOutputs = {
+              ...(finalOutputs || {}),
+              ...resultData.outputs,
+            }
+            const feedback = resultData.outputs.judge_feedback
+            if (typeof feedback === 'string' && feedback.trim().length > 0)
+              extractedJudgeFeedback = extractedJudgeFeedback || feedback
+            const message = resultData.outputs.message
+            if (typeof message === 'string' && message.trim().length > 0) {
+              aggregatedText = message
+              emitStreamUpdate()
+            }
+          }
+          else if ((data as any)?.metadata?.judge_feedback) {
+            // Fallback when outputs are absent: some workflows emit judge feedback under metadata instead
+            const judgeFeedback = (data as any).metadata.judge_feedback
+            if (typeof judgeFeedback === 'string' && judgeFeedback.trim().length > 0)
+              extractedJudgeFeedback = extractedJudgeFeedback || judgeFeedback
           }
-          finalMessage = message || finalMessage
         },
       },
     )