feat: propagate judge feedback through challenge workflow results

2025-10-01 16:07:40 -06:00 · 2025-10-01 16:07:40 -06:00 · 96634e7cec
commit 96634e7cec
parent cb4cde0ed2
6 changed files with 158 additions and 67 deletions
--- a/api/core/workflow/nodes/answer/answer_node.py
+++ b/api/core/workflow/nodes/answer/answer_node.py
@ -45,9 +45,18 @@ class AnswerNode(Node):
    def _run(self) -> NodeRunResult:
        segments = self.graph_runtime_state.variable_pool.convert_template(self._node_data.answer)
        files = self._extract_files_from_segments(segments.value)
+        # Layer this node's answer/files on top of the outputs already held by the
+        # runtime state, so upstream entries (e.g. judge feedback) remain present in
+        # the final workflow_finished payload; answer keys win on a name clash.
+        merged_outputs: dict[str, Any] = dict(self.graph_runtime_state.outputs)
+        merged_outputs.update(
+            answer=segments.markdown,
+            files=ArrayFileSegment(value=files),
+        )
+        self.graph_runtime_state.outputs = merged_outputs
        return NodeRunResult(
            status=WorkflowNodeExecutionStatus.SUCCEEDED,
-            outputs={"answer": segments.markdown, "files": ArrayFileSegment(value=files)},
+            outputs=merged_outputs,
        )

    def _extract_files_from_segments(self, segments: Sequence[Segment]):
--- a/api/core/workflow/nodes/challenge_evaluator/node.py
+++ b/api/core/workflow/nodes/challenge_evaluator/node.py
@ -148,6 +148,8 @@ class ChallengeEvaluatorNode(Node):

        # optional persistence if config carries challenge_id
        challenge_id = self._config.get('challenge_id')
+        judge_feedback = judge_feedback_from_input if is_judge_input else None
+        judge_rating_value = judge_rating if is_judge_input else None
        if challenge_id:
            try:
                # Calculate elapsed time in milliseconds
@ -157,11 +159,17 @@ class ChallengeEvaluatorNode(Node):
                tokens_total = self.graph_runtime_state.total_tokens

                # Extract judge_rating from details if available (for highest_rating strategy)
-                judge_rating = None
-                judge_feedback = None
                if isinstance(details, dict):
-                    judge_rating = details.get('rating')
-                    judge_feedback = details.get('feedback')
+                    judge_rating_raw = details.get('rating')
+                    judge_feedback_raw = details.get('feedback')
+                    if judge_rating_raw is not None:
+                        judge_rating_value = int(judge_rating_raw)
+                    if (
+                        judge_feedback_raw is not None
+                        and isinstance(judge_feedback_raw, str)
+                        and judge_feedback_raw.strip()
+                    ):
+                        judge_feedback = str(judge_feedback_raw)

                # Load challenge to check scoring strategy
                challenge = db.session.get(Challenge, str(challenge_id))
@ -209,7 +217,7 @@ class ChallengeEvaluatorNode(Node):
                        logger.error("Custom scorer failed: %s", e, exc_info=True)
                        # Continue with score=None on error

-                ChallengeService.record_attempt(
+                _ = ChallengeService.record_attempt(
                    tenant_id=self.tenant_id,
                    challenge_id=challenge_id,
                    end_user_id=None,
@ -217,8 +225,8 @@ class ChallengeEvaluatorNode(Node):
                    workflow_run_id=None,
                    succeeded=ok,
                    score=score,
-                    judge_rating=judge_rating,
-                    judge_feedback=judge_feedback,
+                    judge_rating=judge_rating_value,
+                    judge_feedback=judge_feedback_from_input or judge_feedback,
                    tokens_total=tokens_total,
                    elapsed_ms=elapsed_ms,
                    session=db.session,
@ -230,8 +238,8 @@ class ChallengeEvaluatorNode(Node):
        # Always provide all output variables to match frontend getOutputVars
        outputs: dict[str, Any] = {
            'challenge_succeeded': ok,
-            'judge_rating': 0,
-            'judge_feedback': '',
+            'judge_rating': judge_rating_value,
+            'judge_feedback': judge_feedback_from_input or judge_feedback or '',
            'message': '',
        }

@ -251,8 +259,14 @@ class ChallengeEvaluatorNode(Node):
                else:
                    outputs['message'] = f"Failed: {details.get('mode', 'evaluation')} did not match"

+        # Mirror results onto runtime state for downstream consumers; judge fields are set only when present
+        if outputs['judge_feedback']:
+            self.graph_runtime_state.set_output('judge_feedback', outputs['judge_feedback'])
+        if outputs['judge_rating'] is not None:
+            self.graph_runtime_state.set_output('judge_rating', outputs['judge_rating'])
+        self.graph_runtime_state.set_output('challenge_succeeded', outputs['challenge_succeeded'])
+
        return NodeRunResult(
            status=WorkflowNodeExecutionStatus.SUCCEEDED,
            outputs=outputs,
        )
-
--- a/web/app/challenges/[id]/page.tsx
+++ b/web/app/challenges/[id]/page.tsx
@ -54,6 +54,14 @@ export default function ChallengeDetailPage() {
    setHasStreamingResult(false)
  }, [])

+  useEffect(() => {
+    const previousOverflow = document.body.style.overflowY
+    document.body.style.overflowY = 'auto'
+    return () => {
+      document.body.style.overflowY = previousOverflow
+    }
+  }, [])
+
  useEffect(() => () => {
    stopStreaming()
  }, [stopStreaming])
@ -80,6 +88,7 @@ export default function ChallengeDetailPage() {
        challenge.app_site_code,
        challenge.app_mode || 'workflow',
        userInput,
+        challenge.goal,
        {
          onStreamUpdate: (text) => {
            setStreamingText(text)
@ -99,9 +108,24 @@ export default function ChallengeDetailPage() {
      setHasStreamingResult(false)
      setStreamingText(result.rawText)

+      const judgeFeedback = typeof result.outputs?.judge_feedback === 'string' && result.outputs.judge_feedback.trim().length > 0
+        ? result.outputs.judge_feedback
+        : (typeof result.message === 'string' && result.message.trim().length > 0 ? result.message : undefined)
+      const fallbackExplanation = typeof result.outputs?.message === 'string' && result.outputs.message.trim().length > 0
+        ? result.outputs.message
+        : ''
+      const successFallback = t('challenges.player.defaultSuccessMessage', 'Challenge passed!')
+      const failureFallback = t('challenges.player.defaultFailureMessage', 'Challenge not passed.')
+      const judgeFeedbackLine = judgeFeedback
+        ? t('challenges.player.judgeFeedbackLine', { feedback: judgeFeedback, defaultValue: `${judgeFeedback}` })
+        : ''
+      const combinedMessage = result.success
+        ? [judgeFeedback || fallbackExplanation || successFallback].filter(Boolean).join('\n')
+        : [judgeFeedbackLine || fallbackExplanation || failureFallback].filter(Boolean).join('\n')
+
      setLastResult({
        success: result.success,
-        message: result.message,
+        message: combinedMessage,
        rating: result.rating,
      })

@ -143,7 +167,7 @@ export default function ChallengeDetailPage() {
  }

  return (
-    <div className='min-h-screen bg-components-panel-bg'>
+    <div className='min-h-screen overflow-y-auto bg-components-panel-bg'>
      <div className='mx-auto max-w-5xl px-4 py-12 sm:px-6 lg:px-8'>
        <div className='mb-8'>
          <h1 className='mb-2 text-3xl font-bold text-text-primary'>{challenge.name}</h1>
@ -220,7 +244,7 @@ export default function ChallengeDetailPage() {
                        {lastResult.success ? t('challenges.player.status.success') : t('challenges.player.status.failed')}
                      </div>
                      {lastResult.message && (
-                        <div className='text-sm text-text-secondary'>{lastResult.message}</div>
+                        <div className='whitespace-pre-wrap text-sm text-text-secondary'>{lastResult.message}</div>
                      )}
                      {lastResult.rating !== undefined && (
                        <div className='mt-2 text-sm text-text-tertiary'>
--- a/web/i18n/en-US/challenges.ts
+++ b/web/i18n/en-US/challenges.ts
@ -54,6 +54,10 @@ export default {
    awaitingResponse: 'Waiting for the model to respond…',
    processing: 'Processing…',
    submitButton: 'Submit',
+    defaultSuccessMessage: 'Challenge passed!',
+    defaultFailureMessage: 'Challenge not passed.',
+    ratingLine: 'Judge rating: {{rating}}/10',
+    judgeFeedbackLine: 'Judge feedback: {{feedback}}',
  },
  leaderboard: {
    title: 'Leaderboard',
--- a/web/i18n/en-US/common.ts
+++ b/web/i18n/en-US/common.ts
@ -12,6 +12,7 @@ const translation = {
    create: 'Created',
    remove: 'Removed',
  },
+  loading: 'Loading…',
  operation: {
    create: 'Create',
    confirm: 'Confirm',
--- a/web/service/challenges.ts
+++ b/web/service/challenges.ts
@ -11,6 +11,10 @@ type ChatMessageEnd = {
    message?: string
    judge_feedback?: string
    judge_rating?: number
+    judge_outputs?: {
+      feedback?: string
+      message?: string
+    }
  }
 }

@ -60,6 +64,7 @@ export async function submitChallengeAttempt(
  appSiteCode: string | undefined,
  appMode: string,
  userInput: string,
+  challengeGoal?: string,
  callbacks?: ChallengeAttemptCallbacks,
 ): Promise<ChallengeAttemptResult> {
  if (!appSiteCode)
@ -114,6 +119,7 @@ export async function submitChallengeAttempt(
    let aggregatedText = ''
    let finalOutputs: Record<string, any> | undefined
    let finalMessage: string | undefined
+    let extractedJudgeFeedback: string | undefined
    let isSettled = false
    const releaseAbortController = () => {
      callbacks?.onAbortController?.(null)
@ -141,8 +147,12 @@ export async function submitChallengeAttempt(
      const outputs = finalOutputs || {}
      const successFlag = Boolean(outputs.challenge_succeeded)
      const rating = outputs.judge_rating ?? outputs.rating
-      const feedback = outputs.judge_feedback || outputs.message || finalMessage || aggregatedText
-      const message = feedback || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
+      const feedback = extractedJudgeFeedback
+        ?? (typeof outputs.judge_feedback === 'string' && outputs.judge_feedback.trim().length > 0
+          ? outputs.judge_feedback
+          : undefined)
+      const fallbackMessage = outputs.message || finalMessage || aggregatedText
+      const message = feedback || fallbackMessage || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
      return {
        success: successFlag,
        rating,
@ -178,58 +188,75 @@ export async function submitChallengeAttempt(
      },
    }

-    if (isChatApp) {
-      ssePost(
-        '/chat-messages',
-        {
-          body: {
-            query: userInput,
-            inputs: {},
-            response_mode: 'streaming',
-            conversation_id: '',
-          },
+    const endpoint = isChatApp ? '/chat-messages' : '/workflows/run'
+    const body = isChatApp
+      ? {
+        query: userInput,
+        inputs: {
+          challenge_goal: challengeGoal,
        },
-        {
-          ...commonOptions,
-          onData: (message: string) => {
-            aggregatedText += message
-            emitStreamUpdate()
-          },
-          onMessageReplace: (messageReplace) => {
-            aggregatedText = messageReplace.answer
-            emitStreamUpdate()
-          },
-          onMessageEnd: (messageEnd) => {
-            const metadata = (messageEnd as ChatMessageEnd).metadata
-            if (metadata?.outputs)
-              finalOutputs = metadata.outputs
-            const endMessage = metadata?.answer || metadata?.message || metadata?.judge_feedback
-            if (endMessage) {
-              aggregatedText = endMessage
-              emitStreamUpdate()
-            }
-            if (metadata?.judge_feedback)
-              finalMessage = metadata.judge_feedback
-            else if (metadata?.answer || metadata?.message)
-              finalMessage = metadata.answer || metadata.message
-          },
+        response_mode: 'streaming',
+        conversation_id: '',
+      }
+      : {
+        inputs: {
+          user_prompt: userInput,
+          challenge_goal: challengeGoal,
        },
-      )
-      return
-    }
+        response_mode: 'streaming',
+      }

    ssePost(
-      '/workflows/run',
+      endpoint,
      {
-        body: {
-          inputs: {
-            user_prompt: userInput,
-          },
-          response_mode: 'streaming',
-        },
+        body,
      },
      {
        ...commonOptions,
+        onData: (message: string) => {
+          aggregatedText += message
+          emitStreamUpdate()
+        },
+        onMessageReplace: (messageReplace) => {
+          aggregatedText = messageReplace.answer
+          emitStreamUpdate()
+        },
+        onMessageEnd: (messageEnd) => {
+          const metadata = (messageEnd as ChatMessageEnd).metadata
+          if (metadata?.outputs) {
+            finalOutputs = {
+              ...(finalOutputs || {}),
+              ...metadata.outputs,
+            }
+            const metaFeedback = metadata.outputs?.judge_feedback
+            if (typeof metaFeedback === 'string' && metaFeedback.trim().length > 0)
+              extractedJudgeFeedback = extractedJudgeFeedback || metaFeedback
+            const outputsMessage = metadata.outputs?.message
+            if (typeof outputsMessage === 'string' && outputsMessage.trim().length > 0) {
+              aggregatedText = outputsMessage
+              emitStreamUpdate()
+            }
+          }
+
+          if (!extractedJudgeFeedback && metadata?.judge_feedback && metadata.judge_feedback.trim().length > 0)
+            extractedJudgeFeedback = metadata.judge_feedback
+
+          if (metadata?.judge_outputs) {
+            if (!extractedJudgeFeedback && typeof metadata.judge_outputs.feedback === 'string' && metadata.judge_outputs.feedback.trim().length > 0)
+              extractedJudgeFeedback = metadata.judge_outputs.feedback
+            const judgeMessage = metadata.judge_outputs.message
+            if (typeof judgeMessage === 'string' && judgeMessage.trim().length > 0) {
+              aggregatedText = judgeMessage
+              emitStreamUpdate()
+            }
+          }
+
+          const endMessage = metadata?.answer || metadata?.message
+          if (endMessage) {
+            aggregatedText = endMessage
+            emitStreamUpdate()
+          }
+        },
        onTextChunk: (chunk) => {
          const text = (chunk as any)?.data?.text || ''
          if (text) {
@ -239,14 +266,26 @@ export async function submitChallengeAttempt(
        },
        onWorkflowFinished: ({ data }) => {
          const resultData = (data as WorkflowFinishedResponse['data']) || {}
-          if (resultData.outputs)
-            finalOutputs = resultData.outputs
-          const message = resultData.outputs?.judge_feedback || resultData.outputs?.message
-          if (message) {
-            aggregatedText = message
-            emitStreamUpdate()
+          if (resultData.outputs) {
+            finalOutputs = {
+              ...(finalOutputs || {}),
+              ...resultData.outputs,
+            }
+            const feedback = resultData.outputs.judge_feedback
+            if (typeof feedback === 'string' && feedback.trim().length > 0)
+              extractedJudgeFeedback = extractedJudgeFeedback || feedback
+            const message = resultData.outputs.message
+            if (typeof message === 'string' && message.trim().length > 0) {
+              aggregatedText = message
+              emitStreamUpdate()
+            }
+          }
+          else if ((data as any)?.metadata?.judge_feedback) {
+            // Some workflows may emit judge feedback under metadata instead of outputs
+            const judgeFeedback = (data as any).metadata.judge_feedback
+            if (typeof judgeFeedback === 'string' && judgeFeedback.trim().length > 0)
+              extractedJudgeFeedback = extractedJudgeFeedback || judgeFeedback
          }
-          finalMessage = message || finalMessage
        },
      },
    )