From 96634e7cec549a92169a60cfd77e4ff95adee48b Mon Sep 17 00:00:00 2001 From: Joey Yakimowich-Payne Date: Wed, 1 Oct 2025 16:07:40 -0600 Subject: [PATCH] feat: propagate judge feedback through challenge workflow results --- api/core/workflow/nodes/answer/answer_node.py | 11 +- .../nodes/challenge_evaluator/node.py | 34 ++-- web/app/challenges/[id]/page.tsx | 30 +++- web/i18n/en-US/challenges.ts | 4 + web/i18n/en-US/common.ts | 1 + web/service/challenges.ts | 145 +++++++++++------- 6 files changed, 158 insertions(+), 67 deletions(-) diff --git a/api/core/workflow/nodes/answer/answer_node.py b/api/core/workflow/nodes/answer/answer_node.py index 86174c7ea..e9df26006 100644 --- a/api/core/workflow/nodes/answer/answer_node.py +++ b/api/core/workflow/nodes/answer/answer_node.py @@ -45,9 +45,18 @@ class AnswerNode(Node): def _run(self) -> NodeRunResult: segments = self.graph_runtime_state.variable_pool.convert_template(self._node_data.answer) files = self._extract_files_from_segments(segments.value) + # Merge answer outputs into existing workflow outputs so upstream metadata (e.g. judge feedback) + # is preserved and included in the final workflow_finished payload. + existing_outputs = self.graph_runtime_state.outputs + merged_outputs: dict[str, Any] = { + **existing_outputs, + 'answer': segments.markdown, + 'files': ArrayFileSegment(value=files), + } + self.graph_runtime_state.outputs = merged_outputs return NodeRunResult( status=WorkflowNodeExecutionStatus.SUCCEEDED, - outputs={"answer": segments.markdown, "files": ArrayFileSegment(value=files)}, + outputs=merged_outputs, ) def _extract_files_from_segments(self, segments: Sequence[Segment]): diff --git a/api/core/workflow/nodes/challenge_evaluator/node.py b/api/core/workflow/nodes/challenge_evaluator/node.py index e06d856ee..c0062b902 100644 --- a/api/core/workflow/nodes/challenge_evaluator/node.py +++ b/api/core/workflow/nodes/challenge_evaluator/node.py @@ -148,6 +148,8 @@ class ChallengeEvaluatorNode(Node): # optional persistence if config carries challenge_id challenge_id = self._config.get('challenge_id') + judge_feedback = judge_feedback_from_input if is_judge_input else None + judge_rating_value = judge_rating if is_judge_input else None if challenge_id: try: # Calculate elapsed time in milliseconds @@ -157,11 +159,17 @@ class ChallengeEvaluatorNode(Node): tokens_total = self.graph_runtime_state.total_tokens # Extract judge_rating from details if available (for highest_rating strategy) - judge_rating = None - judge_feedback = None if isinstance(details, dict): - judge_rating = details.get('rating') - judge_feedback = details.get('feedback') + judge_rating_raw = details.get('rating') + judge_feedback_raw = details.get('feedback') + if judge_rating_raw is not None: + judge_rating_value = int(judge_rating_raw) + if ( + judge_feedback_raw is not None + and isinstance(judge_feedback_raw, str) + and judge_feedback_raw.strip() + ): + judge_feedback = str(judge_feedback_raw) # Load challenge to check scoring strategy challenge = db.session.get(Challenge, str(challenge_id)) @@ -209,7 +217,7 @@ class ChallengeEvaluatorNode(Node): logger.error("Custom scorer failed: %s", e, exc_info=True) # Continue with score=None on error - ChallengeService.record_attempt( + _ = ChallengeService.record_attempt( tenant_id=self.tenant_id, challenge_id=challenge_id, end_user_id=None, @@ -217,8 +225,8 @@ class ChallengeEvaluatorNode(Node): workflow_run_id=None, succeeded=ok, score=score, - judge_rating=judge_rating, - judge_feedback=judge_feedback, + judge_rating=judge_rating_value, + judge_feedback=judge_feedback_from_input or judge_feedback, tokens_total=tokens_total, elapsed_ms=elapsed_ms, session=db.session, @@ -230,8 +238,8 @@ class ChallengeEvaluatorNode(Node): # Always provide all output variables to match frontend getOutputVars outputs: dict[str, Any] = { 'challenge_succeeded': ok, - 'judge_rating': 0, - 'judge_feedback': '', + 'judge_rating': judge_rating_value, + 'judge_feedback': judge_feedback_from_input or judge_feedback or '', 'message': '', } @@ -251,8 +259,14 @@ class ChallengeEvaluatorNode(Node): else: outputs['message'] = f"Failed: {details.get('mode', 'evaluation')} did not match" + # also persist judge outputs onto runtime state so downstream consumers can access them + if outputs['judge_feedback']: + self.graph_runtime_state.set_output('judge_feedback', outputs['judge_feedback']) + if outputs['judge_rating'] is not None: + self.graph_runtime_state.set_output('judge_rating', outputs['judge_rating']) + self.graph_runtime_state.set_output('challenge_succeeded', outputs['challenge_succeeded']) + return NodeRunResult( status=WorkflowNodeExecutionStatus.SUCCEEDED, outputs=outputs, ) - diff --git a/web/app/challenges/[id]/page.tsx b/web/app/challenges/[id]/page.tsx index 538bf5ed5..70e5d9974 100644 --- a/web/app/challenges/[id]/page.tsx +++ b/web/app/challenges/[id]/page.tsx @@ -54,6 +54,14 @@ export default function ChallengeDetailPage() { setHasStreamingResult(false) }, []) + useEffect(() => { + const previousOverflow = document.body.style.overflowY + document.body.style.overflowY = 'auto' + return () => { + document.body.style.overflowY = previousOverflow + } + }, []) + useEffect(() => () => { stopStreaming() }, [stopStreaming]) @@ -80,6 +88,7 @@ export default function ChallengeDetailPage() { challenge.app_site_code, challenge.app_mode || 'workflow', userInput, + challenge.goal, { onStreamUpdate: (text) => { setStreamingText(text) @@ -99,9 +108,24 @@ export default function ChallengeDetailPage() { setHasStreamingResult(false) setStreamingText(result.rawText) + const judgeFeedback = typeof result.outputs?.judge_feedback === 'string' && result.outputs.judge_feedback.trim().length > 0 + ? result.outputs.judge_feedback + : (typeof result.message === 'string' && result.message.trim().length > 0 ? result.message : undefined) + const fallbackExplanation = typeof result.outputs?.message === 'string' && result.outputs.message.trim().length > 0 + ? result.outputs.message + : '' + const successFallback = t('challenges.player.defaultSuccessMessage', 'Challenge passed!') + const failureFallback = t('challenges.player.defaultFailureMessage', 'Challenge not passed.') + const judgeFeedbackLine = judgeFeedback + ? t('challenges.player.judgeFeedbackLine', { feedback: judgeFeedback, defaultValue: `${judgeFeedback}` }) + : '' + const combinedMessage = result.success + ? [judgeFeedback || fallbackExplanation || successFallback].filter(Boolean).join('\n') + : [judgeFeedbackLine || fallbackExplanation || failureFallback].filter(Boolean).join('\n') + setLastResult({ success: result.success, - message: result.message, + message: combinedMessage, rating: result.rating, }) @@ -143,7 +167,7 @@ export default function ChallengeDetailPage() { } return ( -
+

{challenge.name}

@@ -220,7 +244,7 @@ export default function ChallengeDetailPage() { {lastResult.success ? t('challenges.player.status.success') : t('challenges.player.status.failed')}
{lastResult.message && ( -
{lastResult.message}
+
{lastResult.message}
)} {lastResult.rating !== undefined && (
diff --git a/web/i18n/en-US/challenges.ts b/web/i18n/en-US/challenges.ts index af9dc99ab..2f64ff6de 100644 --- a/web/i18n/en-US/challenges.ts +++ b/web/i18n/en-US/challenges.ts @@ -54,6 +54,10 @@ export default { awaitingResponse: 'Waiting for the model to respond…', processing: 'Processing…', submitButton: 'Submit', + defaultSuccessMessage: 'Challenge passed!', + defaultFailureMessage: 'Challenge not passed.', + ratingLine: 'Judge rating: {{rating}}/10', + judgeFeedbackLine: 'Judge feedback: {{feedback}}', }, leaderboard: { title: 'Leaderboard', diff --git a/web/i18n/en-US/common.ts b/web/i18n/en-US/common.ts index b9d315388..55800a75d 100644 --- a/web/i18n/en-US/common.ts +++ b/web/i18n/en-US/common.ts @@ -12,6 +12,7 @@ const translation = { create: 'Created', remove: 'Removed', }, + loading: 'Loading…', operation: { create: 'Create', confirm: 'Confirm', diff --git a/web/service/challenges.ts b/web/service/challenges.ts index 5ff5992fe..42ea45c2b 100644 --- a/web/service/challenges.ts +++ b/web/service/challenges.ts @@ -11,6 +11,10 @@ type ChatMessageEnd = { message?: string judge_feedback?: string judge_rating?: number + judge_outputs?: { + feedback?: string + message?: string + } } } @@ -60,6 +64,7 @@ export async function submitChallengeAttempt( appSiteCode: string | undefined, appMode: string, userInput: string, + challengeGoal?: string, callbacks?: ChallengeAttemptCallbacks, ): Promise { if (!appSiteCode) @@ -114,6 +119,7 @@ export async function submitChallengeAttempt( let aggregatedText = '' let finalOutputs: Record | undefined let finalMessage: string | undefined + let extractedJudgeFeedback: string | undefined let isSettled = false const releaseAbortController = () => { callbacks?.onAbortController?.(null) @@ -141,8 +147,12 @@ export async function submitChallengeAttempt( const outputs = finalOutputs || {} const successFlag = Boolean(outputs.challenge_succeeded) const rating = outputs.judge_rating ?? outputs.rating - const feedback = outputs.judge_feedback || outputs.message || finalMessage || aggregatedText - const message = feedback || (successFlag ? 'Challenge passed!' : 'Challenge not passed.') + const feedback = extractedJudgeFeedback + ?? (typeof outputs.judge_feedback === 'string' && outputs.judge_feedback.trim().length > 0 + ? outputs.judge_feedback + : undefined) + const fallbackMessage = outputs.message || finalMessage || aggregatedText + const message = feedback || fallbackMessage || (successFlag ? 'Challenge passed!' : 'Challenge not passed.') return { success: successFlag, rating, @@ -178,58 +188,75 @@ export async function submitChallengeAttempt( }, } - if (isChatApp) { - ssePost( - '/chat-messages', - { - body: { - query: userInput, - inputs: {}, - response_mode: 'streaming', - conversation_id: '', - }, + const endpoint = isChatApp ? '/chat-messages' : '/workflows/run' + const body = isChatApp + ? { + query: userInput, + inputs: { + challenge_goal: challengeGoal, }, - { - ...commonOptions, - onData: (message: string) => { - aggregatedText += message - emitStreamUpdate() - }, - onMessageReplace: (messageReplace) => { - aggregatedText = messageReplace.answer - emitStreamUpdate() - }, - onMessageEnd: (messageEnd) => { - const metadata = (messageEnd as ChatMessageEnd).metadata - if (metadata?.outputs) - finalOutputs = metadata.outputs - const endMessage = metadata?.answer || metadata?.message || metadata?.judge_feedback - if (endMessage) { - aggregatedText = endMessage - emitStreamUpdate() - } - if (metadata?.judge_feedback) - finalMessage = metadata.judge_feedback - else if (metadata?.answer || metadata?.message) - finalMessage = metadata.answer || metadata.message - }, + response_mode: 'streaming', + conversation_id: '', + } + : { + inputs: { + user_prompt: userInput, + challenge_goal: challengeGoal, }, - ) - return - } + response_mode: 'streaming', + } ssePost( - '/workflows/run', + endpoint, { - body: { - inputs: { - user_prompt: userInput, - }, - response_mode: 'streaming', - }, + body, }, { ...commonOptions, + onData: (message: string) => { + aggregatedText += message + emitStreamUpdate() + }, + onMessageReplace: (messageReplace) => { + aggregatedText = messageReplace.answer + emitStreamUpdate() + }, + onMessageEnd: (messageEnd) => { + const metadata = (messageEnd as ChatMessageEnd).metadata + if (metadata?.outputs) { + finalOutputs = { + ...(finalOutputs || {}), + ...metadata.outputs, + } + const metaFeedback = metadata.outputs?.judge_feedback + if (typeof metaFeedback === 'string' && metaFeedback.trim().length > 0) + extractedJudgeFeedback = extractedJudgeFeedback || metaFeedback + const outputsMessage = metadata.outputs?.message + if (typeof outputsMessage === 'string' && outputsMessage.trim().length > 0) { + aggregatedText = outputsMessage + emitStreamUpdate() + } + } + + if (!extractedJudgeFeedback && metadata?.judge_feedback && metadata.judge_feedback.trim().length > 0) + extractedJudgeFeedback = metadata.judge_feedback + + if (metadata?.judge_outputs) { + if (!extractedJudgeFeedback && typeof metadata.judge_outputs.feedback === 'string' && metadata.judge_outputs.feedback.trim().length > 0) + extractedJudgeFeedback = metadata.judge_outputs.feedback + const judgeMessage = metadata.judge_outputs.message + if (typeof judgeMessage === 'string' && judgeMessage.trim().length > 0) { + aggregatedText = judgeMessage + emitStreamUpdate() + } + } + + const endMessage = metadata?.answer || metadata?.message + if (endMessage) { + aggregatedText = endMessage + emitStreamUpdate() + } + }, onTextChunk: (chunk) => { const text = (chunk as any)?.data?.text || '' if (text) { @@ -239,14 +266,26 @@ export async function submitChallengeAttempt( }, onWorkflowFinished: ({ data }) => { const resultData = (data as WorkflowFinishedResponse['data']) || {} - if (resultData.outputs) - finalOutputs = resultData.outputs - const message = resultData.outputs?.judge_feedback || resultData.outputs?.message - if (message) { - aggregatedText = message - emitStreamUpdate() + if (resultData.outputs) { + finalOutputs = { + ...(finalOutputs || {}), + ...resultData.outputs, + } + const feedback = resultData.outputs.judge_feedback + if (typeof feedback === 'string' && feedback.trim().length > 0) + extractedJudgeFeedback = extractedJudgeFeedback || feedback + const message = resultData.outputs.message + if (typeof message === 'string' && message.trim().length > 0) { + aggregatedText = message + emitStreamUpdate() + } + } + else if ((data as any)?.metadata?.judge_feedback) { + // Some workflows may emit judge feedback under metadata instead of outputs + const judgeFeedback = (data as any).metadata.judge_feedback + if (typeof judgeFeedback === 'string' && judgeFeedback.trim().length > 0) + extractedJudgeFeedback = extractedJudgeFeedback || judgeFeedback } - finalMessage = message || finalMessage }, }, )