feat: propagate judge feedback through challenge workflow results

This commit is contained in:
Joey Yakimowich-Payne 2025-10-01 16:07:40 -06:00
commit 96634e7cec
No known key found for this signature in database
GPG key ID: 6BFE655FA5ABD1E1
6 changed files with 158 additions and 67 deletions

View file

@@ -45,9 +45,18 @@ class AnswerNode(Node):
def _run(self) -> NodeRunResult:
segments = self.graph_runtime_state.variable_pool.convert_template(self._node_data.answer)
files = self._extract_files_from_segments(segments.value)
# Merge answer outputs into existing workflow outputs so upstream metadata (e.g. judge feedback)
# is preserved and included in the final workflow_finished payload.
existing_outputs = self.graph_runtime_state.outputs
merged_outputs: dict[str, Any] = {
**existing_outputs,
'answer': segments.markdown,
'files': ArrayFileSegment(value=files),
}
self.graph_runtime_state.outputs = merged_outputs
return NodeRunResult(
status=WorkflowNodeExecutionStatus.SUCCEEDED,
outputs={"answer": segments.markdown, "files": ArrayFileSegment(value=files)},
outputs=merged_outputs,
)
def _extract_files_from_segments(self, segments: Sequence[Segment]):

View file

@@ -148,6 +148,8 @@ class ChallengeEvaluatorNode(Node):
# optional persistence if config carries challenge_id
challenge_id = self._config.get('challenge_id')
judge_feedback = judge_feedback_from_input if is_judge_input else None
judge_rating_value = judge_rating if is_judge_input else None
if challenge_id:
try:
# Calculate elapsed time in milliseconds
@@ -157,11 +159,17 @@ class ChallengeEvaluatorNode(Node):
tokens_total = self.graph_runtime_state.total_tokens
# Extract judge_rating from details if available (for highest_rating strategy)
judge_rating = None
judge_feedback = None
if isinstance(details, dict):
judge_rating = details.get('rating')
judge_feedback = details.get('feedback')
judge_rating_raw = details.get('rating')
judge_feedback_raw = details.get('feedback')
if judge_rating_raw is not None:
judge_rating_value = int(judge_rating_raw)
if (
judge_feedback_raw is not None
and isinstance(judge_feedback_raw, str)
and judge_feedback_raw.strip()
):
judge_feedback = str(judge_feedback_raw)
# Load challenge to check scoring strategy
challenge = db.session.get(Challenge, str(challenge_id))
@@ -209,7 +217,7 @@ class ChallengeEvaluatorNode(Node):
logger.error("Custom scorer failed: %s", e, exc_info=True)
# Continue with score=None on error
ChallengeService.record_attempt(
_ = ChallengeService.record_attempt(
tenant_id=self.tenant_id,
challenge_id=challenge_id,
end_user_id=None,
@@ -217,8 +225,8 @@ class ChallengeEvaluatorNode(Node):
workflow_run_id=None,
succeeded=ok,
score=score,
judge_rating=judge_rating,
judge_feedback=judge_feedback,
judge_rating=judge_rating_value,
judge_feedback=judge_feedback_from_input or judge_feedback,
tokens_total=tokens_total,
elapsed_ms=elapsed_ms,
session=db.session,
@@ -230,8 +238,8 @@ class ChallengeEvaluatorNode(Node):
# Always provide all output variables to match frontend getOutputVars
outputs: dict[str, Any] = {
'challenge_succeeded': ok,
'judge_rating': 0,
'judge_feedback': '',
'judge_rating': judge_rating_value,
'judge_feedback': judge_feedback_from_input or judge_feedback or '',
'message': '',
}
@@ -251,8 +259,14 @@ class ChallengeEvaluatorNode(Node):
else:
outputs['message'] = f"Failed: {details.get('mode', 'evaluation')} did not match"
# also persist judge outputs onto runtime state so downstream consumers can access them
if outputs['judge_feedback']:
self.graph_runtime_state.set_output('judge_feedback', outputs['judge_feedback'])
if outputs['judge_rating'] is not None:
self.graph_runtime_state.set_output('judge_rating', outputs['judge_rating'])
self.graph_runtime_state.set_output('challenge_succeeded', outputs['challenge_succeeded'])
return NodeRunResult(
status=WorkflowNodeExecutionStatus.SUCCEEDED,
outputs=outputs,
)

View file

@@ -54,6 +54,14 @@ export default function ChallengeDetailPage() {
setHasStreamingResult(false)
}, [])
useEffect(() => {
const previousOverflow = document.body.style.overflowY
document.body.style.overflowY = 'auto'
return () => {
document.body.style.overflowY = previousOverflow
}
}, [])
useEffect(() => () => {
stopStreaming()
}, [stopStreaming])
@@ -80,6 +88,7 @@ export default function ChallengeDetailPage() {
challenge.app_site_code,
challenge.app_mode || 'workflow',
userInput,
challenge.goal,
{
onStreamUpdate: (text) => {
setStreamingText(text)
@@ -99,9 +108,24 @@ export default function ChallengeDetailPage() {
setHasStreamingResult(false)
setStreamingText(result.rawText)
const judgeFeedback = typeof result.outputs?.judge_feedback === 'string' && result.outputs.judge_feedback.trim().length > 0
? result.outputs.judge_feedback
: (typeof result.message === 'string' && result.message.trim().length > 0 ? result.message : undefined)
const fallbackExplanation = typeof result.outputs?.message === 'string' && result.outputs.message.trim().length > 0
? result.outputs.message
: ''
const successFallback = t('challenges.player.defaultSuccessMessage', 'Challenge passed!')
const failureFallback = t('challenges.player.defaultFailureMessage', 'Challenge not passed.')
const judgeFeedbackLine = judgeFeedback
? t('challenges.player.judgeFeedbackLine', { feedback: judgeFeedback, defaultValue: `${judgeFeedback}` })
: ''
const combinedMessage = result.success
? [judgeFeedback || fallbackExplanation || successFallback].filter(Boolean).join('\n')
: [judgeFeedbackLine || fallbackExplanation || failureFallback].filter(Boolean).join('\n')
setLastResult({
success: result.success,
message: result.message,
message: combinedMessage,
rating: result.rating,
})
@@ -143,7 +167,7 @@ export default function ChallengeDetailPage() {
}
return (
<div className='min-h-screen bg-components-panel-bg'>
<div className='min-h-screen overflow-y-auto bg-components-panel-bg'>
<div className='mx-auto max-w-5xl px-4 py-12 sm:px-6 lg:px-8'>
<div className='mb-8'>
<h1 className='mb-2 text-3xl font-bold text-text-primary'>{challenge.name}</h1>
@@ -220,7 +244,7 @@ export default function ChallengeDetailPage() {
{lastResult.success ? t('challenges.player.status.success') : t('challenges.player.status.failed')}
</div>
{lastResult.message && (
<div className='text-sm text-text-secondary'>{lastResult.message}</div>
<div className='whitespace-pre-wrap text-sm text-text-secondary'>{lastResult.message}</div>
)}
{lastResult.rating !== undefined && (
<div className='mt-2 text-sm text-text-tertiary'>

View file

@@ -54,6 +54,10 @@ export default {
awaitingResponse: 'Waiting for the model to respond…',
processing: 'Processing…',
submitButton: 'Submit',
defaultSuccessMessage: 'Challenge passed!',
defaultFailureMessage: 'Challenge not passed.',
ratingLine: 'Judge rating: {{rating}}/10',
judgeFeedbackLine: 'Judge feedback: {{feedback}}',
},
leaderboard: {
title: 'Leaderboard',

View file

@@ -12,6 +12,7 @@ const translation = {
create: 'Created',
remove: 'Removed',
},
loading: 'Loading…',
operation: {
create: 'Create',
confirm: 'Confirm',

View file

@@ -11,6 +11,10 @@ type ChatMessageEnd = {
message?: string
judge_feedback?: string
judge_rating?: number
judge_outputs?: {
feedback?: string
message?: string
}
}
}
@@ -60,6 +64,7 @@ export async function submitChallengeAttempt(
appSiteCode: string | undefined,
appMode: string,
userInput: string,
challengeGoal?: string,
callbacks?: ChallengeAttemptCallbacks,
): Promise<ChallengeAttemptResult> {
if (!appSiteCode)
@@ -114,6 +119,7 @@ export async function submitChallengeAttempt(
let aggregatedText = ''
let finalOutputs: Record<string, any> | undefined
let finalMessage: string | undefined
let extractedJudgeFeedback: string | undefined
let isSettled = false
const releaseAbortController = () => {
callbacks?.onAbortController?.(null)
@@ -141,8 +147,12 @@ export async function submitChallengeAttempt(
const outputs = finalOutputs || {}
const successFlag = Boolean(outputs.challenge_succeeded)
const rating = outputs.judge_rating ?? outputs.rating
const feedback = outputs.judge_feedback || outputs.message || finalMessage || aggregatedText
const message = feedback || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
const feedback = extractedJudgeFeedback
?? (typeof outputs.judge_feedback === 'string' && outputs.judge_feedback.trim().length > 0
? outputs.judge_feedback
: undefined)
const fallbackMessage = outputs.message || finalMessage || aggregatedText
const message = feedback || fallbackMessage || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
return {
success: successFlag,
rating,
@@ -178,58 +188,75 @@ export async function submitChallengeAttempt(
},
}
if (isChatApp) {
ssePost(
'/chat-messages',
{
body: {
query: userInput,
inputs: {},
response_mode: 'streaming',
conversation_id: '',
},
const endpoint = isChatApp ? '/chat-messages' : '/workflows/run'
const body = isChatApp
? {
query: userInput,
inputs: {
challenge_goal: challengeGoal,
},
{
...commonOptions,
onData: (message: string) => {
aggregatedText += message
emitStreamUpdate()
},
onMessageReplace: (messageReplace) => {
aggregatedText = messageReplace.answer
emitStreamUpdate()
},
onMessageEnd: (messageEnd) => {
const metadata = (messageEnd as ChatMessageEnd).metadata
if (metadata?.outputs)
finalOutputs = metadata.outputs
const endMessage = metadata?.answer || metadata?.message || metadata?.judge_feedback
if (endMessage) {
aggregatedText = endMessage
emitStreamUpdate()
}
if (metadata?.judge_feedback)
finalMessage = metadata.judge_feedback
else if (metadata?.answer || metadata?.message)
finalMessage = metadata.answer || metadata.message
},
response_mode: 'streaming',
conversation_id: '',
}
: {
inputs: {
user_prompt: userInput,
challenge_goal: challengeGoal,
},
)
return
}
response_mode: 'streaming',
}
ssePost(
'/workflows/run',
endpoint,
{
body: {
inputs: {
user_prompt: userInput,
},
response_mode: 'streaming',
},
body,
},
{
...commonOptions,
onData: (message: string) => {
aggregatedText += message
emitStreamUpdate()
},
onMessageReplace: (messageReplace) => {
aggregatedText = messageReplace.answer
emitStreamUpdate()
},
onMessageEnd: (messageEnd) => {
const metadata = (messageEnd as ChatMessageEnd).metadata
if (metadata?.outputs) {
finalOutputs = {
...(finalOutputs || {}),
...metadata.outputs,
}
const metaFeedback = metadata.outputs?.judge_feedback
if (typeof metaFeedback === 'string' && metaFeedback.trim().length > 0)
extractedJudgeFeedback = extractedJudgeFeedback || metaFeedback
const outputsMessage = metadata.outputs?.message
if (typeof outputsMessage === 'string' && outputsMessage.trim().length > 0) {
aggregatedText = outputsMessage
emitStreamUpdate()
}
}
if (!extractedJudgeFeedback && metadata?.judge_feedback && metadata.judge_feedback.trim().length > 0)
extractedJudgeFeedback = metadata.judge_feedback
if (metadata?.judge_outputs) {
if (!extractedJudgeFeedback && typeof metadata.judge_outputs.feedback === 'string' && metadata.judge_outputs.feedback.trim().length > 0)
extractedJudgeFeedback = metadata.judge_outputs.feedback
const judgeMessage = metadata.judge_outputs.message
if (typeof judgeMessage === 'string' && judgeMessage.trim().length > 0) {
aggregatedText = judgeMessage
emitStreamUpdate()
}
}
const endMessage = metadata?.answer || metadata?.message
if (endMessage) {
aggregatedText = endMessage
emitStreamUpdate()
}
},
onTextChunk: (chunk) => {
const text = (chunk as any)?.data?.text || ''
if (text) {
@@ -239,14 +266,26 @@ export async function submitChallengeAttempt(
},
onWorkflowFinished: ({ data }) => {
const resultData = (data as WorkflowFinishedResponse['data']) || {}
if (resultData.outputs)
finalOutputs = resultData.outputs
const message = resultData.outputs?.judge_feedback || resultData.outputs?.message
if (message) {
aggregatedText = message
emitStreamUpdate()
if (resultData.outputs) {
finalOutputs = {
...(finalOutputs || {}),
...resultData.outputs,
}
const feedback = resultData.outputs.judge_feedback
if (typeof feedback === 'string' && feedback.trim().length > 0)
extractedJudgeFeedback = extractedJudgeFeedback || feedback
const message = resultData.outputs.message
if (typeof message === 'string' && message.trim().length > 0) {
aggregatedText = message
emitStreamUpdate()
}
}
else if ((data as any)?.metadata?.judge_feedback) {
// Some workflows may emit judge feedback under metadata instead of outputs
const judgeFeedback = (data as any).metadata.judge_feedback
if (typeof judgeFeedback === 'string' && judgeFeedback.trim().length > 0)
extractedJudgeFeedback = extractedJudgeFeedback || judgeFeedback
}
finalMessage = message || finalMessage
},
},
)