feat: propagate judge feedback through challenge workflow results
This commit is contained in:
parent
cb4cde0ed2
commit
96634e7cec
6 changed files with 158 additions and 67 deletions
|
|
@ -45,9 +45,18 @@ class AnswerNode(Node):
|
|||
def _run(self) -> NodeRunResult:
    """Convert the answer template and publish it together with upstream outputs.

    The node's answer template is converted through the variable pool, the
    files referenced by the resulting segments are collected, and both are
    merged into the outputs already stored on the graph runtime state.

    Returns:
        A succeeded ``NodeRunResult`` whose ``outputs`` is the merged mapping
        (existing workflow outputs plus this node's ``answer`` and ``files``).
    """
    runtime_state = self.graph_runtime_state
    segments = runtime_state.variable_pool.convert_template(self._node_data.answer)
    files = self._extract_files_from_segments(segments.value)

    # Merge answer outputs into existing workflow outputs so upstream metadata
    # (e.g. judge feedback) is preserved and included in the final
    # workflow_finished payload. Keys listed after the unpacking win, so this
    # node's 'answer' and 'files' replace any earlier values under those keys.
    merged_outputs: dict[str, Any] = {
        **runtime_state.outputs,
        'answer': segments.markdown,
        'files': ArrayFileSegment(value=files),
    }
    runtime_state.outputs = merged_outputs

    # Exactly one ``outputs`` keyword: the merged mapping. Passing the keyword
    # twice (old answer-only dict plus the merged one) is a SyntaxError.
    return NodeRunResult(
        status=WorkflowNodeExecutionStatus.SUCCEEDED,
        outputs=merged_outputs,
    )
|
||||
|
||||
def _extract_files_from_segments(self, segments: Sequence[Segment]):
|
||||
|
|
|
|||
|
|
@ -148,6 +148,8 @@ class ChallengeEvaluatorNode(Node):
|
|||
|
||||
# optional persistence if config carries challenge_id
|
||||
challenge_id = self._config.get('challenge_id')
|
||||
judge_feedback = judge_feedback_from_input if is_judge_input else None
|
||||
judge_rating_value = judge_rating if is_judge_input else None
|
||||
if challenge_id:
|
||||
try:
|
||||
# Calculate elapsed time in milliseconds
|
||||
|
|
@ -157,11 +159,17 @@ class ChallengeEvaluatorNode(Node):
|
|||
tokens_total = self.graph_runtime_state.total_tokens
|
||||
|
||||
# Extract judge_rating from details if available (for highest_rating strategy)
|
||||
judge_rating = None
|
||||
judge_feedback = None
|
||||
if isinstance(details, dict):
|
||||
judge_rating = details.get('rating')
|
||||
judge_feedback = details.get('feedback')
|
||||
judge_rating_raw = details.get('rating')
|
||||
judge_feedback_raw = details.get('feedback')
|
||||
if judge_rating_raw is not None:
|
||||
judge_rating_value = int(judge_rating_raw)
|
||||
if (
|
||||
judge_feedback_raw is not None
|
||||
and isinstance(judge_feedback_raw, str)
|
||||
and judge_feedback_raw.strip()
|
||||
):
|
||||
judge_feedback = str(judge_feedback_raw)
|
||||
|
||||
# Load challenge to check scoring strategy
|
||||
challenge = db.session.get(Challenge, str(challenge_id))
|
||||
|
|
@ -209,7 +217,7 @@ class ChallengeEvaluatorNode(Node):
|
|||
logger.error("Custom scorer failed: %s", e, exc_info=True)
|
||||
# Continue with score=None on error
|
||||
|
||||
ChallengeService.record_attempt(
|
||||
_ = ChallengeService.record_attempt(
|
||||
tenant_id=self.tenant_id,
|
||||
challenge_id=challenge_id,
|
||||
end_user_id=None,
|
||||
|
|
@ -217,8 +225,8 @@ class ChallengeEvaluatorNode(Node):
|
|||
workflow_run_id=None,
|
||||
succeeded=ok,
|
||||
score=score,
|
||||
judge_rating=judge_rating,
|
||||
judge_feedback=judge_feedback,
|
||||
judge_rating=judge_rating_value,
|
||||
judge_feedback=judge_feedback_from_input or judge_feedback,
|
||||
tokens_total=tokens_total,
|
||||
elapsed_ms=elapsed_ms,
|
||||
session=db.session,
|
||||
|
|
@ -230,8 +238,8 @@ class ChallengeEvaluatorNode(Node):
|
|||
# Always provide all output variables to match frontend getOutputVars
|
||||
outputs: dict[str, Any] = {
|
||||
'challenge_succeeded': ok,
|
||||
'judge_rating': 0,
|
||||
'judge_feedback': '',
|
||||
'judge_rating': judge_rating_value,
|
||||
'judge_feedback': judge_feedback_from_input or judge_feedback or '',
|
||||
'message': '',
|
||||
}
|
||||
|
||||
|
|
@ -251,8 +259,14 @@ class ChallengeEvaluatorNode(Node):
|
|||
else:
|
||||
outputs['message'] = f"Failed: {details.get('mode', 'evaluation')} did not match"
|
||||
|
||||
# also persist judge outputs onto runtime state so downstream consumers can access them
|
||||
if outputs['judge_feedback']:
|
||||
self.graph_runtime_state.set_output('judge_feedback', outputs['judge_feedback'])
|
||||
if outputs['judge_rating'] is not None:
|
||||
self.graph_runtime_state.set_output('judge_rating', outputs['judge_rating'])
|
||||
self.graph_runtime_state.set_output('challenge_succeeded', outputs['challenge_succeeded'])
|
||||
|
||||
return NodeRunResult(
|
||||
status=WorkflowNodeExecutionStatus.SUCCEEDED,
|
||||
outputs=outputs,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -54,6 +54,14 @@ export default function ChallengeDetailPage() {
|
|||
setHasStreamingResult(false)
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
const previousOverflow = document.body.style.overflowY
|
||||
document.body.style.overflowY = 'auto'
|
||||
return () => {
|
||||
document.body.style.overflowY = previousOverflow
|
||||
}
|
||||
}, [])
|
||||
|
||||
useEffect(() => () => {
|
||||
stopStreaming()
|
||||
}, [stopStreaming])
|
||||
|
|
@ -80,6 +88,7 @@ export default function ChallengeDetailPage() {
|
|||
challenge.app_site_code,
|
||||
challenge.app_mode || 'workflow',
|
||||
userInput,
|
||||
challenge.goal,
|
||||
{
|
||||
onStreamUpdate: (text) => {
|
||||
setStreamingText(text)
|
||||
|
|
@ -99,9 +108,24 @@ export default function ChallengeDetailPage() {
|
|||
setHasStreamingResult(false)
|
||||
setStreamingText(result.rawText)
|
||||
|
||||
const judgeFeedback = typeof result.outputs?.judge_feedback === 'string' && result.outputs.judge_feedback.trim().length > 0
|
||||
? result.outputs.judge_feedback
|
||||
: (typeof result.message === 'string' && result.message.trim().length > 0 ? result.message : undefined)
|
||||
const fallbackExplanation = typeof result.outputs?.message === 'string' && result.outputs.message.trim().length > 0
|
||||
? result.outputs.message
|
||||
: ''
|
||||
const successFallback = t('challenges.player.defaultSuccessMessage', 'Challenge passed!')
|
||||
const failureFallback = t('challenges.player.defaultFailureMessage', 'Challenge not passed.')
|
||||
const judgeFeedbackLine = judgeFeedback
|
||||
? t('challenges.player.judgeFeedbackLine', { feedback: judgeFeedback, defaultValue: `${judgeFeedback}` })
|
||||
: ''
|
||||
const combinedMessage = result.success
|
||||
? [judgeFeedback || fallbackExplanation || successFallback].filter(Boolean).join('\n')
|
||||
: [judgeFeedbackLine || fallbackExplanation || failureFallback].filter(Boolean).join('\n')
|
||||
|
||||
setLastResult({
|
||||
success: result.success,
|
||||
message: result.message,
|
||||
message: combinedMessage,
|
||||
rating: result.rating,
|
||||
})
|
||||
|
||||
|
|
@ -143,7 +167,7 @@ export default function ChallengeDetailPage() {
|
|||
}
|
||||
|
||||
return (
|
||||
<div className='min-h-screen bg-components-panel-bg'>
|
||||
<div className='min-h-screen overflow-y-auto bg-components-panel-bg'>
|
||||
<div className='mx-auto max-w-5xl px-4 py-12 sm:px-6 lg:px-8'>
|
||||
<div className='mb-8'>
|
||||
<h1 className='mb-2 text-3xl font-bold text-text-primary'>{challenge.name}</h1>
|
||||
|
|
@ -220,7 +244,7 @@ export default function ChallengeDetailPage() {
|
|||
{lastResult.success ? t('challenges.player.status.success') : t('challenges.player.status.failed')}
|
||||
</div>
|
||||
{lastResult.message && (
|
||||
<div className='text-sm text-text-secondary'>{lastResult.message}</div>
|
||||
<div className='whitespace-pre-wrap text-sm text-text-secondary'>{lastResult.message}</div>
|
||||
)}
|
||||
{lastResult.rating !== undefined && (
|
||||
<div className='mt-2 text-sm text-text-tertiary'>
|
||||
|
|
|
|||
|
|
@ -54,6 +54,10 @@ export default {
|
|||
awaitingResponse: 'Waiting for the model to respond…',
|
||||
processing: 'Processing…',
|
||||
submitButton: 'Submit',
|
||||
defaultSuccessMessage: 'Challenge passed!',
|
||||
defaultFailureMessage: 'Challenge not passed.',
|
||||
ratingLine: 'Judge rating: {{rating}}/10',
|
||||
judgeFeedbackLine: 'Judge feedback: {{feedback}}',
|
||||
},
|
||||
leaderboard: {
|
||||
title: 'Leaderboard',
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ const translation = {
|
|||
create: 'Created',
|
||||
remove: 'Removed',
|
||||
},
|
||||
loading: 'Loading…',
|
||||
operation: {
|
||||
create: 'Create',
|
||||
confirm: 'Confirm',
|
||||
|
|
|
|||
|
|
@ -11,6 +11,10 @@ type ChatMessageEnd = {
|
|||
message?: string
|
||||
judge_feedback?: string
|
||||
judge_rating?: number
|
||||
judge_outputs?: {
|
||||
feedback?: string
|
||||
message?: string
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -60,6 +64,7 @@ export async function submitChallengeAttempt(
|
|||
appSiteCode: string | undefined,
|
||||
appMode: string,
|
||||
userInput: string,
|
||||
challengeGoal?: string,
|
||||
callbacks?: ChallengeAttemptCallbacks,
|
||||
): Promise<ChallengeAttemptResult> {
|
||||
if (!appSiteCode)
|
||||
|
|
@ -114,6 +119,7 @@ export async function submitChallengeAttempt(
|
|||
let aggregatedText = ''
|
||||
let finalOutputs: Record<string, any> | undefined
|
||||
let finalMessage: string | undefined
|
||||
let extractedJudgeFeedback: string | undefined
|
||||
let isSettled = false
|
||||
const releaseAbortController = () => {
|
||||
callbacks?.onAbortController?.(null)
|
||||
|
|
@ -141,8 +147,12 @@ export async function submitChallengeAttempt(
|
|||
const outputs = finalOutputs || {}
|
||||
const successFlag = Boolean(outputs.challenge_succeeded)
|
||||
const rating = outputs.judge_rating ?? outputs.rating
|
||||
const feedback = outputs.judge_feedback || outputs.message || finalMessage || aggregatedText
|
||||
const message = feedback || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
|
||||
const feedback = extractedJudgeFeedback
|
||||
?? (typeof outputs.judge_feedback === 'string' && outputs.judge_feedback.trim().length > 0
|
||||
? outputs.judge_feedback
|
||||
: undefined)
|
||||
const fallbackMessage = outputs.message || finalMessage || aggregatedText
|
||||
const message = feedback || fallbackMessage || (successFlag ? 'Challenge passed!' : 'Challenge not passed.')
|
||||
return {
|
||||
success: successFlag,
|
||||
rating,
|
||||
|
|
@ -178,58 +188,75 @@ export async function submitChallengeAttempt(
|
|||
},
|
||||
}
|
||||
|
||||
if (isChatApp) {
|
||||
ssePost(
|
||||
'/chat-messages',
|
||||
{
|
||||
body: {
|
||||
query: userInput,
|
||||
inputs: {},
|
||||
response_mode: 'streaming',
|
||||
conversation_id: '',
|
||||
},
|
||||
const endpoint = isChatApp ? '/chat-messages' : '/workflows/run'
|
||||
const body = isChatApp
|
||||
? {
|
||||
query: userInput,
|
||||
inputs: {
|
||||
challenge_goal: challengeGoal,
|
||||
},
|
||||
{
|
||||
...commonOptions,
|
||||
onData: (message: string) => {
|
||||
aggregatedText += message
|
||||
emitStreamUpdate()
|
||||
},
|
||||
onMessageReplace: (messageReplace) => {
|
||||
aggregatedText = messageReplace.answer
|
||||
emitStreamUpdate()
|
||||
},
|
||||
onMessageEnd: (messageEnd) => {
|
||||
const metadata = (messageEnd as ChatMessageEnd).metadata
|
||||
if (metadata?.outputs)
|
||||
finalOutputs = metadata.outputs
|
||||
const endMessage = metadata?.answer || metadata?.message || metadata?.judge_feedback
|
||||
if (endMessage) {
|
||||
aggregatedText = endMessage
|
||||
emitStreamUpdate()
|
||||
}
|
||||
if (metadata?.judge_feedback)
|
||||
finalMessage = metadata.judge_feedback
|
||||
else if (metadata?.answer || metadata?.message)
|
||||
finalMessage = metadata.answer || metadata.message
|
||||
},
|
||||
response_mode: 'streaming',
|
||||
conversation_id: '',
|
||||
}
|
||||
: {
|
||||
inputs: {
|
||||
user_prompt: userInput,
|
||||
challenge_goal: challengeGoal,
|
||||
},
|
||||
)
|
||||
return
|
||||
}
|
||||
response_mode: 'streaming',
|
||||
}
|
||||
|
||||
ssePost(
|
||||
'/workflows/run',
|
||||
endpoint,
|
||||
{
|
||||
body: {
|
||||
inputs: {
|
||||
user_prompt: userInput,
|
||||
},
|
||||
response_mode: 'streaming',
|
||||
},
|
||||
body,
|
||||
},
|
||||
{
|
||||
...commonOptions,
|
||||
onData: (message: string) => {
|
||||
aggregatedText += message
|
||||
emitStreamUpdate()
|
||||
},
|
||||
onMessageReplace: (messageReplace) => {
|
||||
aggregatedText = messageReplace.answer
|
||||
emitStreamUpdate()
|
||||
},
|
||||
// Terminal chat event: folds the event metadata into the accumulated outputs,
// captures judge feedback (first non-empty source wins, never overwritten),
// and settles the text shown to the user.
onMessageEnd: (messageEnd) => {
  const metadata = (messageEnd as ChatMessageEnd).metadata

  // The cast above is unchecked, so verify at runtime before calling string
  // methods; a non-string value must be skipped rather than throw here.
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.trim().length > 0

  if (metadata?.outputs) {
    finalOutputs = {
      ...(finalOutputs || {}),
      ...metadata.outputs,
    }
  }

  // Feedback precedence: outputs.judge_feedback, then metadata.judge_feedback,
  // then judge_outputs.feedback. Feedback captured by an earlier event is kept.
  if (!extractedJudgeFeedback) {
    extractedJudgeFeedback = [
      metadata?.outputs?.judge_feedback,
      metadata?.judge_feedback,
      metadata?.judge_outputs?.feedback,
    ].find(isNonEmptyString)
  }

  // Displayed-text precedence (highest first): answer, message,
  // judge_outputs.message, outputs.message. Resolved once so the stream
  // listener receives a single update instead of several back-to-back ones.
  const settledText = metadata?.answer
    || metadata?.message
    || [metadata?.judge_outputs?.message, metadata?.outputs?.message].find(isNonEmptyString)
  if (settledText) {
    aggregatedText = settledText
    emitStreamUpdate()
  }
},
|
||||
onTextChunk: (chunk) => {
|
||||
const text = (chunk as any)?.data?.text || ''
|
||||
if (text) {
|
||||
|
|
@ -239,14 +266,26 @@ export async function submitChallengeAttempt(
|
|||
},
|
||||
// Terminal workflow event: merges the run's outputs into the accumulated
// outputs, captures judge feedback, and shows outputs.message when present.
onWorkflowFinished: ({ data }) => {
  const resultData = (data as WorkflowFinishedResponse['data']) || {}

  // Payload fields are untyped at runtime; only accept real, non-blank strings.
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.trim().length > 0

  const outputs = resultData.outputs
  if (outputs) {
    // Merge rather than replace so values gathered from earlier events survive.
    finalOutputs = {
      ...(finalOutputs || {}),
      ...outputs,
    }
    const outputsMessage = outputs.message
    if (isNonEmptyString(outputsMessage)) {
      aggregatedText = outputsMessage
      emitStreamUpdate()
    }
  }

  // Some workflows emit judge feedback under metadata instead of outputs.
  // Consult metadata whenever outputs yielded no feedback (not only when
  // outputs is missing entirely); already-captured feedback is never replaced.
  if (!extractedJudgeFeedback) {
    extractedJudgeFeedback = [
      outputs?.judge_feedback,
      (data as any)?.metadata?.judge_feedback,
    ].find(isNonEmptyString)
  }
},
|
||||
},
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue