feat: Add Cleanlab's AI Reliability Bundle to Langflow (#8049)
* Add Cleanlab bundle * Fix icon logic and add support for light/dark mode dynamic logo * Remove stuff * Modify components and add documentation * [autofix.ci] apply automated fixes * Add sidebar code * [autofix.ci] apply automated fixes * Update docs/sidebars.js Co-authored-by: Mendon Kissling <59585235+mendonk@users.noreply.github.com> * Update docs/docs/Integrations/Cleanlab/integrations-cleanlab.md Co-authored-by: Mendon Kissling <59585235+mendonk@users.noreply.github.com> * Update docs/docs/Integrations/Cleanlab/integrations-cleanlab.md Co-authored-by: Mendon Kissling <59585235+mendonk@users.noreply.github.com> * Update docs/docs/Integrations/Cleanlab/integrations-cleanlab.md Co-authored-by: Mendon Kissling <59585235+mendonk@users.noreply.github.com> * copy edits * update samples * style check fixes * [autofix.ci] apply automated fixes * style check fix 2 --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Mendon Kissling <59585235+mendonk@users.noreply.github.com> Co-authored-by: Mike Fortman <michael.fortman@datastax.com>
This commit is contained in:
parent
06f7279977
commit
087a22c246
20 changed files with 9380 additions and 5422 deletions
BIN
docs/docs/Integrations/Cleanlab/cleanlab_remediator_example.png
Normal file
BIN
docs/docs/Integrations/Cleanlab/cleanlab_remediator_example.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 209 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 58 KiB |
1880
docs/docs/Integrations/Cleanlab/eval_and_remediate_cleanlab.json
Normal file
1880
docs/docs/Integrations/Cleanlab/eval_and_remediate_cleanlab.json
Normal file
File diff suppressed because one or more lines are too long
BIN
docs/docs/Integrations/Cleanlab/eval_rag.png
Normal file
BIN
docs/docs/Integrations/Cleanlab/eval_rag.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 484 KiB |
BIN
docs/docs/Integrations/Cleanlab/eval_response.png
Normal file
BIN
docs/docs/Integrations/Cleanlab/eval_response.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 586 KiB |
BIN
docs/docs/Integrations/Cleanlab/eval_summary_rag.png
Normal file
BIN
docs/docs/Integrations/Cleanlab/eval_summary_rag.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 375 KiB |
145
docs/docs/Integrations/Cleanlab/integrations-cleanlab.md
Normal file
145
docs/docs/Integrations/Cleanlab/integrations-cleanlab.md
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
---
|
||||
title: Integrate Cleanlab Evaluations with Langflow
|
||||
slug: /integrations-cleanlab
|
||||
---
|
||||
|
||||
Unlock trustworthy Agentic, RAG, and LLM pipelines with Cleanlab's evaluation and remediation suite.
|
||||
|
||||
[Cleanlab](https://www.cleanlab.ai/) adds automation and trust to every data point going in and every prediction coming out of AI and RAG solutions.
|
||||
|
||||
This Langflow integration provides 3 modular components that assess and improve the **trustworthiness** of any LLM or RAG pipeline output, enabling critical oversight for safety-sensitive, enterprise, and production GenAI applications.
|
||||
|
||||
Use this bundle to:
|
||||
- Quantify trustworthiness of ANY LLM response with a **0-1 score**
|
||||
- Explain why a response may be good or bad
|
||||
- Evaluate **context sufficiency**, **groundedness**, **helpfulness**, and **query clarity** with quantitative scores (for RAG/Agentic pipelines with context)
|
||||
- Remediate low-trust responses with warnings or fallback answers
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before using these components, you'll need:
|
||||
|
||||
- A [Cleanlab API key](https://tlm.cleanlab.ai/)
|
||||
|
||||
|
||||
## Components
|
||||
|
||||
### `CleanlabEvaluator`
|
||||
|
||||
**Purpose:** Evaluate and explain the trustworthiness of a prompt + response pair using Cleanlab. More details on how the score works [here](https://help.cleanlab.ai/tlm/).
|
||||
|
||||
#### Inputs
|
||||
|
||||
| Name | Type | Description |
|
||||
|-----------------------|------------|---------------------------------------------------------------------|
|
||||
| system_prompt | Message | (Optional) System message prepended to the prompt |
|
||||
| prompt | Message | The user-facing input to the LLM |
|
||||
| response | Message | The LLM-generated response (from OpenAI, Claude, etc.) to evaluate |
|
||||
| cleanlab_api_key | Secret | Your Cleanlab API key |
|
||||
| cleanlab_evaluation_model | Dropdown | Evaluation model used by Cleanlab (GPT-4, Claude, etc.) This does not need to be the same model that generated the response. |
|
||||
| quality_preset | Dropdown | Tradeoff between evaluation speed and accuracy |
|
||||
|
||||
#### Outputs
|
||||
|
||||
| Name | Type | Description |
|
||||
|-----------------------|------------|---------------------------------------------------------------------|
|
||||
| score | number | Trust score between 0–1 |
|
||||
| explanation | Message | Explanation of the trust score |
|
||||
| response | Message | Returns the original response for easy chaining to `CleanlabRemediator` component |
|
||||
|
||||
---
|
||||
|
||||
### `CleanlabRemediator`
|
||||
|
||||
**Purpose:** Use the trust score from the `CleanlabEvaluator` component to determine whether to show, warn about, or replace an LLM response. This component has configurables for the score threshold, warning text, and fallback message which you can customize as needed.
|
||||
|
||||
#### Inputs
|
||||
|
||||
| Name | Type | Description |
|
||||
|-----------------------------|------------|-----------------------------------------------------------------------------|
|
||||
| response | Message | The response to potentially remediate |
|
||||
| score | number | Trust score from `CleanlabEvaluator` |
|
||||
| explanation | Message | (Optional) Explanation to append if warning is shown |
|
||||
| threshold | float | Minimum trust score to pass response unchanged |
|
||||
| show_untrustworthy_response | bool | Show original response with warning if untrustworthy |
|
||||
| untrustworthy_warning_text | Prompt | Warning text for untrustworthy responses |
|
||||
| fallback_text | Prompt | Fallback message if response is hidden |
|
||||
|
||||
#### Output
|
||||
|
||||
| Name | Type | Description |
|
||||
|-----------------------|------------|-----------------------------------------------------------------------------|
|
||||
| remediated_response | Message | Final message shown to user after remediation logic |
|
||||
|
||||
|
||||
See example outputs below!
|
||||
|
||||
---
|
||||
|
||||
### `CleanlabRAGEvaluator`
|
||||
|
||||
**Purpose:** Comprehensively evaluate RAG and LLM pipeline outputs by analyzing the context, query, and response quality using Cleanlab. This component assesses trustworthiness, context sufficiency, response groundedness, helpfulness, and query ease. Learn more about Cleanlab's evaluation metrics [here](https://help.cleanlab.ai/tlm/use-cases/tlm_rag/). You can also use the `CleanlabRemediator` component with this one to remediate low-trust responses coming from the RAG pipeline.
|
||||
|
||||
#### Inputs
|
||||
|
||||
| Name | Type | Description |
|
||||
|--------------------------|-----------|----------------------------------------------------------------------------|
|
||||
| cleanlab_api_key | Secret | Your Cleanlab API key |
|
||||
| cleanlab_evaluation_model | Dropdown | Evaluation model used by Cleanlab (GPT-4, Claude, etc.) This does not need to be the same model that generated the response. |
|
||||
| quality_preset | Dropdown | Tradeoff between evaluation speed and accuracy |
|
||||
| context | Message | Retrieved context from your RAG system |
|
||||
| query | Message | The original user query |
|
||||
| response | Message | The LLM-generated response (from OpenAI, Claude, etc.) based on the context and query |
|
||||
| run_context_sufficiency | bool | Evaluate whether context supports answering the query |
|
||||
| run_response_groundedness | bool | Evaluate whether the response is grounded in the context |
|
||||
| run_response_helpfulness | bool | Evaluate how helpful the response is |
|
||||
| run_query_ease | bool | Evaluate if the query is vague, complex, or adversarial |
|
||||
|
||||
#### Outputs
|
||||
|
||||
| Name | Type | Description |
|
||||
|-----------------------|------------|-----------------------------------------------------------------------------|
|
||||
| trust_score | number | Overall trust score |
|
||||
| trust_explanation | Message | Explanation for trust score |
|
||||
| other_scores | dict | Dictionary of optional enabled RAG evaluation metrics |
|
||||
| evaluation_summary | Message | Markdown summary of query, context, response, and evaluation results |
|
||||
|
||||
---
|
||||
|
||||
## Example Flows
|
||||
|
||||
The following example flows show how to use the `CleanlabEvaluator` and `CleanlabRemediator` components to evaluate and remediate responses from any LLM, and how to use the `CleanlabRAGEvaluator` component to evaluate RAG pipeline outputs.
|
||||
|
||||
### Evaluate and remediate responses from any LLM
|
||||
|
||||
[Download](./eval_and_remediate_cleanlab.json) the flow to follow along!
|
||||
|
||||
This flow evaluates and remediates the trustworthiness of a response from any LLM using the `CleanlabEvaluator` and `CleanlabRemediator` components.
|
||||
|
||||

|
||||
|
||||
Simply connect the `Message` output from any LLM component (like OpenAI, Anthropic, or Google) to the `response` input of the `CleanlabEvaluator` component, along with connecting your prompt to its `prompt` input.
|
||||
|
||||
That's it! The `CleanlabEvaluator` component will return a trust score and explanation which you can use however you'd like.
|
||||
|
||||
The `CleanlabRemediator` component uses this trust score and user configurable settings to determine whether to output the original response, warn about it, or replace it with a fallback answer.
|
||||
|
||||
The example below shows a response that was determined to be untrustworthy (score of .09) and flagged with a warning by the `CleanlabRemediator` component.
|
||||
|
||||

|
||||
|
||||
If you don't want to show untrustworthy responses, you can also configure the `CleanlabRemediator` to replace the response with a fallback message.
|
||||
|
||||

|
||||
|
||||
### Evaluate RAG pipeline
|
||||
|
||||
The below flow is the `Vector Store RAG` example template, with the `CleanlabRAGEvaluator` component added to evaluate the context, query, and response. You can use the `CleanlabRAGEvaluator` with any flow that has a context, query, and response. Simply connect the `context`, `query`, and `response` outputs from any RAG pipeline to the `CleanlabRAGEvaluator` component.
|
||||
|
||||

|
||||
|
||||
Here is an example of the `Evaluation Summary` output from the `CleanlabRAGEvaluator` component.
|
||||
|
||||

|
||||
|
||||
Notice how the `Evaluation Summary` includes the query, context, response, and all the evaluation results! In this example, the `Context Sufficiency` and `Response Groundedness` scores are low (0.002) because the context doesn't contain information about the query and the response is not grounded in the context.
|
||||
|
|
@ -237,6 +237,11 @@ module.exports = {
|
|||
id: "Integrations/Composio/integrations-composio",
|
||||
label: "Composio",
|
||||
},
|
||||
{
|
||||
type: "doc",
|
||||
id: "Integrations/Cleanlab/integrations-cleanlab",
|
||||
label: "Cleanlab",
|
||||
},
|
||||
{
|
||||
type: 'category',
|
||||
label: 'Google',
|
||||
|
|
|
|||
|
|
@ -124,6 +124,7 @@ dependencies = [
|
|||
"langchain-ibm>=0.3.8",
|
||||
"opik>=1.6.3",
|
||||
"openai>=1.68.2",
|
||||
"cleanlab-tlm>=1.1.2",
|
||||
'gassist>=0.0.1; sys_platform == "win32"',
|
||||
"twelvelabs>=0.4.7",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
from .cleanlab_evaluator import CleanlabEvaluator
|
||||
from .cleanlab_rag_evaluator import CleanlabRAGEvaluator
|
||||
from .cleanlab_remediator import CleanlabRemediator
|
||||
|
||||
__all__ = ["CleanlabEvaluator", "CleanlabRAGEvaluator", "CleanlabRemediator"]
|
||||
|
|
@ -0,0 +1,145 @@
|
|||
from cleanlab_tlm import TLM
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import (
|
||||
DropdownInput,
|
||||
MessageTextInput,
|
||||
Output,
|
||||
SecretStrInput,
|
||||
)
|
||||
from langflow.schema.message import Message
|
||||
|
||||
|
||||
class CleanlabEvaluator(Component):
    """A component that evaluates the trustworthiness of LLM responses using Cleanlab.

    This component takes a prompt and response pair, along with optional system instructions,
    and uses Cleanlab's evaluation algorithms to generate a trust score and explanation.

    Inputs:
        - system_prompt (MessageTextInput): Optional system-level instructions prepended to the user prompt.
        - prompt (MessageTextInput): The user's prompt or query sent to the LLM.
        - response (MessageTextInput): The response generated by the LLM to be evaluated. This should come from the
          LLM component, i.e. OpenAI, Gemini, etc.
        - api_key (SecretStrInput): Your Cleanlab API key.
        - model (DropdownInput): The model used by Cleanlab to evaluate the response (can differ from the
          generation model).
        - quality_preset (DropdownInput): Tradeoff setting for accuracy vs. speed and cost. Higher presets are
          slower but more accurate.

    Outputs:
        - response_passthrough (Message): The original response, passed through for downstream use.
        - score (number): A float between 0 and 1 indicating Cleanlab's trustworthiness score for the response.
        - explanation (Message): A textual explanation of why the response received its score.

    This component works well in conjunction with the CleanlabRemediator to create a complete trust evaluation
    and remediation pipeline.

    More details on the evaluation metrics can be found here: https://help.cleanlab.ai/tlm/tutorials/tlm/
    """

    display_name = "Cleanlab Evaluator"
    description = "Evaluates any LLM response using Cleanlab and outputs trust score and explanation."
    icon = "Cleanlab"
    name = "CleanlabEvaluator"

    inputs = [
        MessageTextInput(
            name="system_prompt",
            display_name="System Message",
            info="System-level instructions prepended to the user query.",
            value="",
        ),
        MessageTextInput(
            name="prompt",
            display_name="Prompt",
            info="The user's query to the model.",
            required=True,
        ),
        MessageTextInput(
            name="response",
            display_name="Response",
            info="The response to the user's query.",
            required=True,
        ),
        SecretStrInput(
            name="api_key",
            display_name="Cleanlab API Key",
            info="Your Cleanlab API key.",
            required=True,
        ),
        DropdownInput(
            name="model",
            display_name="Cleanlab Evaluation Model",
            options=[
                "gpt-4.1",
                "gpt-4.1-mini",
                "gpt-4.1-nano",
                "o4-mini",
                "o3",
                "gpt-4.5-preview",
                "gpt-4o-mini",
                "gpt-4o",
                "o3-mini",
                "o1",
                "o1-mini",
                "gpt-4",
                "gpt-3.5-turbo-16k",
                "claude-3.7-sonnet",
                "claude-3.5-sonnet-v2",
                "claude-3.5-sonnet",
                "claude-3.5-haiku",
                "claude-3-haiku",
                "nova-micro",
                "nova-lite",
                "nova-pro",
            ],
            info="The model Cleanlab uses to evaluate the response. This does NOT need to be the same model that "
            "generated the response.",
            value="gpt-4o-mini",
            required=True,
            advanced=True,
        ),
        DropdownInput(
            name="quality_preset",
            display_name="Quality Preset",
            options=["base", "low", "medium", "high", "best"],
            value="medium",
            info="This determines the accuracy, latency, and cost of the evaluation. Higher quality is generally "
            "slower but more accurate.",
            required=True,
            advanced=True,
        ),
    ]

    outputs = [
        Output(display_name="Response", name="response_passthrough", method="pass_response", types=["Message"]),
        Output(display_name="Trust Score", name="score", method="get_score", types=["number"]),
        Output(display_name="Explanation", name="explanation", method="get_explanation", types=["Message"]),
    ]

    def _evaluate_once(self):
        """Run the Cleanlab TLM evaluation once and cache the result.

        All output methods share the cached result so the (slow, billable)
        API call happens at most once per component execution.
        """
        if not hasattr(self, "_cached_result"):
            full_prompt = f"{self.system_prompt}\n\n{self.prompt}" if self.system_prompt else self.prompt
            tlm = TLM(
                api_key=self.api_key,
                options={"log": ["explanation"], "model": self.model},
                quality_preset=self.quality_preset,
            )
            self._cached_result = tlm.get_trustworthiness_score(full_prompt, self.response)
        return self._cached_result

    def get_score(self) -> float:
        """Return Cleanlab's trustworthiness score, defaulting to 0.0 when unavailable."""
        result = self._evaluate_once()
        # dict.get's default only applies when the key is absent; an explicit
        # None value would crash the :.2f formatting below, so coerce it here.
        score = result.get("trustworthiness_score")
        if score is None:
            score = 0.0
        self.status = f"Trust score: {score:.2f}"
        return score

    def get_explanation(self) -> Message:
        """Return the textual explanation for the trust score."""
        result = self._evaluate_once()
        explanation = result.get("log", {}).get("explanation", "No explanation returned.")
        return Message(text=explanation)

    def pass_response(self) -> Message:
        """Pass the original response through unchanged for downstream chaining."""
        self.status = "Passing through response."
        return Message(text=self.response)
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
from cleanlab_tlm import TrustworthyRAG, get_default_evals
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import (
|
||||
BoolInput,
|
||||
DropdownInput,
|
||||
MessageTextInput,
|
||||
Output,
|
||||
SecretStrInput,
|
||||
)
|
||||
from langflow.schema.message import Message
|
||||
|
||||
|
||||
class CleanlabRAGEvaluator(Component):
    """A component that evaluates the quality of RAG (Retrieval-Augmented Generation) outputs using Cleanlab.

    This component takes a query, retrieved context, and generated response from a RAG pipeline,
    and uses Cleanlab's evaluation algorithms to assess various aspects of the RAG system's performance.

    The component can evaluate:
    - Overall trustworthiness of the LLM generated response
    - Context sufficiency (whether the retrieved context contains information needed to answer the query)
    - Response groundedness (whether the response is supported directly by the context)
    - Response helpfulness (whether the response effectively addresses the user's query)
    - Query ease (whether the user query seems easy for an AI system to properly handle, useful to diagnose
      queries that are: complex, vague, tricky, or disgruntled-sounding)

    Outputs:
        - Trust Score: A score between 0-1 corresponding to the trustworthiness of the response. A higher score
          indicates a higher confidence that the response is correct/good.
        - Explanation: An LLM generated explanation of the trustworthiness assessment
        - Other Evals: Additional evaluation metrics for selected evaluation types in the "Controls" tab
        - Evaluation Summary: A comprehensive summary of context, query, response, and selected evaluation results

    This component works well in conjunction with the CleanlabRemediator to create a complete trust evaluation
    and remediation pipeline.

    More details on the evaluation metrics can be found here: https://help.cleanlab.ai/tlm/use-cases/tlm_rag/
    """

    display_name = "Cleanlab RAG Evaluator"
    description = "Evaluates context, query, and response from a RAG pipeline using Cleanlab and outputs trust metrics."
    icon = "Cleanlab"
    name = "CleanlabRAGEvaluator"

    inputs = [
        SecretStrInput(
            name="api_key",
            display_name="Cleanlab API Key",
            info="Your Cleanlab API key.",
            required=True,
        ),
        DropdownInput(
            name="model",
            display_name="Cleanlab Evaluation Model",
            options=[
                "gpt-4.1",
                "gpt-4.1-mini",
                "gpt-4.1-nano",
                "o4-mini",
                "o3",
                "gpt-4.5-preview",
                "gpt-4o-mini",
                "gpt-4o",
                "o3-mini",
                "o1",
                "o1-mini",
                "gpt-4",
                "gpt-3.5-turbo-16k",
                "claude-3.7-sonnet",
                "claude-3.5-sonnet-v2",
                "claude-3.5-sonnet",
                "claude-3.5-haiku",
                "claude-3-haiku",
                "nova-micro",
                "nova-lite",
                "nova-pro",
            ],
            info="The model Cleanlab uses to evaluate the context, query, and response. This does NOT need to be "
            "the same model that generated the response.",
            value="gpt-4o-mini",
            required=True,
            advanced=True,
        ),
        DropdownInput(
            name="quality_preset",
            display_name="Quality Preset",
            options=["base", "low", "medium"],
            value="medium",
            info="This determines the accuracy, latency, and cost of the evaluation. Higher quality is generally "
            "slower but more accurate.",
            required=True,
            advanced=True,
        ),
        MessageTextInput(
            name="context",
            display_name="Context",
            info="The context retrieved for the given query.",
            required=True,
        ),
        MessageTextInput(
            name="query",
            display_name="Query",
            info="The user's query.",
            required=True,
        ),
        MessageTextInput(
            name="response",
            display_name="Response",
            info="The response generated by the LLM.",
            required=True,
        ),
        BoolInput(
            name="run_context_sufficiency",
            display_name="Run Context Sufficiency",
            value=False,
            advanced=True,
        ),
        BoolInput(
            name="run_response_groundedness",
            display_name="Run Response Groundedness",
            value=False,
            advanced=True,
        ),
        BoolInput(
            name="run_response_helpfulness",
            display_name="Run Response Helpfulness",
            value=False,
            advanced=True,
        ),
        BoolInput(
            name="run_query_ease",
            display_name="Run Query Ease",
            value=False,
            advanced=True,
        ),
    ]

    outputs = [
        Output(display_name="Response", name="response_passthrough", method="pass_response", types=["Message"]),
        Output(display_name="Trust Score", name="trust_score", method="get_trust_score", types=["number"]),
        Output(display_name="Explanation", name="trust_explanation", method="get_trust_explanation", types=["Message"]),
        Output(display_name="Other Evals", name="other_scores", method="get_other_scores", types=["Data"]),
        Output(
            display_name="Evaluation Summary",
            name="evaluation_summary",
            method="get_evaluation_summary",
            types=["Message"],
        ),
    ]

    def _selected_eval_flags(self) -> dict:
        """Map each optional eval name to whether the user enabled it.

        Single source of truth for the flag mapping that was previously
        duplicated across evaluation, score extraction, and summary building.
        """
        return {
            "context_sufficiency": self.run_context_sufficiency,
            "response_groundedness": self.run_response_groundedness,
            "response_helpfulness": self.run_response_helpfulness,
            "query_ease": self.run_query_ease,
        }

    def _collect_other_scores(self, result: dict) -> dict:
        """Extract scores for the enabled evals that are present in the evaluation result."""
        return {
            key: result[key]["score"]
            for key, include in self._selected_eval_flags().items()
            if include and key in result
        }

    def _evaluate_once(self):
        """Run the TrustworthyRAG evaluation once and cache the result.

        On failure an empty dict is cached, so downstream output methods
        degrade gracefully instead of re-raising for every output.
        """
        if not hasattr(self, "_cached_result"):
            try:
                self.status = "Configuring selected evals..."
                default_evals = get_default_evals()
                enabled_names = [name for name, include in self._selected_eval_flags().items() if include]

                selected_evals = [e for e in default_evals if e.name in enabled_names]

                validator = TrustworthyRAG(
                    api_key=self.api_key,
                    quality_preset=self.quality_preset,
                    options={"log": ["explanation"], "model": self.model},
                    evals=selected_evals,
                )

                self.status = f"Running evals: {[e.name for e in selected_evals]}"
                self._cached_result = validator.score(
                    query=self.query,
                    context=self.context,
                    response=self.response,
                )
                self.status = "Evaluation complete."

            except Exception as e:  # noqa: BLE001
                self.status = f"Evaluation failed: {e!s}"
                self._cached_result = {}
        return self._cached_result

    def pass_response(self) -> Message:
        """Pass the original response through unchanged for downstream chaining."""
        self.status = "Passing through response."
        return Message(text=self.response)

    def get_trust_score(self) -> float:
        """Return the overall trustworthiness score, defaulting to 0.0 when unavailable."""
        # dict.get's default only applies when the key is absent; an explicit
        # None value would crash the :.3f formatting below, so coerce it here.
        score = self._evaluate_once().get("trustworthiness", {}).get("score")
        if score is None:
            score = 0.0
        self.status = f"Trust Score: {score:.3f}"
        return score

    def get_trust_explanation(self) -> Message:
        """Return the explanation behind the trust score (empty string when unavailable)."""
        explanation = self._evaluate_once().get("trustworthiness", {}).get("log", {}).get("explanation", "")
        self.status = "Trust explanation extracted."
        return Message(text=explanation)

    def get_other_scores(self) -> dict:
        """Return the scores for every enabled optional eval."""
        filtered_scores = self._collect_other_scores(self._evaluate_once())
        self.status = f"{len(filtered_scores)} other evals returned."
        return filtered_scores

    def get_evaluation_summary(self) -> Message:
        """Build a plain-text summary of the query, context, response, and evaluation results."""
        result = self._evaluate_once()

        query_text = self.query.strip()
        context_text = self.context.strip()
        response_text = self.response.strip()

        trust = result.get("trustworthiness", {}).get("score")
        if trust is None:
            trust = 0.0
        trust_exp = result.get("trustworthiness", {}).get("log", {}).get("explanation", "")

        other_scores = self._collect_other_scores(result)

        metrics = f"Trustworthiness: {trust:.3f}"
        if trust_exp:
            metrics += f"\nExplanation: {trust_exp}"
        if other_scores:
            metrics += "\n" + "\n".join(f"{k.replace('_', ' ').title()}: {v:.3f}" for k, v in other_scores.items())

        summary = (
            f"Query:\n{query_text}\n"
            "-----\n"
            f"Context:\n{context_text}\n"
            "-----\n"
            f"Response:\n{response_text}\n"
            "------------------------------\n"
            f"{metrics}"
        )

        self.status = "Evaluation summary built."
        return Message(text=summary)
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
from langflow.custom import Component
|
||||
from langflow.field_typing.range_spec import RangeSpec
|
||||
from langflow.io import BoolInput, FloatInput, HandleInput, MessageTextInput, Output, PromptInput
|
||||
from langflow.schema.message import Message
|
||||
|
||||
|
||||
class CleanlabRemediator(Component):
|
||||
"""Remediates potentially untrustworthy LLM responses based on trust scores computed by the Cleanlab Evaluator.
|
||||
|
||||
This component takes a response and its associated trust score,
|
||||
and applies remediation strategies based on configurable thresholds and settings.
|
||||
|
||||
Inputs:
|
||||
- response (MessageTextInput): The original LLM-generated response to be evaluated and possibly remediated.
|
||||
The CleanlabEvaluator passes this response through.
|
||||
- score (HandleInput): The trust score output from CleanlabEvaluator (expected to be a float between 0 and 1).
|
||||
- explanation (MessageTextInput): Optional textual explanation for the trust score, to be included in the
|
||||
output.
|
||||
- threshold (Input[float]): Minimum trust score required to accept the response. If the score is lower, the
|
||||
response is remediated.
|
||||
- show_untrustworthy_response (BoolInput): If true, returns the original response with a warning; if false,
|
||||
returns fallback text.
|
||||
- untrustworthy_warning_text (PromptInput): Text warning to append to responses deemed untrustworthy (when
|
||||
showing them).
|
||||
- fallback_text (PromptInput): Replacement message returned if the response is untrustworthy and should be
|
||||
hidden.
|
||||
|
||||
Outputs:
|
||||
- remediated_response (Message): Either:
|
||||
• the original response,
|
||||
• the original response with appended warning, or
|
||||
• the fallback response,
|
||||
depending on the trust score and configuration.
|
||||
|
||||
This component is typically used downstream of CleanlabEvaluator or CleanlabRagValidator
|
||||
to take appropriate action on low-trust responses and inform users accordingly.
|
||||
"""
|
||||
|
||||
display_name = "Cleanlab Remediator"
|
||||
description = (
|
||||
"Remediates an untrustworthy response based on trust score from the Cleanlab Evaluator, "
|
||||
"score threshold, and message handling settings."
|
||||
)
|
||||
icon = "Cleanlab"
|
||||
name = "CleanlabRemediator"
|
||||
|
||||
inputs = [
|
||||
MessageTextInput(
|
||||
name="response",
|
||||
display_name="Response",
|
||||
info="The response to the user's query.",
|
||||
required=True,
|
||||
),
|
||||
HandleInput(
|
||||
name="score",
|
||||
display_name="Trust Score",
|
||||
info="The trustworthiness score output from the Cleanlab Evaluator.",
|
||||
input_types=["number"],
|
||||
required=True,
|
||||
),
|
||||
MessageTextInput(
|
||||
name="explanation",
|
||||
display_name="Explanation",
|
||||
info="The explanation from the Cleanlab Evaluator.",
|
||||
required=False,
|
||||
),
|
||||
FloatInput(
|
||||
name="threshold",
|
||||
display_name="Threshold",
|
||||
field_type="float",
|
||||
value=0.7,
|
||||
range_spec=RangeSpec(min=0.0, max=1.0, step=0.05),
|
||||
info="Minimum score required to show the response unmodified. Reponses with scores above this threshold "
|
||||
"are considered trustworthy. Reponses with scores below this threshold are considered untrustworthy and "
|
||||
"will be remediated based on the settings below.",
|
||||
required=True,
|
||||
show=True,
|
||||
),
|
||||
BoolInput(
|
||||
name="show_untrustworthy_response",
|
||||
display_name="Show Untrustworthy Response",
|
||||
info="If enabled, and the trust score is below the threshold, the original response is shown with the "
|
||||
"added warning. If disabled, and the trust score is below the threshold, the fallback answer is returned.",
|
||||
value=True,
|
||||
),
|
||||
PromptInput(
|
||||
name="untrustworthy_warning_text",
|
||||
display_name="Warning for Untrustworthy Response",
|
||||
info="Warning to append to the response if Show Untrustworthy Response is enabled and trust score is "
|
||||
"below the threshold.",
|
||||
value="⚠️ WARNING: The following response is potentially untrustworthy.",
|
||||
),
|
||||
PromptInput(
|
||||
name="fallback_text",
|
||||
display_name="Fallback Answer",
|
||||
info="Response returned if the trust score is below the threshold and 'Show Untrustworthy Response' is "
|
||||
"disabled.",
|
||||
value="Based on the available information, I cannot provide a complete answer to this question.",
|
||||
),
|
||||
]
|
||||
|
||||
outputs = [
|
||||
Output(
|
||||
display_name="Remediated Message",
|
||||
name="remediated_response",
|
||||
method="remediate_response",
|
||||
types=["Message"],
|
||||
),
|
||||
]
|
||||
|
||||
def remediate_response(self) -> Message:
    """Gate the upstream response on its Cleanlab trust score.

    If ``self.score`` meets ``self.threshold``, the original response is
    returned with the trust score appended. Otherwise the response is
    flagged: when ``show_untrustworthy_response`` is enabled, the original
    text is shown with the configured warning, the score, and — if present —
    the evaluator's explanation; when disabled, the fallback answer is
    returned instead. ``self.status`` records the accept/flag decision.

    Returns:
        Message: the message to emit downstream.
    """
    divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    score_line = f"**Trust Score:** {self.score:.2f}"

    # Trusted: pass the response through, annotated with its score.
    if self.score >= self.threshold:
        self.status = f"Score {self.score:.2f} ≥ threshold {self.threshold:.2f} → accepted"
        return Message(text=f"{self.response}\n\n{divider}\n\n{score_line}")

    self.status = f"Score {self.score:.2f} < threshold {self.threshold:.2f} → flagged"

    # Untrusted and suppressed: replace the answer entirely.
    if not self.show_untrustworthy_response:
        return Message(text=self.fallback_text)

    # Untrusted but shown: original text plus warning, score, and optional
    # explanation from the evaluator.
    sections = [
        self.response,
        divider,
        f"**{self.untrustworthy_warning_text.strip()}**",
        score_line,
    ]
    if self.explanation:
        sections.append(f"**Explanation:** {self.explanation}")
    return Message(text="\n\n".join(sections))
|
||||
File diff suppressed because one or more lines are too long
2993
src/frontend/package-lock.json
generated
2993
src/frontend/package-lock.json
generated
File diff suppressed because it is too large
Load diff
28
src/frontend/src/icons/Cleanlab/Cleanlab.jsx
Normal file
28
src/frontend/src/icons/Cleanlab/Cleanlab.jsx
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
import { stringToBool } from "@/utils/utils";
|
||||
|
||||
const SvgCleanlab = (props) => {
|
||||
const isDark = stringToBool(props.isdark);
|
||||
|
||||
return (
|
||||
<svg
|
||||
width={697}
|
||||
height={697}
|
||||
viewBox="0 0 697 697"
|
||||
fill="none"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
{...props}
|
||||
>
|
||||
<path
|
||||
d="M341.847921,185.264125c-13.42293,23.249376-12.347471,50.877388,0.401709,72.451244,3.225931,5.459143,3.345505,12.207007,0.174798,17.69869l-48.852472,84.61514c-3.18275,5.512656-9.1167,8.767205-15.481774,8.699138-12.415908-.13284-25.026497,2.950045-36.559278,9.6085-34.331853,19.821505-46.094999,63.721679-26.273494,98.053532,19.821549,34.33193,63.721743,46.094757,98.053597,26.273252,11.532781-6.658454,20.508022-16.037949,26.600933-26.85686,3.123545-5.546426,8.909087-9.058026,15.274517-9.058117l97.705052.000012c6.314731.000014,12.107535,3.432422,15.203594,8.93605,12.65789,22.501254,37.082709,37.493928,64.94011,36.600417,38.349672-1.230173,69.281722-33.108828,69.419913-71.478015.143192-39.764396-32.048611-72.044287-71.779567-72.044476-26.846015-.00009-50.235009,14.745435-62.54359,36.573359-3.114983,5.523677-8.899051,9.00096-15.239989,9.000931l-95.950864.000159c-15.201841-.000006-24.703043-16.45657-17.102128-29.621754l47.975569-83.095807c3.157398-5.468647,9.026328-8.769166,15.340538-8.698443,25.815666.288662,51.012155-13.367473,64.167032-37.93976,17.644134-32.958183,6.647958-74.587984-24.980446-94.51786-34.663711-21.84271-80.199374-10.350384-100.49376,24.800671Z
|
||||
M323.785578,413.729145c14.784192,25.606971,6.010559,58.350615-19.596489,73.134851s-58.350647,6.01068-73.134927-19.596445-6.010559-58.350615,19.596489-73.134851c25.607048-14.784236,58.350647-6.01068,73.134927,19.596445Z"
|
||||
fill={isDark ? "#FFFFFF" : "#000000"}
|
||||
/>
|
||||
<path
|
||||
d="M572.283355,593.964596c-47.042646,32.305387-103.837611,51.438401-165.06516,52.07923-164.731918,1.724141-300.426141-132.22655-300.738501-296.967195-.312099-164.602718,133.029516-298.13692,297.559776-298.13692,62.422076,0,120.354912,19.221075,168.202634,52.06751,1.932802,1.326827,4.559313-.028862,4.559337-2.373257l.000546-53.16979c.000011-1.048757-.562682-2.02097-1.475604-2.537173C525.033846,16.489584,466.971795.188457,405.117629.001612,212.700173-.579629,55.540912,156.084111,55.54226,348.502446c.001349,192.470068,156.029578,348.497554,348.499958,348.497554,62.256594,0,120.700306-16.324561,171.284182-44.926817.912763-.516115,1.475336-1.488098,1.475331-2.536674l-.000234-53.172985c-.00001-2.322378-2.603713-3.713615-4.518142-2.398928Z"
|
||||
fill={isDark ? "#FFFFFF" : "#000000"}
|
||||
/>
|
||||
</svg>
|
||||
);
|
||||
};
|
||||
|
||||
export default SvgCleanlab;
|
||||
11
src/frontend/src/icons/Cleanlab/index.tsx
Normal file
11
src/frontend/src/icons/Cleanlab/index.tsx
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
import { useDarkStore } from "@/stores/darkStore";
|
||||
import React, { forwardRef } from "react";
|
||||
import SvgCleanlab from "./Cleanlab";
|
||||
|
||||
export const CleanlabIcon = forwardRef<
|
||||
SVGSVGElement,
|
||||
React.PropsWithChildren<{}>
|
||||
>((props, ref) => {
|
||||
const isdark = useDarkStore((state) => state.dark).toString();
|
||||
return <SvgCleanlab ref={ref} isdark={isdark} {...props} />;
|
||||
});
|
||||
|
|
@ -42,6 +42,8 @@ export const lazyIconsMapping = {
|
|||
import("@/icons/Cassandra").then((mod) => ({ default: mod.CassandraIcon })),
|
||||
Chroma: () =>
|
||||
import("@/icons/ChromaIcon").then((mod) => ({ default: mod.ChromaIcon })),
|
||||
Cleanlab: () =>
|
||||
import("@/icons/Cleanlab").then((mod) => ({ default: mod.CleanlabIcon })),
|
||||
Clickhouse: () =>
|
||||
import("@/icons/Clickhouse").then((mod) => ({
|
||||
default: mod.ClickhouseIcon,
|
||||
|
|
|
|||
|
|
@ -281,6 +281,7 @@ export const SIDEBAR_BUNDLES = [
|
|||
name: "homeassistant",
|
||||
icon: "HomeAssistant",
|
||||
},
|
||||
{ display_name: "Cleanlab", name: "cleanlab", icon: "Cleanlab" },
|
||||
{ display_name: "Search", name: "search", icon: "Search" },
|
||||
{ display_name: "Tavily", name: "tavily", icon: "TavilyIcon" },
|
||||
];
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue