* docs-links * Update url.py --------- Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
184 lines
7 KiB
Python
184 lines
7 KiB
Python
from pydantic import BaseModel, Field, create_model
|
|
from trustcall import create_extractor
|
|
|
|
from langflow.base.models.chat_result import get_chat_result
|
|
from langflow.custom.custom_component.component import Component
|
|
from langflow.helpers.base_model import build_model_from_schema
|
|
from langflow.io import (
|
|
HandleInput,
|
|
MessageTextInput,
|
|
MultilineInput,
|
|
Output,
|
|
TableInput,
|
|
)
|
|
from langflow.schema.data import Data
|
|
from langflow.schema.table import EditMode
|
|
|
|
|
|
class StructuredOutputComponent(Component):
    """Component that asks a language model for data matching a user-defined schema.

    The rows of ``output_schema`` are turned into a Pydantic model via
    ``build_model_from_schema``; that model is wrapped in a second model with a
    single list-valued ``objects`` field and passed to ``create_extractor``
    together with the connected language model. ``build_structured_output``
    returns the one extracted object as a ``Data`` instance.
    """

    display_name = "Structured Output"
    description = "Uses an LLM to generate structured data. Ideal for extraction and consistency."
    documentation: str = "https://docs.langflow.org/components-processing#structured-output"
    name = "StructuredOutput"
    icon = "braces"

    inputs = [
        HandleInput(
            name="llm",
            display_name="Language Model",
            info="The language model to use to generate the structured output.",
            input_types=["LanguageModel"],
            required=True,
        ),
        MultilineInput(
            name="input_value",
            display_name="Input Message",
            info="The input message to the language model.",
            tool_mode=True,
            required=True,
        ),
        MultilineInput(
            name="system_prompt",
            display_name="Format Instructions",
            info="The instructions to the language model for formatting the output.",
            # Adjacent literals are concatenated verbatim, so every fragment
            # except the last must end with a space.
            value=(
                "You are an AI that extracts one structured JSON object from unstructured text. "
                "Use a predefined schema with expected types (str, int, float, bool, dict). "
                "If multiple structures exist, extract only the first most complete one. "
                "Fill missing or ambiguous values with defaults: null for missing values. "
                "Ignore duplicates and partial repeats. "
                "Always return one valid JSON, never throw errors or return multiple objects. "
                "Output: A single well-formed JSON object, and nothing else."
            ),
            required=True,
            advanced=True,
        ),
        MessageTextInput(
            name="schema_name",
            display_name="Schema Name",
            info="Provide a name for the output data schema.",
            advanced=True,
        ),
        TableInput(
            name="output_schema",
            display_name="Output Schema",
            info="Define the structure and data types for the model's output.",
            required=True,
            # TODO: remove default value
            table_schema=[
                {
                    "name": "name",
                    "display_name": "Name",
                    "type": "str",
                    "description": "Specify the name of the output field.",
                    "default": "field",
                    "edit_mode": EditMode.INLINE,
                },
                {
                    "name": "description",
                    "display_name": "Description",
                    "type": "str",
                    "description": "Describe the purpose of the output field.",
                    "default": "description of field",
                    "edit_mode": EditMode.POPOVER,
                },
                {
                    "name": "type",
                    "display_name": "Type",
                    "type": "str",
                    "edit_mode": EditMode.INLINE,
                    "description": ("Indicate the data type of the output field (e.g., str, int, float, bool, dict)."),
                    "options": ["str", "int", "float", "bool", "dict"],
                    "default": "str",
                },
                {
                    "name": "multiple",
                    "display_name": "As List",
                    "type": "boolean",
                    "description": "Set to True if this output field should be a list of the specified type.",
                    "default": "False",
                    "edit_mode": EditMode.INLINE,
                },
            ],
            value=[
                {
                    "name": "field",
                    "description": "description of field",
                    "type": "str",
                    "multiple": "False",
                }
            ],
        ),
    ]

    outputs = [
        Output(
            name="structured_output",
            display_name="Structured Output",
            method="build_structured_output",
        ),
    ]

    def build_structured_output_base(self):
        """Run the extractor against the input and return the extracted objects.

        Returns:
            The value stored under ``"objects"`` in the first entry of the
            result's ``"responses"`` list (after dumping a ``BaseModel`` to a
            dict). The raw result is returned unchanged when it is not a dict
            or carries no responses, and the first response itself is returned
            when it is neither a ``BaseModel`` nor a dict or lacks ``"objects"``.

        Raises:
            TypeError: If the language model has no ``with_structured_output``
                attribute, or ``create_extractor`` raises ``NotImplementedError``.
            ValueError: If ``output_schema`` is empty.
        """
        schema_name = self.schema_name or "OutputModel"

        if not hasattr(self.llm, "with_structured_output"):
            msg = "Language model does not support structured output."
            raise TypeError(msg)
        if not self.output_schema:
            msg = "Output schema cannot be empty"
            raise ValueError(msg)

        output_model_ = build_model_from_schema(self.output_schema)

        # Wrap the row model in a list-valued "objects" field; the extraction
        # result is read back from that key below.
        output_model = create_model(
            schema_name,
            __doc__=f"A list of {schema_name}.",
            objects=(list[output_model_], Field(description=f"A list of {schema_name}.")),  # type: ignore[valid-type]
        )

        try:
            llm_with_structured_output = create_extractor(self.llm, tools=[output_model])
        except NotImplementedError as exc:
            msg = f"{self.llm.__class__.__name__} does not support structured output."
            raise TypeError(msg) from exc

        config_dict = {
            "run_name": self.display_name,
            "project_name": self.get_project_name(),
            "callbacks": self.get_langchain_callbacks(),
        }
        result = get_chat_result(
            runnable=llm_with_structured_output,
            system_message=self.system_prompt,
            input_value=self.input_value,
            config=config_dict,
        )

        # Defensive: pass through anything that is not the expected dict result.
        if not isinstance(result, dict):
            return result

        responses = result.get("responses", [])
        if not responses:
            return result

        # Only the first response is used; a BaseModel is dumped to a plain dict.
        first_response = responses[0]
        structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response

        # A response that is neither a BaseModel nor a dict has no "objects"
        # key to read; return it as-is so the caller's validation reports it
        # instead of failing here with an AttributeError.
        if not isinstance(structured_data, dict):
            return structured_data

        return structured_data.get("objects", structured_data)

    def build_structured_output(self) -> Data:
        """Return the single extracted object wrapped in ``Data``.

        Raises:
            ValueError: If the base output is not a non-empty list, or if it
                contains more than one item.
        """
        output = self.build_structured_output_base()
        if not isinstance(output, list) or not output:
            # handle empty or unexpected type case
            msg = "No structured output returned"
            raise ValueError(msg)
        if len(output) != 1:
            msg = "Multiple structured outputs returned"
            raise ValueError(msg)
        return Data(data=output[0])
|