diff --git a/src/backend/base/langflow/components/processing/structured_output.py b/src/backend/base/langflow/components/processing/structured_output.py index 637c19600..e47a66656 100644 --- a/src/backend/base/langflow/components/processing/structured_output.py +++ b/src/backend/base/langflow/components/processing/structured_output.py @@ -12,6 +12,7 @@ from langflow.io import ( TableInput, ) from langflow.schema.data import Data +from langflow.schema.dataframe import DataFrame from langflow.schema.table import EditMode @@ -42,13 +43,13 @@ class StructuredOutputComponent(Component): display_name="Format Instructions", info="The instructions to the language model for formatting the output.", value=( - "You are an AI that extracts one structured JSON object from unstructured text. " + "You are an AI that extracts structured JSON objects from unstructured text. " "Use a predefined schema with expected types (str, int, float, bool, dict). " - "If multiple structures exist, extract only the first most complete one. " + "Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. " "Fill missing or ambiguous values with defaults: null for missing values. " - "Ignore duplicates and partial repeats. " - "Always return one valid JSON, never throw errors or return multiple objects." - "Output: A single well-formed JSON object, and nothing else." + "Remove exact duplicates but keep variations that have different field values. " + "Always return valid JSON in the expected format, never throw errors. " + "If multiple objects can be extracted, return them all in the structured format." ), required=True, advanced=True, @@ -117,6 +118,11 @@ class StructuredOutputComponent(Component): display_name="Structured Output", method="build_structured_output", ), + Output( + name="dataframe_output", + display_name="Structured Output", + method="build_structured_dataframe", + ), ] def build_structured_output_base(self): @@ -178,7 +184,19 @@ class StructuredOutputComponent(Component): # handle empty or unexpected type case msg = "No structured output returned" raise ValueError(msg) - if len(output) != 1: - msg = "Multiple structured outputs returned" + if len(output) == 1: + return Data(data=output[0]) + if len(output) > 1: + # Multiple outputs - wrap them in a results container + return Data(data={"results": output}) + return Data() + + def build_structured_dataframe(self) -> DataFrame: + output = self.build_structured_output_base() + if not isinstance(output, list) or not output: + # handle empty or unexpected type case + msg = "No structured output returned" raise ValueError(msg) - return Data(data=output[0]) + data_list = [Data(data=output[0])] if len(output) == 1 else [Data(data=item) for item in output] + + return DataFrame(data_list) diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Financial Report Parser.json b/src/backend/base/langflow/initial_setup/starter_projects/Financial Report Parser.json index cb292e652..e4c928789 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Financial Report Parser.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Financial Report Parser.json @@ -1303,6 +1303,20 @@ "Data" ], "value": "__UNDEFINED__" + }, + { + "allows_loop": false, + "cache": true, + "display_name": "Structured Output", + "group_outputs": false, + "method": "build_structured_dataframe", + "name": "dataframe_output", + "selected": "DataFrame", + "tool_mode": true, + "types": [ + "DataFrame" + ], + "value": "__UNDEFINED__" } ], "pinned": false, @@ -1324,7 +1338,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts one structured JSON object from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"If multiple structures exist, extract only the first most complete one. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Ignore duplicates and partial repeats. \"\n \"Always return one valid JSON, never throw errors or return multiple objects.\"\n \"Output: A single well-formed JSON object, and nothing else.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) != 1:\n msg = \"Multiple structured outputs returned\"\n raise ValueError(msg)\n return Data(data=output[0])\n" + "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts structured JSON objects from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Remove exact duplicates but keep variations that have different field values. \"\n \"Always return valid JSON in the expected format, never throw errors. \"\n \"If multiple objects can be extracted, return them all in the structured format.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n Output(\n name=\"dataframe_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_dataframe\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) == 1:\n return Data(data=output[0])\n if len(output) > 1:\n # Multiple outputs - wrap them in a results container\n return Data(data={\"results\": output})\n return Data()\n\n def build_structured_dataframe(self) -> DataFrame:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n data_list = [Data(data=output[0])] if len(output) == 1 else [Data(data=item) for item in output]\n\n return DataFrame(data_list)\n" }, "input_value": { "_input_type": "MessageTextInput", @@ -1518,7 +1532,7 @@ "trace_as_input": true, "trace_as_metadata": true, "type": "str", - "value": "You are an AI system designed to extract structured information from unstructured text.Given the input_text, return a JSON object with predefined keys based on the expected structure.Extract values accurately and format them according to the specified type (e.g., string, integer, float, date).If a value is missing or cannot be determined, return a default (e.g., null, 0, or 'N/A').If multiple instances of the expected structure exist within the input_text, stream each as a separate JSON object." + "value": "You are an AI that extracts structured JSON objects from unstructured text. Use a predefined schema with expected types (str, int, float, bool, dict). Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. Fill missing or ambiguous values with defaults: null for missing values. Remove exact duplicates but keep variations that have different field values. Always return valid JSON in the expected format, never throw errors. If multiple objects can be extracted, return them all in the structured format." } }, "tool_mode": false diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Hybrid Search RAG.json b/src/backend/base/langflow/initial_setup/starter_projects/Hybrid Search RAG.json index b14fc799e..0b931edee 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Hybrid Search RAG.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Hybrid Search RAG.json @@ -2583,6 +2583,20 @@ "Data" ], "value": "__UNDEFINED__" + }, + { + "allows_loop": false, + "cache": true, + "display_name": "Structured Output", + "group_outputs": false, + "method": "build_structured_dataframe", + "name": "dataframe_output", + "selected": "DataFrame", + "tool_mode": true, + "types": [ + "DataFrame" + ], + "value": "__UNDEFINED__" } ], "pinned": false, @@ -2604,7 +2618,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts one structured JSON object from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"If multiple structures exist, extract only the first most complete one. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Ignore duplicates and partial repeats. \"\n \"Always return one valid JSON, never throw errors or return multiple objects.\"\n \"Output: A single well-formed JSON object, and nothing else.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) != 1:\n msg = \"Multiple structured outputs returned\"\n raise ValueError(msg)\n return Data(data=output[0])\n" + "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts structured JSON objects from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Remove exact duplicates but keep variations that have different field values. \"\n \"Always return valid JSON in the expected format, never throw errors. \"\n \"If multiple objects can be extracted, return them all in the structured format.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n Output(\n name=\"dataframe_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_dataframe\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) == 1:\n return Data(data=output[0])\n if len(output) > 1:\n # Multiple outputs - wrap them in a results container\n return Data(data={\"results\": output})\n return Data()\n\n def build_structured_dataframe(self) -> DataFrame:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n data_list = [Data(data=output[0])] if len(output) == 1 else [Data(data=item) for item in output]\n\n return DataFrame(data_list)\n" }, "input_value": { "_input_type": "MessageTextInput", @@ -2786,7 +2800,7 @@ "trace_as_input": true, "trace_as_metadata": true, "type": "str", - "value": "You are an AI system designed to extract structured information from unstructured text.Given the input_text, return a JSON object with predefined keys based on the expected structure.Extract values accurately and format them according to the specified type (e.g., string, integer, float, date).If a value is missing or cannot be determined, return a default (e.g., null, 0, or 'N/A').If multiple instances of the expected structure exist within the input_text, stream each as a separate JSON object." + "value": "You are an AI that extracts structured JSON objects from unstructured text. Use a predefined schema with expected types (str, int, float, bool, dict). Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. Fill missing or ambiguous values with defaults: null for missing values. Remove exact duplicates but keep variations that have different field values. Always return valid JSON in the expected format, never throw errors. If multiple objects can be extracted, return them all in the structured format." } }, "tool_mode": false diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Image Sentiment Analysis.json b/src/backend/base/langflow/initial_setup/starter_projects/Image Sentiment Analysis.json index 968bd2173..67bd79431 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Image Sentiment Analysis.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Image Sentiment Analysis.json @@ -1016,6 +1016,20 @@ "Data" ], "value": "__UNDEFINED__" + }, + { + "allows_loop": false, + "cache": true, + "display_name": "Structured Output", + "group_outputs": false, + "method": "build_structured_dataframe", + "name": "dataframe_output", + "selected": "DataFrame", + "tool_mode": true, + "types": [ + "DataFrame" + ], + "value": "__UNDEFINED__" } ], "pinned": false, @@ -1037,7 +1051,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts one structured JSON object from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"If multiple structures exist, extract only the first most complete one. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Ignore duplicates and partial repeats. \"\n \"Always return one valid JSON, never throw errors or return multiple objects.\"\n \"Output: A single well-formed JSON object, and nothing else.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) != 1:\n msg = \"Multiple structured outputs returned\"\n raise ValueError(msg)\n return Data(data=output[0])\n" + "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts structured JSON objects from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Remove exact duplicates but keep variations that have different field values. \"\n \"Always return valid JSON in the expected format, never throw errors. \"\n \"If multiple objects can be extracted, return them all in the structured format.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n Output(\n name=\"dataframe_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_dataframe\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) == 1:\n return Data(data=output[0])\n if len(output) > 1:\n # Multiple outputs - wrap them in a results container\n return Data(data={\"results\": output})\n return Data()\n\n def build_structured_dataframe(self) -> DataFrame:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n data_list = [Data(data=output[0])] if len(output) == 1 else [Data(data=item) for item in output]\n\n return DataFrame(data_list)\n" }, "input_value": { "_input_type": "MessageTextInput", @@ -1226,7 +1240,7 @@ "trace_as_input": true, "trace_as_metadata": true, "type": "str", - "value": "You are an AI system designed to extract structured information from unstructured text.Given the input_text, return a JSON object with predefined keys based on the expected structure.Extract values accurately and format them according to the specified type (e.g., string, integer, float, date).If a value is missing or cannot be determined, return a default (e.g., null, 0, or 'N/A').If multiple instances of the expected structure exist within the input_text, stream each as a separate JSON object." + "value": "You are an AI that extracts structured JSON objects from unstructured text. Use a predefined schema with expected types (str, int, float, bool, dict). Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. Fill missing or ambiguous values with defaults: null for missing values. Remove exact duplicates but keep variations that have different field values. Always return valid JSON in the expected format, never throw errors. If multiple objects can be extracted, return them all in the structured format." } }, "tool_mode": false diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Market Research.json b/src/backend/base/langflow/initial_setup/starter_projects/Market Research.json index 1d6ff103a..5f262b8d7 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Market Research.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Market Research.json @@ -849,6 +849,20 @@ "Data" ], "value": "__UNDEFINED__" + }, + { + "allows_loop": false, + "cache": true, + "display_name": "Structured Output", + "group_outputs": false, + "method": "build_structured_dataframe", + "name": "dataframe_output", + "selected": "DataFrame", + "tool_mode": true, + "types": [ + "DataFrame" + ], + "value": "__UNDEFINED__" } ], "pinned": false, @@ -870,7 +884,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts one structured JSON object from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"If multiple structures exist, extract only the first most complete one. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Ignore duplicates and partial repeats. \"\n \"Always return one valid JSON, never throw errors or return multiple objects.\"\n \"Output: A single well-formed JSON object, and nothing else.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) != 1:\n msg = \"Multiple structured outputs returned\"\n raise ValueError(msg)\n return Data(data=output[0])\n" + "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts structured JSON objects from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Remove exact duplicates but keep variations that have different field values. \"\n \"Always return valid JSON in the expected format, never throw errors. \"\n \"If multiple objects can be extracted, return them all in the structured format.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n Output(\n name=\"dataframe_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_dataframe\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) == 1:\n return Data(data=output[0])\n if len(output) > 1:\n # Multiple outputs - wrap them in a results container\n return Data(data={\"results\": output})\n return Data()\n\n def build_structured_dataframe(self) -> DataFrame:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n data_list = [Data(data=output[0])] if len(output) == 1 else [Data(data=item) for item in output]\n\n return DataFrame(data_list)\n" }, "input_value": { "_input_type": "MessageTextInput", @@ -1108,7 +1122,7 @@ "trace_as_input": true, "trace_as_metadata": true, "type": "str", - "value": "You are an AI system designed to extract structured information from unstructured text.Given the input_text, return a JSON object with predefined keys based on the expected structure.Extract values accurately and format them according to the specified type (e.g., string, integer, float, date).If a value is missing or cannot be determined, return a default (e.g., null, 0, or 'N/A').If multiple instances of the expected structure exist within the input_text, stream each as a separate JSON object." + "value": "You are an AI that extracts structured JSON objects from unstructured text. Use a predefined schema with expected types (str, int, float, bool, dict). Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. Fill missing or ambiguous values with defaults: null for missing values. Remove exact duplicates but keep variations that have different field values. Always return valid JSON in the expected format, never throw errors. If multiple objects can be extracted, return them all in the structured format." } }, "tool_mode": false diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Portfolio Website Code Generator.json b/src/backend/base/langflow/initial_setup/starter_projects/Portfolio Website Code Generator.json index 4f14a0609..4ba1b2e0b 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Portfolio Website Code Generator.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Portfolio Website Code Generator.json @@ -776,6 +776,20 @@ "Data" ], "value": "__UNDEFINED__" + }, + { + "allows_loop": false, + "cache": true, + "display_name": "Structured Output", + "group_outputs": false, + "method": "build_structured_dataframe", + "name": "dataframe_output", + "selected": "DataFrame", + "tool_mode": true, + "types": [ + "DataFrame" + ], + "value": "__UNDEFINED__" } ], "pinned": false, @@ -797,7 +811,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts one structured JSON object from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"If multiple structures exist, extract only the first most complete one. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Ignore duplicates and partial repeats. \"\n \"Always return one valid JSON, never throw errors or return multiple objects.\"\n \"Output: A single well-formed JSON object, and nothing else.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) != 1:\n msg = \"Multiple structured outputs returned\"\n raise ValueError(msg)\n return Data(data=output[0])\n" + "value": "from pydantic import BaseModel, Field, create_model\nfrom trustcall import create_extractor\n\nfrom langflow.base.models.chat_result import get_chat_result\nfrom langflow.custom.custom_component.component import Component\nfrom langflow.helpers.base_model import build_model_from_schema\nfrom langflow.io import (\n HandleInput,\n MessageTextInput,\n MultilineInput,\n Output,\n TableInput,\n)\nfrom langflow.schema.data import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.table import EditMode\n\n\nclass StructuredOutputComponent(Component):\n display_name = \"Structured Output\"\n description = \"Uses an LLM to generate structured data. Ideal for extraction and consistency.\"\n documentation: str = \"https://docs.langflow.org/components-processing#structured-output\"\n name = \"StructuredOutput\"\n icon = \"braces\"\n\n inputs = [\n HandleInput(\n name=\"llm\",\n display_name=\"Language Model\",\n info=\"The language model to use to generate the structured output.\",\n input_types=[\"LanguageModel\"],\n required=True,\n ),\n MultilineInput(\n name=\"input_value\",\n display_name=\"Input Message\",\n info=\"The input message to the language model.\",\n tool_mode=True,\n required=True,\n ),\n MultilineInput(\n name=\"system_prompt\",\n display_name=\"Format Instructions\",\n info=\"The instructions to the language model for formatting the output.\",\n value=(\n \"You are an AI that extracts structured JSON objects from unstructured text. \"\n \"Use a predefined schema with expected types (str, int, float, bool, dict). \"\n \"Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. \"\n \"Fill missing or ambiguous values with defaults: null for missing values. \"\n \"Remove exact duplicates but keep variations that have different field values. \"\n \"Always return valid JSON in the expected format, never throw errors. \"\n \"If multiple objects can be extracted, return them all in the structured format.\"\n ),\n required=True,\n advanced=True,\n ),\n MessageTextInput(\n name=\"schema_name\",\n display_name=\"Schema Name\",\n info=\"Provide a name for the output data schema.\",\n advanced=True,\n ),\n TableInput(\n name=\"output_schema\",\n display_name=\"Output Schema\",\n info=\"Define the structure and data types for the model's output.\",\n required=True,\n # TODO: remove deault value\n table_schema=[\n {\n \"name\": \"name\",\n \"display_name\": \"Name\",\n \"type\": \"str\",\n \"description\": \"Specify the name of the output field.\",\n \"default\": \"field\",\n \"edit_mode\": EditMode.INLINE,\n },\n {\n \"name\": \"description\",\n \"display_name\": \"Description\",\n \"type\": \"str\",\n \"description\": \"Describe the purpose of the output field.\",\n \"default\": \"description of field\",\n \"edit_mode\": EditMode.POPOVER,\n },\n {\n \"name\": \"type\",\n \"display_name\": \"Type\",\n \"type\": \"str\",\n \"edit_mode\": EditMode.INLINE,\n \"description\": (\"Indicate the data type of the output field (e.g., str, int, float, bool, dict).\"),\n \"options\": [\"str\", \"int\", \"float\", \"bool\", \"dict\"],\n \"default\": \"str\",\n },\n {\n \"name\": \"multiple\",\n \"display_name\": \"As List\",\n \"type\": \"boolean\",\n \"description\": \"Set to True if this output field should be a list of the specified type.\",\n \"default\": \"False\",\n \"edit_mode\": EditMode.INLINE,\n },\n ],\n value=[\n {\n \"name\": \"field\",\n \"description\": \"description of field\",\n \"type\": \"str\",\n \"multiple\": \"False\",\n }\n ],\n ),\n ]\n\n outputs = [\n Output(\n name=\"structured_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_output\",\n ),\n Output(\n name=\"dataframe_output\",\n display_name=\"Structured Output\",\n method=\"build_structured_dataframe\",\n ),\n ]\n\n def build_structured_output_base(self):\n schema_name = self.schema_name or \"OutputModel\"\n\n if not hasattr(self.llm, \"with_structured_output\"):\n msg = \"Language model does not support structured output.\"\n raise TypeError(msg)\n if not self.output_schema:\n msg = \"Output schema cannot be empty\"\n raise ValueError(msg)\n\n output_model_ = build_model_from_schema(self.output_schema)\n\n output_model = create_model(\n schema_name,\n __doc__=f\"A list of {schema_name}.\",\n objects=(list[output_model_], Field(description=f\"A list of {schema_name}.\")), # type: ignore[valid-type]\n )\n\n try:\n llm_with_structured_output = create_extractor(self.llm, tools=[output_model])\n except NotImplementedError as exc:\n msg = f\"{self.llm.__class__.__name__} does not support structured output.\"\n raise TypeError(msg) from exc\n\n config_dict = {\n \"run_name\": self.display_name,\n \"project_name\": self.get_project_name(),\n \"callbacks\": self.get_langchain_callbacks(),\n }\n result = get_chat_result(\n runnable=llm_with_structured_output,\n system_message=self.system_prompt,\n input_value=self.input_value,\n config=config_dict,\n )\n\n # OPTIMIZATION NOTE: Simplified processing based on trustcall response structure\n # Handle non-dict responses (shouldn't happen with trustcall, but defensive)\n if not isinstance(result, dict):\n return result\n\n # Extract first response and convert BaseModel to dict\n responses = result.get(\"responses\", [])\n if not responses:\n return result\n\n # Convert BaseModel to dict (creates the \"objects\" key)\n first_response = responses[0]\n structured_data = first_response.model_dump() if isinstance(first_response, BaseModel) else first_response\n\n # Extract the objects array (guaranteed to exist due to our Pydantic model structure)\n return structured_data.get(\"objects\", structured_data)\n\n def build_structured_output(self) -> Data:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n if len(output) == 1:\n return Data(data=output[0])\n if len(output) > 1:\n # Multiple outputs - wrap them in a results container\n return Data(data={\"results\": output})\n return Data()\n\n def build_structured_dataframe(self) -> DataFrame:\n output = self.build_structured_output_base()\n if not isinstance(output, list) or not output:\n # handle empty or unexpected type case\n msg = \"No structured output returned\"\n raise ValueError(msg)\n data_list = [Data(data=output[0])] if len(output) == 1 else [Data(data=item) for item in output]\n\n return DataFrame(data_list)\n" }, "input_value": { "_input_type": "MessageTextInput", @@ -1047,7 +1061,7 @@ "trace_as_input": true, "trace_as_metadata": true, "type": "str", - "value": "You are an AI system designed to extract structured information from unstructured text.Given the input_text, return a JSON object with predefined keys based on the expected structure.Extract values accurately and format them according to the specified type (e.g., string, integer, float, date).If a value is missing or cannot be determined, return a default (e.g., null, 0, or 'N/A').If multiple instances of the expected structure exist within the input_text, stream each as a separate JSON object." + "value": "You are an AI that extracts structured JSON objects from unstructured text. Use a predefined schema with expected types (str, int, float, bool, dict). Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. Fill missing or ambiguous values with defaults: null for missing values. Remove exact duplicates but keep variations that have different field values. Always return valid JSON in the expected format, never throw errors. If multiple objects can be extracted, return them all in the structured format." } }, "tool_mode": false diff --git a/src/backend/tests/unit/components/processing/test_structured_output_component.py b/src/backend/tests/unit/components/processing/test_structured_output_component.py index a650174c7..33ab6a340 100644 --- a/src/backend/tests/unit/components/processing/test_structured_output_component.py +++ b/src/backend/tests/unit/components/processing/test_structured_output_component.py @@ -283,6 +283,119 @@ class TestStructuredOutputComponent(ComponentTestBaseWithoutClient): assert result[0]["name"] == "John Doe" assert result[0]["age"] == 30 + @pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, + reason="OPENAI_API_KEY environment variable not set", + ) + def test_with_real_openai_model_multiple_patterns(self): + # Create a real OpenAI model + llm = ChatOpenAI(model="gpt-4o-mini", temperature=0) + + # Create a component with multiple people in the input + component = StructuredOutputComponent( + llm=llm, + input_value=( + "Extract all people from this text: John Doe is 30 years old, Jane Smith is 25, and Bob Johnson is 35." + ), + schema_name="PersonInfo", + output_schema=[ + {"name": "name", "type": "str", "description": "The person's name"}, + {"name": "age", "type": "int", "description": "The person's age"}, + ], + multiple=False, + system_prompt=( + "You are an AI that extracts structured JSON objects from unstructured text. " + "Use a predefined schema with expected types (str, int, float, bool, dict). " + "Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. " + "Fill missing or ambiguous values with defaults: null for missing values. " + "Remove exact duplicates but keep variations that have different field values. " + "Always return valid JSON in the expected format, never throw errors. " + "If multiple objects can be extracted, return them all in the structured format." + ), + ) + + # Get the structured output + result = component.build_structured_output_base() + + # Verify the result contains multiple people + assert isinstance(result, list) + assert len(result) >= 3 # Should extract all three people + + # Check that we have names and ages for multiple people + names = [item["name"] for item in result if "name" in item] + ages = [item["age"] for item in result if "age" in item] + + assert len(names) >= 3 + assert len(ages) >= 3 + + # Check that we extracted the expected people (order may vary) + expected_names = ["John Doe", "Jane Smith", "Bob Johnson"] + expected_ages = [30, 25, 35] + + for expected_name in expected_names: + assert any(expected_name in name for name in names) + for expected_age in expected_ages: + assert expected_age in ages + + def test_multiple_patterns_with_duplicates_and_variations(self): + """Test that multiple patterns are extracted while removing exact duplicates but keeping variations.""" + + def mock_get_chat_result(runnable, system_message, input_value, config): # noqa: ARG001 + class MockBaseModel(BaseModel): + def model_dump(self, **__): + return { + "objects": [ + {"product": "iPhone", "price": 999.99}, + {"product": "iPhone", "price": 1099.99}, # Variation - different price + {"product": "Samsung", "price": 899.99}, + {"product": "iPhone", "price": 999.99}, # Exact duplicate - should be removed + ] + } + + return { + "messages": ["mock_message"], + "responses": [MockBaseModel()], + "response_metadata": [{"id": "mock_id"}], + "attempts": 1, + } + + component = StructuredOutputComponent( + llm=MockLanguageModel(), + input_value="Products: iPhone $999.99, iPhone $1099.99, Samsung $899.99, iPhone $999.99", + schema_name="ProductSchema", + output_schema=[ + {"name": "product", "type": "str", "description": "Product name"}, + {"name": "price", "type": "float", "description": "Product price"}, + ], + multiple=False, + system_prompt="Remove exact duplicates but keep variations that have different field values.", + ) + + with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result): + result = component.build_structured_output() + + # Check that result is a Data object + from langflow.schema.data import Data + + assert isinstance(result, Data) + + # Should have multiple results due to multiple patterns + assert isinstance(result.data, dict) + assert "results" in result.data + assert ( + len(result.data["results"]) == 4 + ) # All items returned (duplicate handling is expected to be done by LLM) + + # Verify the expected products are present + products = [item["product"] for item in result.data["results"]] + prices = [item["price"] for item in result.data["results"]] + + assert "iPhone" in products + assert "Samsung" in products + assert 999.99 in prices + assert 1099.99 in prices + assert 899.99 in prices + @pytest.mark.skipif( "OPENAI_API_KEY" not in os.environ, reason="OPENAI_API_KEY environment variable not set", @@ -618,6 +731,57 @@ class TestStructuredOutputComponent(ComponentTestBaseWithoutClient): assert result.data["field"] == "value2" assert result.data["number"] == 24 + def test_build_structured_output_returns_multiple_objects(self): + """Test that build_structured_output() returns Data object with multiple objects wrapped in results.""" + + def mock_get_chat_result(runnable, system_message, input_value, config): # noqa: ARG001 + class MockBaseModel(BaseModel): + def model_dump(self, **__): + return { + "objects": [ + {"name": "John", "age": 30}, + {"name": "Jane", "age": 25}, + {"name": "Bob", "age": 35}, + ] + } + + return { + "messages": ["mock_message"], + "responses": [MockBaseModel()], + "response_metadata": [{"id": "mock_id"}], + "attempts": 1, + } + + component = StructuredOutputComponent( + llm=MockLanguageModel(), + input_value="Extract multiple people: John is 30, Jane is 25, Bob is 35", + schema_name="PersonSchema", + output_schema=[ + {"name": "name", "type": "str", "description": "Person's name"}, + {"name": "age", "type": "int", "description": "Person's age"}, + ], + multiple=False, + system_prompt="Extract ALL relevant instances that match the schema", + ) + + with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result): + result = component.build_structured_output() + + # Check that result is a Data object + from langflow.schema.data import Data + + assert isinstance(result, Data) + + # Check that result.data is a dict with results key + assert isinstance(result.data, dict) + assert "results" in result.data + assert len(result.data["results"]) == 3 + + # Check the content of each result + assert result.data["results"][0] == {"name": "John", "age": 30} + assert result.data["results"][1] == {"name": "Jane", "age": 25} + assert result.data["results"][2] == {"name": "Bob", "age": 35} + def test_build_structured_output_returns_data_with_single_item(self): """Test that build_structured_output() returns Data object when only one item in objects.""" @@ -711,3 +875,157 @@ class TestStructuredOutputComponent(ComponentTestBaseWithoutClient): # Data object should be able to represent itself as text text_repr = result.get_text() assert isinstance(text_repr, str) + + def test_build_structured_dataframe_returns_dataframe_with_single_data(self): + """Test that build_structured_dataframe() returns DataFrame object with single Data item.""" + + def mock_get_chat_result(runnable, system_message, input_value, config): # noqa: ARG001 + class MockBaseModel(BaseModel): + def model_dump(self, **__): + return {"objects": [{"field": "value2", "number": 24}]} + + return { + "messages": ["mock_message"], + "responses": [MockBaseModel()], + "response_metadata": [{"id": "mock_id"}], + "attempts": 1, + } + + component = StructuredOutputComponent( + llm=MockLanguageModel(), + input_value="Test input", + schema_name="TestSchema", + output_schema=[ + {"name": "field", "type": "str", "description": "A test field"}, + {"name": "number", "type": "int", "description": "A test number"}, + ], + multiple=False, + system_prompt="Test system prompt", + ) + + with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result): + result = component.build_structured_dataframe() + + # Check that result is a DataFrame object + from langflow.schema.dataframe import DataFrame + + assert isinstance(result, DataFrame) + assert len(result) == 1 + assert result.iloc[0]["field"] == "value2" + assert result.iloc[0]["number"] == 24 + + # Test conversion back to Data list + data_list = result.to_data_list() + assert len(data_list) == 1 + assert data_list[0].data == {"field": "value2", "number": 24} + + def test_build_structured_dataframe_returns_dataframe_with_multiple_data(self): + """Test that build_structured_dataframe() returns DataFrame object with multiple Data items.""" + + def mock_get_chat_result(runnable, system_message, input_value, config): # noqa: ARG001 + class MockBaseModel(BaseModel): + def model_dump(self, **__): + return { + "objects": [ + {"name": "John", "age": 30}, + {"name": "Jane", "age": 25}, + {"name": "Bob", "age": 35}, + ] + } + + return { + "messages": ["mock_message"], + "responses": [MockBaseModel()], + "response_metadata": [{"id": "mock_id"}], + "attempts": 1, + } + + component = StructuredOutputComponent( + llm=MockLanguageModel(), + input_value="Test input with multiple people", + schema_name="PersonSchema", + output_schema=[ + {"name": "name", "type": "str", "description": "Person's name"}, + {"name": "age", "type": "int", "description": "Person's age"}, + ], + multiple=False, + system_prompt="Test system prompt", + ) + + with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result): + result = component.build_structured_dataframe() + + # Check that result is a DataFrame object + from langflow.schema.dataframe import DataFrame + + assert isinstance(result, DataFrame) + assert len(result) == 3 + assert result.iloc[0]["name"] == "John" + assert result.iloc[0]["age"] == 30 + assert result.iloc[1]["name"] == "Jane" + assert result.iloc[1]["age"] == 25 + assert result.iloc[2]["name"] == "Bob" + assert result.iloc[2]["age"] == 35 + + # Test conversion back to Data list + data_list = result.to_data_list() + assert len(data_list) == 3 + assert data_list[0].data == {"name": "John", "age": 30} + assert data_list[1].data == {"name": "Jane", "age": 25} + assert data_list[2].data == {"name": "Bob", "age": 35} + + def test_build_structured_dataframe_fails_when_base_returns_non_list(self): + """Test that build_structured_dataframe() fails when base method returns non-list.""" + + def mock_get_chat_result(runnable, system_message, input_value, config): # noqa: ARG001 + return { + "messages": ["mock_message"], + "responses": [{"single_item": "value"}], + "response_metadata": [{"id": "mock_id"}], + "attempts": 1, + } + + component = StructuredOutputComponent( + llm=MockLanguageModel(), + input_value="Test input", + schema_name="TestSchema", + output_schema=[{"name": "field", "type": "str", "description": "A test field"}], + multiple=False, + system_prompt="Test system prompt", + ) + + with ( + patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result), + pytest.raises(ValueError, match="No structured output returned"), + ): + component.build_structured_dataframe() + + def test_build_structured_dataframe_fails_when_empty_output(self): + """Test that build_structured_dataframe() fails when base method returns empty list.""" + + def mock_get_chat_result(runnable, system_message, input_value, config): # noqa: ARG001 + class MockBaseModel(BaseModel): + def model_dump(self, **__): + return {"objects": []} + + return { + "messages": ["mock_message"], + "responses": [MockBaseModel()], + "response_metadata": [{"id": "mock_id"}], + "attempts": 1, + } + + component = StructuredOutputComponent( + llm=MockLanguageModel(), + input_value="Test input", + schema_name="TestSchema", + output_schema=[{"name": "field", "type": "str", "description": "A test field"}], + multiple=False, + system_prompt="Test system prompt", + ) + + with ( + patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result), + pytest.raises(ValueError, match="No structured output returned"), + ): + component.build_structured_dataframe()