feat: add DataFrame output to Structured Output component (#8842)

* feat: add DataFrame output to Structured Output component - Add new DataFrame output alongside existing Data output for structured data - Single output: creates DataFrame with one row - Multiple outputs: creates DataFrame with multiple rows, each containing a Data object - Maintains backward compatibility with existing Data output - Includes comprehensive test coverage for both single and multiple output scenarios 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * [autofix.ci] apply automated fixes * Update structured_output.py * [autofix.ci] apply automated fixes * update to tests * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * update to format instructions * Update system prompt in structured output component test Expanded the system prompt in the test for StructuredOutputComponent to provide more detailed extraction instructions, including handling of missing values, duplicates, and output format requirements. This improves test clarity and better simulates real-world extraction scenarios. * [autofix.ci] apply automated fixes * Update structured_output.py * fix: return empty Data object instead of None in StructuredOutputComponent Updated the return statement in the StructuredOutputComponent to return an empty Data object when there are no outputs, improving consistency in the output handling. * [autofix.ci] apply automated fixes --------- Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Edwin Jose <edwin.jose@datastax.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
2025-07-09 18:57:07 -03:00 · 2025-07-09 18:57:07 -03:00 · a0e484c0eb
commit a0e484c0eb
parent e0400f29eb
7 changed files with 424 additions and 18 deletions
--- a/src/backend/base/langflow/components/processing/structured_output.py
+++ b/src/backend/base/langflow/components/processing/structured_output.py
@ -12,6 +12,7 @@ from langflow.io import (
    TableInput,
 )
 from langflow.schema.data import Data
+from langflow.schema.dataframe import DataFrame
 from langflow.schema.table import EditMode


@ -42,13 +43,13 @@ class StructuredOutputComponent(Component):
            display_name="Format Instructions",
            info="The instructions to the language model for formatting the output.",
            value=(
-                "You are an AI that extracts one structured JSON object from unstructured text. "
+                "You are an AI that extracts structured JSON objects from unstructured text. "
                "Use a predefined schema with expected types (str, int, float, bool, dict). "
-                "If multiple structures exist, extract only the first most complete one. "
+                "Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. "
                "Fill missing or ambiguous values with defaults: null for missing values. "
-                "Ignore duplicates and partial repeats. "
-                "Always return one valid JSON, never throw errors or return multiple objects."
-                "Output: A single well-formed JSON object, and nothing else."
+                "Remove exact duplicates but keep variations that have different field values. "
+                "Always return valid JSON in the expected format, never throw errors. "
+                "If multiple objects can be extracted, return them all in the structured format."
            ),
            required=True,
            advanced=True,
@ -117,6 +118,11 @@ class StructuredOutputComponent(Component):
            display_name="Structured Output",
            method="build_structured_output",
        ),
+        Output(
+            name="dataframe_output",
+            display_name="Structured Output",
+            method="build_structured_dataframe",
+        ),
    ]

    def build_structured_output_base(self):
@ -178,7 +184,19 @@ class StructuredOutputComponent(Component):
            # handle empty or unexpected type case
            msg = "No structured output returned"
            raise ValueError(msg)
-        if len(output) != 1:
-            msg = "Multiple structured outputs returned"
+        if len(output) == 1:
+            return Data(data=output[0])
+        if len(output) > 1:
+            # Multiple outputs - wrap them in a results container
+            return Data(data={"results": output})
+        return Data()
+
+    def build_structured_dataframe(self) -> DataFrame:
+        output = self.build_structured_output_base()
+        if not isinstance(output, list) or not output:
+            # handle empty or unexpected type case
+            msg = "No structured output returned"
            raise ValueError(msg)
-        return Data(data=output[0])
+        data_list = [Data(data=output[0])] if len(output) == 1 else [Data(data=item) for item in output]
+
+        return DataFrame(data_list)
--- a/src/backend/base/langflow/initial_setup/starter_projects/Financial
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Financial
--- a/src/backend/base/langflow/initial_setup/starter_projects/Hybrid
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Hybrid
--- a/src/backend/base/langflow/initial_setup/starter_projects/Image
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Image
--- a/src/backend/base/langflow/initial_setup/starter_projects/Market
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Market
--- a/src/backend/base/langflow/initial_setup/starter_projects/Portfolio
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Portfolio
--- a/src/backend/tests/unit/components/processing/test_structured_output_component.py
+++ b/src/backend/tests/unit/components/processing/test_structured_output_component.py
@ -283,6 +283,119 @@ class TestStructuredOutputComponent(ComponentTestBaseWithoutClient):
        assert result[0]["name"] == "John Doe"
        assert result[0]["age"] == 30

+    @pytest.mark.skipif(
+        "OPENAI_API_KEY" not in os.environ,
+        reason="OPENAI_API_KEY environment variable not set",
+    )
+    def test_with_real_openai_model_multiple_patterns(self):
+        # Create a real OpenAI model
+        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
+
+        # Create a component with multiple people in the input
+        component = StructuredOutputComponent(
+            llm=llm,
+            input_value=(
+                "Extract all people from this text: John Doe is 30 years old, Jane Smith is 25, and Bob Johnson is 35."
+            ),
+            schema_name="PersonInfo",
+            output_schema=[
+                {"name": "name", "type": "str", "description": "The person's name"},
+                {"name": "age", "type": "int", "description": "The person's age"},
+            ],
+            multiple=False,
+            system_prompt=(
+                "You are an AI that extracts structured JSON objects from unstructured text. "
+                "Use a predefined schema with expected types (str, int, float, bool, dict). "
+                "Extract ALL relevant instances that match the schema - if multiple patterns exist, capture them all. "
+                "Fill missing or ambiguous values with defaults: null for missing values. "
+                "Remove exact duplicates but keep variations that have different field values. "
+                "Always return valid JSON in the expected format, never throw errors. "
+                "If multiple objects can be extracted, return them all in the structured format."
+            ),
+        )
+
+        # Get the structured output
+        result = component.build_structured_output_base()
+
+        # Verify the result contains multiple people
+        assert isinstance(result, list)
+        assert len(result) >= 3  # Should extract all three people
+
+        # Check that we have names and ages for multiple people
+        names = [item["name"] for item in result if "name" in item]
+        ages = [item["age"] for item in result if "age" in item]
+
+        assert len(names) >= 3
+        assert len(ages) >= 3
+
+        # Check that we extracted the expected people (order may vary)
+        expected_names = ["John Doe", "Jane Smith", "Bob Johnson"]
+        expected_ages = [30, 25, 35]
+
+        for expected_name in expected_names:
+            assert any(expected_name in name for name in names)
+        for expected_age in expected_ages:
+            assert expected_age in ages
+
+    def test_multiple_patterns_with_duplicates_and_variations(self):
+        """Test that multiple patterns are extracted while removing exact duplicates but keeping variations."""
+
+        def mock_get_chat_result(runnable, system_message, input_value, config):  # noqa: ARG001
+            class MockBaseModel(BaseModel):
+                def model_dump(self, **__):
+                    return {
+                        "objects": [
+                            {"product": "iPhone", "price": 999.99},
+                            {"product": "iPhone", "price": 1099.99},  # Variation - different price
+                            {"product": "Samsung", "price": 899.99},
+                            {"product": "iPhone", "price": 999.99},  # Exact duplicate - should be removed
+                        ]
+                    }
+
+            return {
+                "messages": ["mock_message"],
+                "responses": [MockBaseModel()],
+                "response_metadata": [{"id": "mock_id"}],
+                "attempts": 1,
+            }
+
+        component = StructuredOutputComponent(
+            llm=MockLanguageModel(),
+            input_value="Products: iPhone $999.99, iPhone $1099.99, Samsung $899.99, iPhone $999.99",
+            schema_name="ProductSchema",
+            output_schema=[
+                {"name": "product", "type": "str", "description": "Product name"},
+                {"name": "price", "type": "float", "description": "Product price"},
+            ],
+            multiple=False,
+            system_prompt="Remove exact duplicates but keep variations that have different field values.",
+        )
+
+        with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result):
+            result = component.build_structured_output()
+
+            # Check that result is a Data object
+            from langflow.schema.data import Data
+
+            assert isinstance(result, Data)
+
+            # Should have multiple results due to multiple patterns
+            assert isinstance(result.data, dict)
+            assert "results" in result.data
+            assert (
+                len(result.data["results"]) == 4
+            )  # All items returned (duplicate handling is expected to be done by LLM)
+
+            # Verify the expected products are present
+            products = [item["product"] for item in result.data["results"]]
+            prices = [item["price"] for item in result.data["results"]]
+
+            assert "iPhone" in products
+            assert "Samsung" in products
+            assert 999.99 in prices
+            assert 1099.99 in prices
+            assert 899.99 in prices
+
    @pytest.mark.skipif(
        "OPENAI_API_KEY" not in os.environ,
        reason="OPENAI_API_KEY environment variable not set",
@ -618,6 +731,57 @@ class TestStructuredOutputComponent(ComponentTestBaseWithoutClient):
            assert result.data["field"] == "value2"
            assert result.data["number"] == 24

+    def test_build_structured_output_returns_multiple_objects(self):
+        """Test that build_structured_output() returns Data object with multiple objects wrapped in results."""
+
+        def mock_get_chat_result(runnable, system_message, input_value, config):  # noqa: ARG001
+            class MockBaseModel(BaseModel):
+                def model_dump(self, **__):
+                    return {
+                        "objects": [
+                            {"name": "John", "age": 30},
+                            {"name": "Jane", "age": 25},
+                            {"name": "Bob", "age": 35},
+                        ]
+                    }
+
+            return {
+                "messages": ["mock_message"],
+                "responses": [MockBaseModel()],
+                "response_metadata": [{"id": "mock_id"}],
+                "attempts": 1,
+            }
+
+        component = StructuredOutputComponent(
+            llm=MockLanguageModel(),
+            input_value="Extract multiple people: John is 30, Jane is 25, Bob is 35",
+            schema_name="PersonSchema",
+            output_schema=[
+                {"name": "name", "type": "str", "description": "Person's name"},
+                {"name": "age", "type": "int", "description": "Person's age"},
+            ],
+            multiple=False,
+            system_prompt="Extract ALL relevant instances that match the schema",
+        )
+
+        with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result):
+            result = component.build_structured_output()
+
+            # Check that result is a Data object
+            from langflow.schema.data import Data
+
+            assert isinstance(result, Data)
+
+            # Check that result.data is a dict with results key
+            assert isinstance(result.data, dict)
+            assert "results" in result.data
+            assert len(result.data["results"]) == 3
+
+            # Check the content of each result
+            assert result.data["results"][0] == {"name": "John", "age": 30}
+            assert result.data["results"][1] == {"name": "Jane", "age": 25}
+            assert result.data["results"][2] == {"name": "Bob", "age": 35}
+
    def test_build_structured_output_returns_data_with_single_item(self):
        """Test that build_structured_output() returns Data object when only one item in objects."""

@ -711,3 +875,157 @@ class TestStructuredOutputComponent(ComponentTestBaseWithoutClient):
                # Data object should be able to represent itself as text
                text_repr = result.get_text()
                assert isinstance(text_repr, str)
+
+    def test_build_structured_dataframe_returns_dataframe_with_single_data(self):
+        """Test that build_structured_dataframe() returns DataFrame object with single Data item."""
+
+        def mock_get_chat_result(runnable, system_message, input_value, config):  # noqa: ARG001
+            class MockBaseModel(BaseModel):
+                def model_dump(self, **__):
+                    return {"objects": [{"field": "value2", "number": 24}]}
+
+            return {
+                "messages": ["mock_message"],
+                "responses": [MockBaseModel()],
+                "response_metadata": [{"id": "mock_id"}],
+                "attempts": 1,
+            }
+
+        component = StructuredOutputComponent(
+            llm=MockLanguageModel(),
+            input_value="Test input",
+            schema_name="TestSchema",
+            output_schema=[
+                {"name": "field", "type": "str", "description": "A test field"},
+                {"name": "number", "type": "int", "description": "A test number"},
+            ],
+            multiple=False,
+            system_prompt="Test system prompt",
+        )
+
+        with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result):
+            result = component.build_structured_dataframe()
+
+            # Check that result is a DataFrame object
+            from langflow.schema.dataframe import DataFrame
+
+            assert isinstance(result, DataFrame)
+            assert len(result) == 1
+            assert result.iloc[0]["field"] == "value2"
+            assert result.iloc[0]["number"] == 24
+
+            # Test conversion back to Data list
+            data_list = result.to_data_list()
+            assert len(data_list) == 1
+            assert data_list[0].data == {"field": "value2", "number": 24}
+
+    def test_build_structured_dataframe_returns_dataframe_with_multiple_data(self):
+        """Test that build_structured_dataframe() returns DataFrame object with multiple Data items."""
+
+        def mock_get_chat_result(runnable, system_message, input_value, config):  # noqa: ARG001
+            class MockBaseModel(BaseModel):
+                def model_dump(self, **__):
+                    return {
+                        "objects": [
+                            {"name": "John", "age": 30},
+                            {"name": "Jane", "age": 25},
+                            {"name": "Bob", "age": 35},
+                        ]
+                    }
+
+            return {
+                "messages": ["mock_message"],
+                "responses": [MockBaseModel()],
+                "response_metadata": [{"id": "mock_id"}],
+                "attempts": 1,
+            }
+
+        component = StructuredOutputComponent(
+            llm=MockLanguageModel(),
+            input_value="Test input with multiple people",
+            schema_name="PersonSchema",
+            output_schema=[
+                {"name": "name", "type": "str", "description": "Person's name"},
+                {"name": "age", "type": "int", "description": "Person's age"},
+            ],
+            multiple=False,
+            system_prompt="Test system prompt",
+        )
+
+        with patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result):
+            result = component.build_structured_dataframe()
+
+            # Check that result is a DataFrame object
+            from langflow.schema.dataframe import DataFrame
+
+            assert isinstance(result, DataFrame)
+            assert len(result) == 3
+            assert result.iloc[0]["name"] == "John"
+            assert result.iloc[0]["age"] == 30
+            assert result.iloc[1]["name"] == "Jane"
+            assert result.iloc[1]["age"] == 25
+            assert result.iloc[2]["name"] == "Bob"
+            assert result.iloc[2]["age"] == 35
+
+            # Test conversion back to Data list
+            data_list = result.to_data_list()
+            assert len(data_list) == 3
+            assert data_list[0].data == {"name": "John", "age": 30}
+            assert data_list[1].data == {"name": "Jane", "age": 25}
+            assert data_list[2].data == {"name": "Bob", "age": 35}
+
+    def test_build_structured_dataframe_fails_when_base_returns_non_list(self):
+        """Test that build_structured_dataframe() fails when base method returns non-list."""
+
+        def mock_get_chat_result(runnable, system_message, input_value, config):  # noqa: ARG001
+            return {
+                "messages": ["mock_message"],
+                "responses": [{"single_item": "value"}],
+                "response_metadata": [{"id": "mock_id"}],
+                "attempts": 1,
+            }
+
+        component = StructuredOutputComponent(
+            llm=MockLanguageModel(),
+            input_value="Test input",
+            schema_name="TestSchema",
+            output_schema=[{"name": "field", "type": "str", "description": "A test field"}],
+            multiple=False,
+            system_prompt="Test system prompt",
+        )
+
+        with (
+            patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result),
+            pytest.raises(ValueError, match="No structured output returned"),
+        ):
+            component.build_structured_dataframe()
+
+    def test_build_structured_dataframe_fails_when_empty_output(self):
+        """Test that build_structured_dataframe() fails when base method returns empty list."""
+
+        def mock_get_chat_result(runnable, system_message, input_value, config):  # noqa: ARG001
+            class MockBaseModel(BaseModel):
+                def model_dump(self, **__):
+                    return {"objects": []}
+
+            return {
+                "messages": ["mock_message"],
+                "responses": [MockBaseModel()],
+                "response_metadata": [{"id": "mock_id"}],
+                "attempts": 1,
+            }
+
+        component = StructuredOutputComponent(
+            llm=MockLanguageModel(),
+            input_value="Test input",
+            schema_name="TestSchema",
+            output_schema=[{"name": "field", "type": "str", "description": "A test field"}],
+            multiple=False,
+            system_prompt="Test system prompt",
+        )
+
+        with (
+            patch("langflow.components.processing.structured_output.get_chat_result", mock_get_chat_result),
+            pytest.raises(ValueError, match="No structured output returned"),
+        ):
+            component.build_structured_dataframe()