feat: Add ParseDataFrameComponent for DataFrame-to-text conversion with tests (#5594)

* add dataframe outputs to vector stores, directory, url, split text
* [autofix.ci] apply automated fixes
* [autofix.ci] apply automated fixes (attempt 2/3)
* add parse dataframe
* [autofix.ci] apply automated fixes
* Refactor: Update DataFrame handling in components
  - Added import of DataFrame in directory and url components.
  - Renamed variable 'df' to 'dataframe' in ParseDataFrameComponent for clarity.
  - Updated method _clean_args and parse_data to use 'dataframe' instead of 'df' for consistency.
  These changes enhance code readability and maintainability by standardizing the terminology used for DataFrame objects.
* [autofix.ci] apply automated fixes
* remove parse dataframe
* feat: add parse dataframe component
* [autofix.ci] apply automated fixes
* Refactor: Remove duplicate as_dataframe method in LCVectorStoreComponent
  This commit eliminates the redundant as_dataframe method in the LCVectorStoreComponent class, streamlining the code and improving maintainability. The method was previously defined twice, and this change enhances clarity by ensuring only one implementation exists.
* [autofix.ci] apply automated fixes
* Refactor: Standardize DataFrame variable naming in ParseDataFrameComponent
  This commit renames the variable 'df' to 'dataframe' in the ParseDataFrameComponent class to improve clarity and consistency. The changes are reflected in the _clean_args and parse_data methods, enhancing code readability and maintainability.
* test: add unit tests for ParseDataFrameComponent
  This commit introduces a comprehensive suite of unit tests for the ParseDataFrameComponent, covering various scenarios including successful parsing with default and custom templates, handling of empty dataframes, invalid template keys, and performance on large dataframes. The tests ensure that the component behaves correctly with different data types and separators, and validate its functionality in both synchronous and asynchronous contexts. These additions enhance the reliability and maintainability of the component.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
2025-01-17 20:41:42 -03:00 · 2025-01-17 20:41:42 -03:00 · 8d902e6c74
commit 8d902e6c74
parent aa5a4a505c
3 changed files with 210 additions and 0 deletions
--- a/src/backend/base/langflow/components/processing/__init__.py
+++ b/src/backend/base/langflow/components/processing/__init__.py
@@ -24,6 +24,7 @@ __all__ = [
    "MergeDataComponent",
    "MessageToDataComponent",
    "ParseDataComponent",
+    "ParseDataFrameComponent",
    "ParseJSONDataComponent",
    "SelectDataComponent",
    "SplitTextComponent",
--- a/src/backend/base/langflow/components/processing/parse_dataframe.py
+++ b/src/backend/base/langflow/components/processing/parse_dataframe.py
@ -0,0 +1,67 @@
+from langflow.custom import Component
+from langflow.io import DataFrameInput, MultilineInput, Output, StrInput
+from langflow.schema.message import Message
+
+
+class ParseDataFrameComponent(Component):
+    display_name = "Parse DataFrame"
+    description = (
+        "Convert a DataFrame into plain text following a specified template. "
+        "Each column in the DataFrame is treated as a possible template key, e.g. {col_name}."
+    )
+    icon = "braces"
+    name = "ParseDataFrame"
+
+    inputs = [
+        DataFrameInput(name="df", display_name="DataFrame", info="The DataFrame to convert to text rows."),
+        MultilineInput(
+            name="template",
+            display_name="Template",
+            info=(
+                "The template for formatting each row. "
+                "Use placeholders matching column names in the DataFrame, for example '{col1}', '{col2}'."
+            ),
+            value="{text}",
+        ),
+        StrInput(
+            name="sep",
+            display_name="Separator",
+            advanced=True,
+            value="\n",
+            info="String that joins all row texts when building the single Text output.",
+        ),
+    ]
+
+    outputs = [
+        Output(
+            display_name="Text",
+            name="text",
+            info="All rows combined into a single text, each row formatted by the template and separated by `sep`.",
+            method="parse_data",
+        ),
+    ]
+
+    def _clean_args(self):
+        dataframe = self.df
+        template = self.template or "{text}"
+        sep = self.sep or "\n"
+        return dataframe, template, sep
+
+    def parse_data(self) -> Message:
+        """Converts each row of the DataFrame into a formatted string using the template.
+
+        then joins them with `sep`. Returns a single combined string as a Message.
+        """
+        dataframe, template, sep = self._clean_args()
+
+        lines = []
+        # For each row in the DataFrame, build a dict and format
+        for _, row in dataframe.iterrows():
+            row_dict = row.to_dict()
+            text_line = template.format(**row_dict)  # e.g. template="{text}", row_dict={"text": "Hello"}
+            lines.append(text_line)
+
+        # Join all lines with the provided separator
+        result_string = sep.join(lines)
+        self.status = result_string  # store in self.status for UI logs
+        return Message(text=result_string)
--- a/src/backend/tests/unit/components/processing/test_parse_dataframe_component.py
+++ b/src/backend/tests/unit/components/processing/test_parse_dataframe_component.py
@ -0,0 +1,142 @@
+import asyncio
+
+import pandas as pd
+import pytest
+from langflow.components.processing.parse_dataframe import ParseDataFrameComponent
+from langflow.schema import DataFrame
+from langflow.schema.message import Message
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestParseDataFrameComponent(ComponentTestBaseWithoutClient):
+    @pytest.fixture
+    def component_class(self):
+        """Return the component class to test."""
+        return ParseDataFrameComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        """Return the default kwargs for the component."""
+        return {"df": DataFrame({"text": ["Hello"]}), "template": "{text}", "sep": "\n"}
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        """Return an empty list since this component doesn't have version-specific files."""
+        return []
+
+    def test_successful_parse_with_default_template(self):
+        # Create test data
+        test_df = DataFrame({"text": ["Hello", "World", "Test"]})
+
+        component = ParseDataFrameComponent(df=test_df, template="{text}", sep="\n")
+
+        # Run the parse process
+        result = component.parse_data()
+
+        # Verify the results
+        assert isinstance(result, Message)
+        assert result.text == "Hello\nWorld\nTest"
+        assert component.status == "Hello\nWorld\nTest"
+
+    def test_parse_with_custom_template(self):
+        test_df = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
+
+        component = ParseDataFrameComponent(df=test_df, template="Name: {name}, Age: {age}", sep=" | ")
+
+        result = component.parse_data()
+
+        assert isinstance(result, Message)
+        assert result.text == "Name: John, Age: 30 | Name: Jane, Age: 25"
+
+    def test_parse_with_custom_separator(self):
+        test_df = DataFrame({"text": ["Hello", "World"]})
+
+        component = ParseDataFrameComponent(df=test_df, template="{text}", sep=" --- ")
+
+        result = component.parse_data()
+
+        assert isinstance(result, Message)
+        assert result.text == "Hello --- World"
+
+    def test_empty_dataframe(self):
+        component = ParseDataFrameComponent(df=DataFrame({"text": []}), template="{text}", sep="\n")
+
+        result = component.parse_data()
+        assert isinstance(result, Message)
+        assert result.text == ""
+
+    def test_invalid_template_keys(self):
+        component = ParseDataFrameComponent(
+            df=DataFrame({"text": ["Hello"]}), template="{nonexistent_column}", sep="\n"
+        )
+
+        with pytest.raises(KeyError):
+            component.parse_data()
+
+    def test_multiple_column_template(self):
+        test_df = DataFrame({"col1": ["A", "B"], "col2": [1, 2], "col3": ["X", "Y"]})
+
+        component = ParseDataFrameComponent(df=test_df, template="{col1}-{col2}-{col3}", sep=", ")
+
+        result = component.parse_data()
+        assert isinstance(result, Message)
+        assert result.text == "A-1-X, B-2-Y"
+
+    @pytest.mark.asyncio
+    async def test_async_invocation(self, component_class, default_kwargs):
+        """Verify that ParseDataFrameComponent can be called in an async context."""
+        component = component_class(**default_kwargs)
+        # Use asyncio.to_thread to invoke the parse_data method in a thread pool
+        result = await asyncio.to_thread(component.parse_data)
+        assert isinstance(result, Message)
+
+    def test_various_data_types(self, component_class):
+        """Test that the component correctly formats differing data types."""
+        test_dataframe = DataFrame(
+            {
+                "string_col": ["A", "B"],
+                "int_col": [1, 2],
+                "bool_col": [True, False],
+                "time_col": pd.to_datetime(["2023-01-01", "2023-01-02"]),
+            }
+        )
+        template = "{string_col}-{int_col}-{bool_col}-{time_col}"
+        component = component_class(df=test_dataframe, template=template, sep=" | ")
+        result = component.parse_data()
+        assert isinstance(result, Message)
+        # Just check that all columns are present in the text
+        assert "A-1-True-2023-01-01" in result.text
+
+    def test_nan_values(self, component_class):
+        """Test how the component handles missing/NaN values in the DataFrame."""
+        test_dataframe = DataFrame(
+            {
+                "col1": ["Hello", None],
+                "col2": [10, float("nan")],
+            }
+        )
+        template = "{col1}-{col2}"
+        component = component_class(df=test_dataframe, template=template, sep="\n")
+        result = component.parse_data()
+        # Expect None or NaN to be converted to the string "None" or "nan"
+        # depending on Python's behavior
+        assert isinstance(result, Message)
+        # The exact representation can depend on how pandas handles None/NaN.
+        # Typically, None -> 'None' and NaN -> 'nan'.
+        # You can refine these assertions if you have a custom conversion.
+        assert "Hello-10" in result.text
+
+    def test_large_dataframe(self, component_class):
+        """Test performance and correctness on a relatively large DataFrame."""
+        data = {
+            "col": [f"Row{i}" for i in range(10000)],  # 10k rows
+        }
+        large_dataframe = DataFrame(data)
+        component = component_class(df=large_dataframe, template="{col}", sep=", ")
+        result = component.parse_data()
+        assert isinstance(result, Message)
+        # Check the length of the result isn't zero, ensuring it didn't fail
+        assert len(result.text) > 0
+        # Optionally, you can assert the result includes a substring from the middle
+        assert "Row5000" in result.text