From f2fbcfa5790f23ffa3f114fc039c35a06f674f80 Mon Sep 17 00:00:00 2001
From: Cristhian Zanforlin Lousa
Date: Mon, 17 Feb 2025 10:22:01 -0300
Subject: [PATCH] feat: add DataToDataFrame component for converting Data objects (#6112)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* ✨ (data_to_dataframe.py): add a new component to convert Data objects into a DataFrame for easier data manipulation and analysis.

* [autofix.ci] apply automated fixes

* 📝 (data_to_dataframe.py): improve documentation for the build_dataframe method to explain the process of building a DataFrame from Data objects

* ✨ (test_data_to_dataframe.py): Add unit tests for DataToDataFrameComponent to ensure proper construction of DataFrame from Data objects with various fields and configurations.

* ✨ (test_data_to_dataframe.py): Refactor test_data_to_dataframe.py to use pandas module instead of turtle for DataFrame operations
♻️ (test_data_to_dataframe.py): Refactor test_data_to_dataframe.py to improve readability and consistency in DataFrame testing assertions

* [autofix.ci] apply automated fixes

* 🔧 (test_data_to_dataframe.py): improve variable naming for better readability and consistency in test cases

* [autofix.ci] apply automated fixes

* ✨ (test_data_to_dataframe_component.py): Add unit tests for DataToDataFrameComponent to ensure correct behavior and functionality.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
 .../processing/data_to_dataframe.py           |  72 ++++++++++
 .../test_data_to_dataframe_component.py       | 128 ++++++++++++++++++
 2 files changed, 200 insertions(+)
 create mode 100644 src/backend/base/langflow/components/processing/data_to_dataframe.py
 create mode 100644 src/backend/tests/unit/components/processing/test_data_to_dataframe_component.py

diff --git a/src/backend/base/langflow/components/processing/data_to_dataframe.py b/src/backend/base/langflow/components/processing/data_to_dataframe.py
new file mode 100644
index 000000000..9cd8e4776
--- /dev/null
+++ b/src/backend/base/langflow/components/processing/data_to_dataframe.py
@@ -0,0 +1,72 @@
+from langflow.custom import Component
+from langflow.io import DataInput, Output
+from langflow.schema import Data, DataFrame
+
+
+class DataToDataFrameComponent(Component):
+    display_name = "Data → DataFrame"
+    description = (
+        "Converts one or multiple Data objects into a DataFrame. "
+        "Each Data object corresponds to one row. Fields from `.data` become columns, "
+        "and the `.text` (if present) is placed in a 'text' column."
+    )
+    icon = "table"
+    name = "DataToDataFrame"
+
+    inputs = [
+        DataInput(
+            name="data_list",
+            display_name="Data or Data List",
+            info="One or multiple Data objects to transform into a DataFrame.",
+            is_list=True,
+        ),
+    ]
+
+    outputs = [
+        Output(
+            display_name="DataFrame",
+            name="dataframe",
+            method="build_dataframe",
+            info="A DataFrame built from each Data object's fields plus a 'text' column.",
+        ),
+    ]
+
+    def build_dataframe(self) -> DataFrame:
+        """Build a DataFrame with one row per input Data object.
+
+        For each Data object:
+          - The entries of item.data (dictionary) become the row's columns.
+          - If item.get_text() returns a truthy value, it is stored in the 'text' column.
+
+        Returns:
+            DataFrame: One row per Data object, in input order.
+
+        Raises:
+            TypeError: If any input item is not a Data instance.
+        """
+        data_input = self.data_list
+
+        # If user passed a single Data, it might come in as a single object rather than a list
+        if not isinstance(data_input, list):
+            data_input = [data_input]
+
+        rows = []
+        for item in data_input:
+            if not isinstance(item, Data):
+                msg = f"Expected Data objects, got {type(item)} instead."
+                raise TypeError(msg)
+
+            # Start with a copy of item.data or an empty dict
+            row_dict = dict(item.data) if item.data else {}
+
+            # If the Data object has text, store it under 'text' col
+            text_val = item.get_text()
+            if text_val:
+                row_dict["text"] = text_val
+
+            rows.append(row_dict)
+
+        # Build a DataFrame from these row dictionaries
+        df_result = DataFrame(rows)
+        self.status = df_result  # store in self.status for logs
+        return df_result
diff --git a/src/backend/tests/unit/components/processing/test_data_to_dataframe_component.py b/src/backend/tests/unit/components/processing/test_data_to_dataframe_component.py
new file mode 100644
index 000000000..196ded59b
--- /dev/null
+++ b/src/backend/tests/unit/components/processing/test_data_to_dataframe_component.py
@@ -0,0 +1,128 @@
+import pytest
+from langflow.components.processing.data_to_dataframe import DataToDataFrameComponent
+from langflow.schema import Data, DataFrame
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestDataToDataFrameComponent(ComponentTestBaseWithoutClient):
+    """Unit tests for DataToDataFrameComponent.build_dataframe."""
+
+    @pytest.fixture
+    def component_class(self):
+        """Return the component class to test."""
+        return DataToDataFrameComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        """Return the default kwargs for the component."""
+        return {
+            "data_list": [
+                Data(text="Row 1", data={"field1": "value1", "field2": 1}),
+                Data(text="Row 2", data={"field1": "value2", "field2": 2}),
+            ]
+        }
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        """Return the file names mapping for different versions."""
+        # This is a new component, so we return an empty list
+        return []
+
+    def test_basic_setup(self, component_class, default_kwargs):
+        """Test basic component initialization."""
+        component = component_class()
+        component.set_attributes(default_kwargs)
+        assert component.data_list == default_kwargs["data_list"]
+
+    def test_build_dataframe_basic(self, component_class, default_kwargs):
+        """Test basic DataFrame construction."""
+        component = component_class()
+        component.set_attributes(default_kwargs)
+        result_df = component.build_dataframe()
+
+        assert isinstance(result_df, DataFrame)
+        assert len(result_df) == 2
+        assert list(result_df.columns) == ["field1", "field2", "text"]
+        assert result_df["text"].tolist() == ["Row 1", "Row 2"]
+        assert result_df["field1"].tolist() == ["value1", "value2"]
+        assert result_df["field2"].tolist() == [1, 2]
+
+    def test_single_data_input(self, component_class):
+        """Test handling single Data object input."""
+        single_data = Data(text="Single Row", data={"field1": "value"})
+        component = component_class()
+        component.set_attributes({"data_list": single_data})
+
+        result_df = component.build_dataframe()
+
+        assert len(result_df) == 1
+        assert result_df["text"].iloc[0] == "Single Row"
+        assert result_df["field1"].iloc[0] == "value"
+
+    def test_empty_data_list(self, component_class):
+        """Test behavior with empty data list."""
+        component = component_class()
+        component.set_attributes({"data_list": []})
+
+        result_df = component.build_dataframe()
+
+        assert len(result_df) == 0
+
+    def test_data_without_text(self, component_class):
+        """Test handling Data objects without text field."""
+        data_without_text = [Data(data={"field1": "value1"}), Data(data={"field1": "value2"})]
+        component = component_class()
+        component.set_attributes({"data_list": data_without_text})
+
+        result_df = component.build_dataframe()
+
+        assert len(result_df) == 2
+        assert "text" not in result_df.columns
+        assert result_df["field1"].tolist() == ["value1", "value2"]
+
+    def test_data_without_data_dict(self, component_class):
+        """Test handling Data objects without data dictionary."""
+        data_without_dict = [Data(text="Text 1"), Data(text="Text 2")]
+        component = component_class()
+        component.set_attributes({"data_list": data_without_dict})
+
+        result_df = component.build_dataframe()
+
+        assert len(result_df) == 2
+        assert list(result_df.columns) == ["text"]
+        assert result_df["text"].tolist() == ["Text 1", "Text 2"]
+
+    def test_mixed_data_fields(self, component_class):
+        """Test handling Data objects with different fields."""
+        mixed_data = [
+            Data(text="Row 1", data={"field1": "value1", "field2": 1}),
+            Data(text="Row 2", data={"field1": "value2", "field3": "extra"}),
+        ]
+        component = component_class()
+        component.set_attributes({"data_list": mixed_data})
+
+        result_df = component.build_dataframe()
+
+        assert len(result_df) == 2
+        assert set(result_df.columns) == {"field1", "field2", "field3", "text"}
+        assert result_df["field1"].tolist() == ["value1", "value2"]
+        assert result_df["field2"].isna().tolist() == [False, True]  # Row 2 has no field2 -> NaN
+        assert result_df["field3"].isna().tolist() == [True, False]  # Row 1 has no field3 -> NaN
+
+    def test_invalid_input_type(self, component_class):
+        """Test error handling for invalid input types."""
+        invalid_data = [{"not": "a Data object"}]
+        component = component_class()
+        component.set_attributes({"data_list": invalid_data})
+
+        with pytest.raises(TypeError, match="Expected Data objects"):
+            component.build_dataframe()
+
+    def test_status_update(self, component_class, default_kwargs):
+        """Test that status is properly updated."""
+        component = component_class()
+        component.set_attributes(default_kwargs)
+        result = component.build_dataframe()
+
+        assert component.status is result  # Status should be set to the DataFrame