From e6ebe7e206aa7417725fad9ef922e503c5e3f4b3 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Wed, 27 Nov 2024 18:14:04 -0300 Subject: [PATCH] feat: Add DataFrameInput and refactor DataSet references (#4898) * refactor: rename DataSet to DataFrame * feat: add DataFrameInput type and update InputTypes definition * feat: add DataFrame type to CUSTOM_COMPONENT_SUPPORTED_TYPES * refactor: replace DataSet with DataFrame in schema and tests --- .../base/langflow/field_typing/constants.py | 2 + src/backend/base/langflow/inputs/inputs.py | 9 ++- src/backend/base/langflow/schema/__init__.py | 4 +- .../schema/{data_set.py => dataframe.py} | 27 +++---- .../tests/unit/schema/test_schema_data_set.py | 72 +++++++++---------- 5 files changed, 61 insertions(+), 53 deletions(-) rename src/backend/base/langflow/schema/{data_set.py => dataframe.py} (73%) diff --git a/src/backend/base/langflow/field_typing/constants.py b/src/backend/base/langflow/field_typing/constants.py index 35bb20a22..3acc0f7ea 100644 --- a/src/backend/base/langflow/field_typing/constants.py +++ b/src/backend/base/langflow/field_typing/constants.py @@ -19,6 +19,7 @@ from langchain_core.vectorstores import VectorStore, VectorStoreRetriever from langchain_text_splitters import TextSplitter from langflow.schema.data import Data +from langflow.schema.dataframe import DataFrame from langflow.schema.message import Message NestedDict: TypeAlias = dict[str, str | dict] @@ -77,4 +78,5 @@ CUSTOM_COMPONENT_SUPPORTED_TYPES = { "Callable": Callable, "LanguageModel": LanguageModel, "Retriever": Retriever, + "DataFrame": DataFrame, } diff --git a/src/backend/base/langflow/inputs/inputs.py b/src/backend/base/langflow/inputs/inputs.py index 0fc9f66f3..31c866389 100644 --- a/src/backend/base/langflow/inputs/inputs.py +++ b/src/backend/base/langflow/inputs/inputs.py @@ -1,6 +1,6 @@ import warnings from collections.abc import AsyncIterator, Iterator -from typing import Any, get_args +from typing import Any, TypeAlias, get_args from pandas import DataFrame from pydantic import Field, field_validator @@ -78,6 +78,10 @@ class DataInput(HandleInput, InputTraceMixin, ListableInputMixin, ToolModeMixin) input_types: list[str] = ["Data"] +class DataFrameInput(HandleInput, InputTraceMixin, ListableInputMixin, ToolModeMixin): + input_types: list[str] = ["DataFrame"] + + class PromptInput(BaseInputMixin, ListableInputMixin, InputTraceMixin, ToolModeMixin): field_type: SerializableFieldTypes = FieldTypes.PROMPT @@ -492,7 +496,7 @@ class DefaultPromptField(Input): value: Any = "" # Set the value to empty string -InputTypes = ( +InputTypes: TypeAlias = ( Input | DefaultPromptField | BoolInput @@ -516,6 +520,7 @@ InputTypes = ( | TableInput | LinkInput | SliderInput + | DataFrameInput ) InputTypesMap: dict[str, type[InputTypes]] = {t.__name__: t for t in get_args(InputTypes)} diff --git a/src/backend/base/langflow/schema/__init__.py b/src/backend/base/langflow/schema/__init__.py index 38fce1ae6..6dbefb2fb 100644 --- a/src/backend/base/langflow/schema/__init__.py +++ b/src/backend/base/langflow/schema/__init__.py @@ -1,6 +1,6 @@ from .data import Data -from .data_set import DataSet +from .dataframe import DataFrame from .dotdict import dotdict from .message import Message -__all__ = ["Data", "dotdict", "Message", "DataSet"] +__all__ = ["Data", "dotdict", "Message", "DataFrame"] diff --git a/src/backend/base/langflow/schema/data_set.py b/src/backend/base/langflow/schema/dataframe.py similarity index 73% rename from src/backend/base/langflow/schema/data_set.py rename to src/backend/base/langflow/schema/dataframe.py index 48c879024..bd027351e 100644 --- a/src/backend/base/langflow/schema/data_set.py +++ b/src/backend/base/langflow/schema/dataframe.py @@ -1,11 +1,12 @@ from typing import cast import pandas as pd +from pandas import DataFrame as pandas_DataFrame from langflow.schema.data import Data -class DataSet(pd.DataFrame): +class DataFrame(pandas_DataFrame): """A pandas DataFrame subclass specialized for handling collections of Data objects. This class extends pandas.DataFrame to provide seamless integration between @@ -22,13 +23,13 @@ class DataSet(pd.DataFrame): Examples: >>> # From Data objects - >>> dataset = DataSet([Data(data={"name": "John"}), Data(data={"name": "Jane"})]) + >>> dataset = DataFrame([Data(data={"name": "John"}), Data(data={"name": "Jane"})]) >>> # From dictionaries - >>> dataset = DataSet([{"name": "John"}, {"name": "Jane"}]) + >>> dataset = DataFrame([{"name": "John"}, {"name": "Jane"}]) >>> # From dictionary of lists - >>> dataset = DataSet({"name": ["John", "Jane"], "age": [30, 25]}) + >>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]}) """ def __init__(self, data: None | list[dict | Data] | dict | pd.DataFrame = None, **kwargs): @@ -49,36 +50,36 @@ class DataSet(pd.DataFrame): super().__init__(**kwargs) def to_data_list(self) -> list[Data]: - """Converts the DataSet back to a list of Data objects.""" + """Converts the DataFrame back to a list of Data objects.""" list_of_dicts = self.to_dict(orient="records") return [Data(data=row) for row in list_of_dicts] - def add_row(self, data: dict | Data) -> "DataSet": + def add_row(self, data: dict | Data) -> "DataFrame": """Adds a single row to the dataset. Args: data: Either a Data object or a dictionary to add as a new row Returns: - DataSet: A new DataSet with the added row + DataFrame: A new DataFrame with the added row Example: - >>> dataset = DataSet([{"name": "John"}]) + >>> dataset = DataFrame([{"name": "John"}]) >>> dataset = dataset.add_row({"name": "Jane"}) """ if isinstance(data, Data): data = data.data new_df = self._constructor([data]) - return cast(DataSet, pd.concat([self, new_df], ignore_index=True)) + return cast(DataFrame, pd.concat([self, new_df], ignore_index=True)) - def add_rows(self, data: list[dict | Data]) -> "DataSet": + def add_rows(self, data: list[dict | Data]) -> "DataFrame": """Adds multiple rows to the dataset. Args: data: List of Data objects or dictionaries to add as new rows Returns: - DataSet: A new DataSet with the added rows + DataFrame: A new DataFrame with the added rows """ processed_data = [] for item in data: @@ -87,11 +88,11 @@ class DataSet(pd.DataFrame): else: processed_data.append(item) new_df = self._constructor(processed_data) - return cast(DataSet, pd.concat([self, new_df], ignore_index=True)) + return cast(DataFrame, pd.concat([self, new_df], ignore_index=True)) @property def _constructor(self): def _c(*args, **kwargs): - return DataSet(*args, **kwargs).__finalize__(self) + return DataFrame(*args, **kwargs).__finalize__(self) return _c diff --git a/src/backend/tests/unit/schema/test_schema_data_set.py b/src/backend/tests/unit/schema/test_schema_data_set.py index bc63e2aa7..68ad47f26 100644 --- a/src/backend/tests/unit/schema/test_schema_data_set.py +++ b/src/backend/tests/unit/schema/test_schema_data_set.py @@ -1,7 +1,7 @@ import pandas as pd import pytest from langflow.schema.data import Data -from langflow.schema.data_set import DataSet +from langflow.schema.dataframe import DataFrame @pytest.fixture @@ -15,17 +15,17 @@ def sample_data_objects() -> list[Data]: @pytest.fixture -def sample_dataset(sample_data_objects) -> DataSet: - """Fixture providing a sample DataSet instance.""" - return DataSet(sample_data_objects) +def sample_dataset(sample_data_objects) -> DataFrame: + """Fixture providing a sample DataFrame instance.""" + return DataFrame(sample_data_objects) def test_from_data_list_basic(): """Test basic functionality of from_data_list.""" data_objects = [Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})] - dataset = DataSet(data_objects) + dataset = DataFrame(data_objects) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert isinstance(dataset, pd.DataFrame) assert len(dataset) == 2 assert list(dataset.columns) == ["name", "age"] @@ -35,8 +35,8 @@ def test_from_data_list_basic(): def test_from_data_list_empty(): """Test from_data_list with empty input.""" - dataset = DataSet([]) - assert isinstance(dataset, DataSet) + dataset = DataFrame([]) + assert isinstance(dataset, DataFrame) assert len(dataset) == 0 @@ -46,9 +46,9 @@ def test_from_data_list_missing_fields(): Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "city": "Boston"}), # Missing age ] - dataset = DataSet(data_objects) + dataset = DataFrame(data_objects) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert set(dataset.columns) == {"name", "age", "city"} assert pd.isna(dataset.iloc[1]["age"]) assert pd.isna(dataset.iloc[0]["city"]) @@ -60,9 +60,9 @@ def test_from_data_list_nested_data(): Data(data={"name": "John", "address": {"city": "New York", "zip": "10001"}}), Data(data={"name": "Jane", "address": {"city": "Boston", "zip": "02108"}}), ] - dataset = DataSet(data_objects) + dataset = DataFrame(data_objects) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert isinstance(dataset["address"][0], dict) assert dataset["address"][0]["city"] == "New York" @@ -82,7 +82,7 @@ def test_to_data_list_basic(sample_dataset, sample_data_objects): def test_to_data_list_empty(): """Test to_data_list with empty DataFrame.""" - empty_dataset = DataSet() + empty_dataset = DataFrame() result = empty_dataset.to_data_list() assert isinstance(result, list) assert len(result) == 0 @@ -103,10 +103,10 @@ def test_to_data_list_modified_data(sample_dataset): def test_dataset_pandas_operations(sample_dataset): - """Test that pandas operations work correctly on DataSet.""" + """Test that pandas operations work correctly on DataFrame.""" # Test filtering filtered = sample_dataset[sample_dataset["age"] > 30] - assert isinstance(filtered, DataSet), f"Expected DataSet, got {type(filtered)}" + assert isinstance(filtered, DataFrame), f"Expected DataFrame, got {type(filtered)}" assert len(filtered) == 1 assert filtered.iloc[0]["name"] == "Bob" @@ -121,9 +121,9 @@ def test_dataset_pandas_operations(sample_dataset): def test_dataset_with_null_values(): - """Test handling of null values in DataSet.""" + """Test handling of null values in DataFrame.""" data_objects = [Data(data={"name": "John", "age": None}), Data(data={"name": None, "age": 25})] - dataset = DataSet(data_objects) + dataset = DataFrame(data_objects) assert pd.isna(dataset.iloc[0]["age"]) assert pd.isna(dataset.iloc[1]["name"]) @@ -148,7 +148,7 @@ def test_dataset_type_preservation(): } ) ] - dataset = DataSet(data_objects) + dataset = DataFrame(data_objects) result = dataset.to_data_list() assert isinstance(result[0].data["int_val"], int) @@ -164,7 +164,7 @@ def test_add_row_with_dict(sample_dataset): new_row = {"name": "Alice", "age": 28, "city": "Seattle"} result = sample_dataset.add_row(new_row) - assert isinstance(result, DataSet) + assert isinstance(result, DataFrame) assert len(result) == len(sample_dataset) + 1 assert result.iloc[-1]["name"] == "Alice" assert result.iloc[-1]["age"] == 28 @@ -176,7 +176,7 @@ def test_add_row_with_data_object(sample_dataset): new_row = Data(data={"name": "Alice", "age": 28, "city": "Seattle"}) result = sample_dataset.add_row(new_row) - assert isinstance(result, DataSet) + assert isinstance(result, DataFrame) assert len(result) == len(sample_dataset) + 1 assert result.iloc[-1]["name"] == "Alice" assert result.iloc[-1]["age"] == 28 @@ -188,7 +188,7 @@ def test_add_rows_with_dicts(sample_dataset): new_rows = [{"name": "Alice", "age": 28, "city": "Seattle"}, {"name": "Charlie", "age": 32, "city": "Portland"}] result = sample_dataset.add_rows(new_rows) - assert isinstance(result, DataSet) + assert isinstance(result, DataFrame) assert len(result) == len(sample_dataset) + 2 assert result.iloc[-2]["name"] == "Alice" assert result.iloc[-1]["name"] == "Charlie" @@ -202,7 +202,7 @@ def test_add_rows_with_data_objects(sample_dataset): ] result = sample_dataset.add_rows(new_rows) - assert isinstance(result, DataSet) + assert isinstance(result, DataFrame) assert len(result) == len(sample_dataset) + 2 assert result.iloc[-2]["name"] == "Alice" assert result.iloc[-1]["name"] == "Charlie" @@ -216,7 +216,7 @@ def test_add_rows_mixed_types(sample_dataset): ] result = sample_dataset.add_rows(new_rows) - assert isinstance(result, DataSet) + assert isinstance(result, DataFrame) assert len(result) == len(sample_dataset) + 2 assert result.iloc[-2]["name"] == "Alice" assert result.iloc[-1]["name"] == "Charlie" @@ -225,9 +225,9 @@ def test_add_rows_mixed_types(sample_dataset): def test_init_with_data_objects(): """Test initialization with Data objects.""" data_objects = [Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})] - dataset = DataSet(data_objects) + dataset = DataFrame(data_objects) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert len(dataset) == 2 assert list(dataset.columns) == ["name", "age"] assert dataset.iloc[0]["name"] == "John" @@ -237,9 +237,9 @@ def test_init_with_data_objects(): def test_init_with_dicts(): """Test initialization with dictionaries.""" data_dicts = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}] - dataset = DataSet(data_dicts) + dataset = DataFrame(data_dicts) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert len(dataset) == 2 assert list(dataset.columns) == ["name", "age"] assert dataset.iloc[0]["name"] == "John" @@ -249,9 +249,9 @@ def test_init_with_dicts(): def test_init_with_dict_of_lists(): """Test initialization with a dictionary of lists.""" data = {"name": ["John", "Jane"], "age": [30, 25]} - dataset = DataSet(data) + dataset = DataFrame(data) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert len(dataset) == 2 assert list(dataset.columns) == ["name", "age"] assert dataset.iloc[0]["name"] == "John" @@ -261,9 +261,9 @@ def test_init_with_dict_of_lists(): def test_init_with_pandas_dataframe(): """Test initialization with a pandas DataFrame.""" test_df = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25]}) - dataset = DataSet(test_df) + dataset = DataFrame(test_df) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert len(dataset) == 2 assert list(dataset.columns) == ["name", "age"] assert dataset.iloc[0]["name"] == "John" @@ -272,8 +272,8 @@ def test_init_with_pandas_dataframe(): def test_init_with_none(): """Test initialization with None.""" - dataset = DataSet(None) - assert isinstance(dataset, DataSet) + dataset = DataFrame(None) + assert isinstance(dataset, DataFrame) assert len(dataset) == 0 @@ -284,15 +284,15 @@ def test_init_with_invalid_list(): Data(data={"name": "Jane", "age": 25}), # Mixed types should fail ] with pytest.raises(ValueError, match="List items must be either all Data objects or all dictionaries"): - DataSet(invalid_data) + DataFrame(invalid_data) def test_init_with_kwargs(): """Test initialization with additional kwargs.""" data = {"name": ["John", "Jane"], "age": [30, 25]} - dataset = DataSet(data=data, index=["a", "b"]) + dataset = DataFrame(data=data, index=["a", "b"]) - assert isinstance(dataset, DataSet) + assert isinstance(dataset, DataFrame) assert len(dataset) == 2 assert list(dataset.index) == ["a", "b"] assert dataset.loc["a"]["name"] == "John"