feat: Add DataFrameInput and refactor DataSet references (#4898)

* refactor: rename DataSet to DataFrame

* feat: add DataFrameInput type and update InputTypes definition

* feat: add DataFrame type to CUSTOM_COMPONENT_SUPPORTED_TYPES

* refactor: replace DataSet with DataFrame in schema and tests
This commit is contained in:
Gabriel Luiz Freitas Almeida 2024-11-27 18:14:04 -03:00 committed by GitHub
commit e6ebe7e206
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 61 additions and 53 deletions

View file

@ -19,6 +19,7 @@ from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
from langchain_text_splitters import TextSplitter
from langflow.schema.data import Data
from langflow.schema.dataframe import DataFrame
from langflow.schema.message import Message
NestedDict: TypeAlias = dict[str, str | dict]
@ -77,4 +78,5 @@ CUSTOM_COMPONENT_SUPPORTED_TYPES = {
"Callable": Callable,
"LanguageModel": LanguageModel,
"Retriever": Retriever,
"DataFrame": DataFrame,
}

View file

@ -1,6 +1,6 @@
import warnings
from collections.abc import AsyncIterator, Iterator
from typing import Any, get_args
from typing import Any, TypeAlias, get_args
from pandas import DataFrame
from pydantic import Field, field_validator
@ -78,6 +78,10 @@ class DataInput(HandleInput, InputTraceMixin, ListableInputMixin, ToolModeMixin)
input_types: list[str] = ["Data"]
class DataFrameInput(HandleInput, InputTraceMixin, ListableInputMixin, ToolModeMixin):
    """Handle-based input whose declared input type is ``"DataFrame"``.

    Uses the same base classes as ``DataInput`` and differs only in the
    ``input_types`` default declared below.
    """

    # Type names this input declares; presumably matched against the output
    # type of a connected component -- confirm against HandleInput.
    input_types: list[str] = ["DataFrame"]
class PromptInput(BaseInputMixin, ListableInputMixin, InputTraceMixin, ToolModeMixin):
field_type: SerializableFieldTypes = FieldTypes.PROMPT
@ -492,7 +496,7 @@ class DefaultPromptField(Input):
value: Any = "" # Set the value to empty string
InputTypes = (
InputTypes: TypeAlias = (
Input
| DefaultPromptField
| BoolInput
@ -516,6 +520,7 @@ InputTypes = (
| TableInput
| LinkInput
| SliderInput
| DataFrameInput
)
InputTypesMap: dict[str, type[InputTypes]] = {t.__name__: t for t in get_args(InputTypes)}

View file

@ -1,6 +1,6 @@
from .data import Data
from .data_set import DataSet
from .dataframe import DataFrame
from .dotdict import dotdict
from .message import Message
__all__ = ["Data", "dotdict", "Message", "DataSet"]
__all__ = ["Data", "dotdict", "Message", "DataFrame"]

View file

@ -1,11 +1,12 @@
from typing import cast
import pandas as pd
from pandas import DataFrame as pandas_DataFrame
from langflow.schema.data import Data
class DataSet(pd.DataFrame):
class DataFrame(pandas_DataFrame):
"""A pandas DataFrame subclass specialized for handling collections of Data objects.
This class extends pandas.DataFrame to provide seamless integration between
@ -22,13 +23,13 @@ class DataSet(pd.DataFrame):
Examples:
>>> # From Data objects
>>> dataset = DataSet([Data(data={"name": "John"}), Data(data={"name": "Jane"})])
>>> dataset = DataFrame([Data(data={"name": "John"}), Data(data={"name": "Jane"})])
>>> # From dictionaries
>>> dataset = DataSet([{"name": "John"}, {"name": "Jane"}])
>>> dataset = DataFrame([{"name": "John"}, {"name": "Jane"}])
>>> # From dictionary of lists
>>> dataset = DataSet({"name": ["John", "Jane"], "age": [30, 25]})
>>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
"""
def __init__(self, data: None | list[dict | Data] | dict | pd.DataFrame = None, **kwargs):
@ -49,36 +50,36 @@ class DataSet(pd.DataFrame):
super().__init__(**kwargs)
def to_data_list(self) -> list[Data]:
"""Converts the DataSet back to a list of Data objects."""
"""Converts the DataFrame back to a list of Data objects."""
list_of_dicts = self.to_dict(orient="records")
return [Data(data=row) for row in list_of_dicts]
def add_row(self, data: dict | Data) -> "DataSet":
def add_row(self, data: dict | Data) -> "DataFrame":
"""Adds a single row to the dataset.
Args:
data: Either a Data object or a dictionary to add as a new row
Returns:
DataSet: A new DataSet with the added row
DataFrame: A new DataFrame with the added row
Example:
>>> dataset = DataSet([{"name": "John"}])
>>> dataset = DataFrame([{"name": "John"}])
>>> dataset = dataset.add_row({"name": "Jane"})
"""
if isinstance(data, Data):
data = data.data
new_df = self._constructor([data])
return cast(DataSet, pd.concat([self, new_df], ignore_index=True))
return cast(DataFrame, pd.concat([self, new_df], ignore_index=True))
def add_rows(self, data: list[dict | Data]) -> "DataSet":
def add_rows(self, data: list[dict | Data]) -> "DataFrame":
"""Adds multiple rows to the dataset.
Args:
data: List of Data objects or dictionaries to add as new rows
Returns:
DataSet: A new DataSet with the added rows
DataFrame: A new DataFrame with the added rows
"""
processed_data = []
for item in data:
@ -87,11 +88,11 @@ class DataSet(pd.DataFrame):
else:
processed_data.append(item)
new_df = self._constructor(processed_data)
return cast(DataSet, pd.concat([self, new_df], ignore_index=True))
return cast(DataFrame, pd.concat([self, new_df], ignore_index=True))
@property
def _constructor(self):
def _c(*args, **kwargs):
return DataSet(*args, **kwargs).__finalize__(self)
return DataFrame(*args, **kwargs).__finalize__(self)
return _c

View file

@ -1,7 +1,7 @@
import pandas as pd
import pytest
from langflow.schema.data import Data
from langflow.schema.data_set import DataSet
from langflow.schema.dataframe import DataFrame
@pytest.fixture
@ -15,17 +15,17 @@ def sample_data_objects() -> list[Data]:
@pytest.fixture
def sample_dataset(sample_data_objects) -> DataSet:
"""Fixture providing a sample DataSet instance."""
return DataSet(sample_data_objects)
def sample_dataset(sample_data_objects) -> DataFrame:
"""Fixture providing a sample DataFrame instance."""
return DataFrame(sample_data_objects)
def test_from_data_list_basic():
"""Test basic functionality of from_data_list."""
data_objects = [Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})]
dataset = DataSet(data_objects)
dataset = DataFrame(data_objects)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert isinstance(dataset, pd.DataFrame)
assert len(dataset) == 2
assert list(dataset.columns) == ["name", "age"]
@ -35,8 +35,8 @@ def test_from_data_list_basic():
def test_from_data_list_empty():
"""Test from_data_list with empty input."""
dataset = DataSet([])
assert isinstance(dataset, DataSet)
dataset = DataFrame([])
assert isinstance(dataset, DataFrame)
assert len(dataset) == 0
@ -46,9 +46,9 @@ def test_from_data_list_missing_fields():
Data(data={"name": "John", "age": 30}),
Data(data={"name": "Jane", "city": "Boston"}), # Missing age
]
dataset = DataSet(data_objects)
dataset = DataFrame(data_objects)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert set(dataset.columns) == {"name", "age", "city"}
assert pd.isna(dataset.iloc[1]["age"])
assert pd.isna(dataset.iloc[0]["city"])
@ -60,9 +60,9 @@ def test_from_data_list_nested_data():
Data(data={"name": "John", "address": {"city": "New York", "zip": "10001"}}),
Data(data={"name": "Jane", "address": {"city": "Boston", "zip": "02108"}}),
]
dataset = DataSet(data_objects)
dataset = DataFrame(data_objects)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert isinstance(dataset["address"][0], dict)
assert dataset["address"][0]["city"] == "New York"
@ -82,7 +82,7 @@ def test_to_data_list_basic(sample_dataset, sample_data_objects):
def test_to_data_list_empty():
"""Test to_data_list with empty DataFrame."""
empty_dataset = DataSet()
empty_dataset = DataFrame()
result = empty_dataset.to_data_list()
assert isinstance(result, list)
assert len(result) == 0
@ -103,10 +103,10 @@ def test_to_data_list_modified_data(sample_dataset):
def test_dataset_pandas_operations(sample_dataset):
"""Test that pandas operations work correctly on DataSet."""
"""Test that pandas operations work correctly on DataFrame."""
# Test filtering
filtered = sample_dataset[sample_dataset["age"] > 30]
assert isinstance(filtered, DataSet), f"Expected DataSet, got {type(filtered)}"
assert isinstance(filtered, DataFrame), f"Expected DataFrame, got {type(filtered)}"
assert len(filtered) == 1
assert filtered.iloc[0]["name"] == "Bob"
@ -121,9 +121,9 @@ def test_dataset_pandas_operations(sample_dataset):
def test_dataset_with_null_values():
"""Test handling of null values in DataSet."""
"""Test handling of null values in DataFrame."""
data_objects = [Data(data={"name": "John", "age": None}), Data(data={"name": None, "age": 25})]
dataset = DataSet(data_objects)
dataset = DataFrame(data_objects)
assert pd.isna(dataset.iloc[0]["age"])
assert pd.isna(dataset.iloc[1]["name"])
@ -148,7 +148,7 @@ def test_dataset_type_preservation():
}
)
]
dataset = DataSet(data_objects)
dataset = DataFrame(data_objects)
result = dataset.to_data_list()
assert isinstance(result[0].data["int_val"], int)
@ -164,7 +164,7 @@ def test_add_row_with_dict(sample_dataset):
new_row = {"name": "Alice", "age": 28, "city": "Seattle"}
result = sample_dataset.add_row(new_row)
assert isinstance(result, DataSet)
assert isinstance(result, DataFrame)
assert len(result) == len(sample_dataset) + 1
assert result.iloc[-1]["name"] == "Alice"
assert result.iloc[-1]["age"] == 28
@ -176,7 +176,7 @@ def test_add_row_with_data_object(sample_dataset):
new_row = Data(data={"name": "Alice", "age": 28, "city": "Seattle"})
result = sample_dataset.add_row(new_row)
assert isinstance(result, DataSet)
assert isinstance(result, DataFrame)
assert len(result) == len(sample_dataset) + 1
assert result.iloc[-1]["name"] == "Alice"
assert result.iloc[-1]["age"] == 28
@ -188,7 +188,7 @@ def test_add_rows_with_dicts(sample_dataset):
new_rows = [{"name": "Alice", "age": 28, "city": "Seattle"}, {"name": "Charlie", "age": 32, "city": "Portland"}]
result = sample_dataset.add_rows(new_rows)
assert isinstance(result, DataSet)
assert isinstance(result, DataFrame)
assert len(result) == len(sample_dataset) + 2
assert result.iloc[-2]["name"] == "Alice"
assert result.iloc[-1]["name"] == "Charlie"
@ -202,7 +202,7 @@ def test_add_rows_with_data_objects(sample_dataset):
]
result = sample_dataset.add_rows(new_rows)
assert isinstance(result, DataSet)
assert isinstance(result, DataFrame)
assert len(result) == len(sample_dataset) + 2
assert result.iloc[-2]["name"] == "Alice"
assert result.iloc[-1]["name"] == "Charlie"
@ -216,7 +216,7 @@ def test_add_rows_mixed_types(sample_dataset):
]
result = sample_dataset.add_rows(new_rows)
assert isinstance(result, DataSet)
assert isinstance(result, DataFrame)
assert len(result) == len(sample_dataset) + 2
assert result.iloc[-2]["name"] == "Alice"
assert result.iloc[-1]["name"] == "Charlie"
@ -225,9 +225,9 @@ def test_add_rows_mixed_types(sample_dataset):
def test_init_with_data_objects():
"""Test initialization with Data objects."""
data_objects = [Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})]
dataset = DataSet(data_objects)
dataset = DataFrame(data_objects)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert len(dataset) == 2
assert list(dataset.columns) == ["name", "age"]
assert dataset.iloc[0]["name"] == "John"
@ -237,9 +237,9 @@ def test_init_with_data_objects():
def test_init_with_dicts():
"""Test initialization with dictionaries."""
data_dicts = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}]
dataset = DataSet(data_dicts)
dataset = DataFrame(data_dicts)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert len(dataset) == 2
assert list(dataset.columns) == ["name", "age"]
assert dataset.iloc[0]["name"] == "John"
@ -249,9 +249,9 @@ def test_init_with_dicts():
def test_init_with_dict_of_lists():
"""Test initialization with a dictionary of lists."""
data = {"name": ["John", "Jane"], "age": [30, 25]}
dataset = DataSet(data)
dataset = DataFrame(data)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert len(dataset) == 2
assert list(dataset.columns) == ["name", "age"]
assert dataset.iloc[0]["name"] == "John"
@ -261,9 +261,9 @@ def test_init_with_dict_of_lists():
def test_init_with_pandas_dataframe():
"""Test initialization with a pandas DataFrame."""
test_df = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
dataset = DataSet(test_df)
dataset = DataFrame(test_df)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert len(dataset) == 2
assert list(dataset.columns) == ["name", "age"]
assert dataset.iloc[0]["name"] == "John"
@ -272,8 +272,8 @@ def test_init_with_pandas_dataframe():
def test_init_with_none():
"""Test initialization with None."""
dataset = DataSet(None)
assert isinstance(dataset, DataSet)
dataset = DataFrame(None)
assert isinstance(dataset, DataFrame)
assert len(dataset) == 0
@ -284,15 +284,15 @@ def test_init_with_invalid_list():
Data(data={"name": "Jane", "age": 25}), # Mixed types should fail
]
with pytest.raises(ValueError, match="List items must be either all Data objects or all dictionaries"):
DataSet(invalid_data)
DataFrame(invalid_data)
def test_init_with_kwargs():
"""Test initialization with additional kwargs."""
data = {"name": ["John", "Jane"], "age": [30, 25]}
dataset = DataSet(data=data, index=["a", "b"])
dataset = DataFrame(data=data, index=["a", "b"])
assert isinstance(dataset, DataSet)
assert isinstance(dataset, DataFrame)
assert len(dataset) == 2
assert list(dataset.index) == ["a", "b"]
assert dataset.loc["a"]["name"] == "John"