feat: add easier initialization to DataSet (#4864)

* feat: enhance DataSet class with improved constructor and methods for better data handling

- Added custom constructor to support various input formats including lists of Data objects, dictionaries, and existing DataFrames.
- Introduced methods `add_row` and `add_rows` for adding single or multiple rows to the DataSet.
- Updated docstrings and examples for clarity and usability.
- Ensured compatibility with pandas DataFrame operations while preserving Data object structures.

* test: add comprehensive tests for DataSet initialization and row operations

* feat: add DataSet class to schema module

* refactor: simplify DataSet initialization and improve data validation
This commit is contained in:
Gabriel Luiz Freitas Almeida 2024-11-26 21:01:08 -03:00 committed by GitHub
commit 7e88a4760b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 214 additions and 79 deletions

View file

@ -1,5 +1,6 @@
from .data import Data
from .data_set import DataSet
from .dotdict import dotdict
from .message import Message
__all__ = ["Data", "dotdict", "Message"]
__all__ = ["Data", "dotdict", "Message", "DataSet"]

View file

@ -1,3 +1,5 @@
from typing import cast
import pandas as pd
from langflow.schema.data import Data
@ -9,91 +11,83 @@ class DataSet(pd.DataFrame):
This class extends pandas.DataFrame to provide seamless integration between
Langflow's Data objects and pandas' powerful data manipulation capabilities.
Key Features:
- Direct initialization from a list of Data objects
- Maintains all pandas DataFrame functionality
- Conversion back to Data objects when needed
Notes:
- Nested dictionaries within Data objects are preserved in their column representation
- All pandas DataFrame operations (groupby, merge, concat, etc.) remain available
- Column dtypes are inferred from the Data objects' contents
Args:
data: Input data in various formats:
- List[Data]: List of Data objects
- List[Dict]: List of dictionaries
- Dict: Dictionary of arrays/lists
- pandas.DataFrame: Existing DataFrame
- Any format supported by pandas.DataFrame
**kwargs: Additional arguments passed to pandas.DataFrame constructor
Examples:
>>> data_objects = [
... Data(data={"name": "John", "age": 30}),
... Data(data={"name": "Jane", "age": 25})
... ]
>>> dataset = DataSet.from_data_list(data_objects)
>>> dataset['age'].mean()
27.5
>>> original_data = dataset.to_data_list()
>>> # From Data objects
>>> dataset = DataSet([Data(data={"name": "John"}), Data(data={"name": "Jane"})])
Inheritance:
This class inherits all functionality from pandas.DataFrame, meaning any
operation that works on a DataFrame will work on a DataSet:
- Filtering: dataset[dataset['age'] > 25]
- Aggregation: dataset.groupby('category').mean()
- Statistical operations: dataset.describe()
- etc.
>>> # From dictionaries
>>> dataset = DataSet([{"name": "John"}, {"name": "Jane"}])
>>> # From dictionary of lists
>>> dataset = DataSet({"name": ["John", "Jane"], "age": [30, 25]})
"""
@classmethod
def from_data_list(cls, data_list: list[Data]) -> "DataSet":
"""Creates a DataSet from a list of Data objects.
def __init__(self, data: None | list[dict | Data] | dict | pd.DataFrame = None, **kwargs):
if data is None:
super().__init__(**kwargs)
return
This method converts a list of Data objects into a DataFrame structure,
preserving all data from the original Data objects.
if isinstance(data, list):
if all(isinstance(x, Data) for x in data):
data = [d.data for d in data if hasattr(d, "data")]
elif not all(isinstance(x, dict) for x in data):
msg = "List items must be either all Data objects or all dictionaries"
raise ValueError(msg)
kwargs["data"] = data
elif isinstance(data, dict | pd.DataFrame):
kwargs["data"] = data
Args:
data_list (list[Data]): A list of Data objects to convert into a DataFrame.
Each Data object's internal dictionary becomes a row in the DataFrame.
Returns:
DataSet: A new DataSet instance containing all data from the input list.
Examples:
>>> data_objects = [
... Data(data={"name": "John", "age": 30}),
... Data(data={"name": "Jane", "age": 25})
... ]
>>> dataset = DataSet.from_data_list(data_objects)
>>> print(dataset.columns)
Index(['name', 'age'], dtype='object')
Notes:
- Column names are derived from the keys in the Data objects
- If Data objects have different keys, the resulting DataFrame will have
NaN values for missing data
- The original structure of nested data is preserved in the DataFrame
"""
data_dicts = [d.data for d in data_list]
return cls(data_dicts)
super().__init__(**kwargs)
def to_data_list(self) -> list[Data]:
"""Converts the DataSet back to a list of Data objects.
"""Converts the DataSet back to a list of Data objects."""
list_of_dicts = self.to_dict(orient="records")
return [Data(data=row) for row in list_of_dicts]
This method transforms each row of the DataFrame back into a Data object,
reconstructing the original data structure.
def add_row(self, data: dict | Data) -> "DataSet":
"""Adds a single row to the dataset.
Args:
data: Either a Data object or a dictionary to add as a new row
Returns:
list[Data]: A list of Data objects, where each object corresponds to
a row in the DataFrame.
DataSet: A new DataSet with the added row
Examples:
>>> dataset = DataSet({'name': ['John'], 'age': [30]})
>>> data_objects = dataset.to_data_list()
>>> print(data_objects[0].data)
{'name': 'John', 'age': 30}
Notes:
- Each row is converted to a dictionary using to_dict()
- The resulting Data objects will contain all columns as keys in their
internal dictionary
- Any modifications made to the DataFrame will be reflected in the
resulting Data objects
Example:
>>> dataset = DataSet([{"name": "John"}])
>>> dataset = dataset.add_row({"name": "Jane"})
"""
return [Data(data=row.to_dict()) for _, row in self.iterrows()]
if isinstance(data, Data):
data = data.data
new_df = self._constructor([data])
return cast(DataSet, pd.concat([self, new_df], ignore_index=True))
def add_rows(self, data: list[dict | Data]) -> "DataSet":
    """Append several rows and return the concatenated result.

    Args:
        data: Items to append. A Data object contributes its ``.data``
            attribute; any other item is passed through unchanged.

    Returns:
        DataSet: The result of concatenating this dataset with the new rows,
            using ``ignore_index=True`` so the index is renumbered.
    """
    # Unwrap Data objects so the constructor only sees their payloads.
    rows = [item.data if isinstance(item, Data) else item for item in data]
    appended = self._constructor(rows)
    return cast(DataSet, pd.concat([self, appended], ignore_index=True))
@property
def _constructor(self):

View file

@ -17,13 +17,13 @@ def sample_data_objects() -> list[Data]:
@pytest.fixture
def sample_dataset(sample_data_objects) -> DataSet:
"""Fixture providing a sample DataSet instance."""
return DataSet.from_data_list(sample_data_objects)
return DataSet(sample_data_objects)
def test_from_data_list_basic():
"""Test basic functionality of from_data_list."""
data_objects = [Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})]
dataset = DataSet.from_data_list(data_objects)
dataset = DataSet(data_objects)
assert isinstance(dataset, DataSet)
assert isinstance(dataset, pd.DataFrame)
@ -35,7 +35,7 @@ def test_from_data_list_basic():
def test_from_data_list_empty():
"""Test from_data_list with empty input."""
dataset = DataSet.from_data_list([])
dataset = DataSet([])
assert isinstance(dataset, DataSet)
assert len(dataset) == 0
@ -46,7 +46,7 @@ def test_from_data_list_missing_fields():
Data(data={"name": "John", "age": 30}),
Data(data={"name": "Jane", "city": "Boston"}), # Missing age
]
dataset = DataSet.from_data_list(data_objects)
dataset = DataSet(data_objects)
assert isinstance(dataset, DataSet)
assert set(dataset.columns) == {"name", "age", "city"}
@ -60,7 +60,7 @@ def test_from_data_list_nested_data():
Data(data={"name": "John", "address": {"city": "New York", "zip": "10001"}}),
Data(data={"name": "Jane", "address": {"city": "Boston", "zip": "02108"}}),
]
dataset = DataSet.from_data_list(data_objects)
dataset = DataSet(data_objects)
assert isinstance(dataset, DataSet)
assert isinstance(dataset["address"][0], dict)
@ -123,7 +123,7 @@ def test_dataset_pandas_operations(sample_dataset):
def test_dataset_with_null_values():
"""Test handling of null values in DataSet."""
data_objects = [Data(data={"name": "John", "age": None}), Data(data={"name": None, "age": 25})]
dataset = DataSet.from_data_list(data_objects)
dataset = DataSet(data_objects)
assert pd.isna(dataset.iloc[0]["age"])
assert pd.isna(dataset.iloc[1]["name"])
@ -148,7 +148,7 @@ def test_dataset_type_preservation():
}
)
]
dataset = DataSet.from_data_list(data_objects)
dataset = DataSet(data_objects)
result = dataset.to_data_list()
assert isinstance(result[0].data["int_val"], int)
@ -157,3 +157,143 @@ def test_dataset_type_preservation():
assert isinstance(result[0].data["bool_val"], bool)
assert isinstance(result[0].data["list_val"], list)
assert isinstance(result[0].data["dict_val"], dict)
def test_add_row_with_dict(sample_dataset):
    """add_row accepts a plain dictionary and places it in the last row."""
    result = sample_dataset.add_row({"name": "Alice", "age": 28, "city": "Seattle"})

    assert isinstance(result, DataSet)
    assert len(result) == len(sample_dataset) + 1

    # The appended row is expected at the end of the result.
    appended = result.iloc[-1]
    assert appended["name"] == "Alice"
    assert appended["age"] == 28
    assert appended["city"] == "Seattle"
def test_add_row_with_data_object(sample_dataset):
    """add_row accepts a Data object and places its payload in the last row."""
    payload = {"name": "Alice", "age": 28, "city": "Seattle"}
    result = sample_dataset.add_row(Data(data=payload))

    assert isinstance(result, DataSet)
    assert len(result) == len(sample_dataset) + 1

    # Check each field of the last row in the same order as the payload keys.
    assert result.iloc[-1]["name"] == "Alice"
    assert result.iloc[-1]["age"] == 28
    assert result.iloc[-1]["city"] == "Seattle"
def test_add_rows_with_dicts(sample_dataset):
    """add_rows accepts a list of dictionaries and appends them in order."""
    alice = {"name": "Alice", "age": 28, "city": "Seattle"}
    charlie = {"name": "Charlie", "age": 32, "city": "Portland"}

    result = sample_dataset.add_rows([alice, charlie])

    assert isinstance(result, DataSet)
    assert len(result) == len(sample_dataset) + 2
    # The two new rows occupy the final two positions, in input order.
    assert result.iloc[-2]["name"] == "Alice"
    assert result.iloc[-1]["name"] == "Charlie"
def test_add_rows_with_data_objects(sample_dataset):
    """add_rows accepts a list of Data objects and appends them in order."""
    payloads = (
        {"name": "Alice", "age": 28, "city": "Seattle"},
        {"name": "Charlie", "age": 32, "city": "Portland"},
    )
    new_rows = [Data(data=payload) for payload in payloads]

    result = sample_dataset.add_rows(new_rows)

    assert isinstance(result, DataSet)
    assert len(result) == len(sample_dataset) + 2
    # The two new rows occupy the final two positions, in input order.
    assert result.iloc[-2]["name"] == "Alice"
    assert result.iloc[-1]["name"] == "Charlie"
def test_add_rows_mixed_types(sample_dataset):
    """add_rows accepts a list mixing dictionaries and Data objects."""
    as_dict = {"name": "Alice", "age": 28, "city": "Seattle"}
    as_data = Data(data={"name": "Charlie", "age": 32, "city": "Portland"})

    result = sample_dataset.add_rows([as_dict, as_data])

    assert isinstance(result, DataSet)
    assert len(result) == len(sample_dataset) + 2
    # Input order is dictionary first, Data object second.
    assert result.iloc[-2]["name"] == "Alice"
    assert result.iloc[-1]["name"] == "Charlie"
def test_init_with_data_objects():
    """The constructor accepts a list of Data objects directly."""
    john = Data(data={"name": "John", "age": 30})
    jane = Data(data={"name": "Jane", "age": 25})

    dataset = DataSet([john, jane])

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    # Spot-check one value from each row.
    assert dataset.iloc[0]["name"] == "John"
    assert dataset.iloc[1]["age"] == 25
def test_init_with_dicts():
    """The constructor accepts a list of row dictionaries."""
    dataset = DataSet(
        [
            {"name": "John", "age": 30},
            {"name": "Jane", "age": 25},
        ]
    )

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    # Spot-check one value from each row.
    first_row, second_row = dataset.iloc[0], dataset.iloc[1]
    assert first_row["name"] == "John"
    assert second_row["age"] == 25
def test_init_with_dict_of_lists():
    """The constructor accepts a column-oriented dictionary of lists."""
    columns = {"name": ["John", "Jane"], "age": [30, 25]}

    dataset = DataSet(columns)

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    # Spot-check one value from each row.
    assert dataset.iloc[0]["name"] == "John"
    assert dataset.iloc[1]["age"] == 25
def test_init_with_pandas_dataframe():
    """The constructor accepts an existing pandas DataFrame."""
    source_frame = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25]})

    dataset = DataSet(source_frame)

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    # Spot-check one value from each row.
    assert dataset.iloc[0]["name"] == "John"
    assert dataset.iloc[1]["age"] == 25
def test_init_with_none():
    """Passing None as the data yields a DataSet with no rows."""
    empty = DataSet(None)

    assert isinstance(empty, DataSet)
    assert len(empty) == 0
def test_init_with_invalid_list():
    """A list mixing dictionaries and Data objects is rejected with ValueError."""
    plain_dict = {"name": "John", "age": 30}
    data_object = Data(data={"name": "Jane", "age": 25})
    # Mixing the two item types in one list should fail.
    invalid_data = [plain_dict, data_object]

    expected_message = "List items must be either all Data objects or all dictionaries"
    with pytest.raises(ValueError, match=expected_message):
        DataSet(invalid_data)
def test_init_with_kwargs():
    """Extra keyword arguments such as ``index`` are honoured by the constructor."""
    columns = {"name": ["John", "Jane"], "age": [30, 25]}
    labels = ["a", "b"]

    dataset = DataSet(data=columns, index=labels)

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.index) == ["a", "b"]
    # Label-based lookups confirm the custom index is in use.
    assert dataset.loc["a"]["name"] == "John"
    assert dataset.loc["b"]["age"] == 25