feat: add easier initialization to DataSet (#4864)
* feat: enhance DataSet class with improved constructor and methods for better data handling
  - Added custom constructor to support various input formats including lists of Data objects, dictionaries, and existing DataFrames.
  - Introduced methods `add_row` and `add_rows` for adding single or multiple rows to the DataSet.
  - Updated docstrings and examples for clarity and usability.
  - Ensured compatibility with pandas DataFrame operations while preserving Data object structures.
* test: add comprehensive tests for DataSet initialization and row operations
* feat: add DataSet class to schema module
* refactor: simplify DataSet initialization and improve data validation
This commit is contained in:
parent
159f6e5906
commit
7e88a4760b
3 changed files with 214 additions and 79 deletions
|
|
@ -1,5 +1,6 @@
|
|||
from .data import Data
|
||||
from .data_set import DataSet
|
||||
from .dotdict import dotdict
|
||||
from .message import Message
|
||||
|
||||
# Public API of the schema package (single definition; DataSet is exported
# alongside the existing names).
__all__ = ["Data", "dotdict", "Message", "DataSet"]
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
from typing import cast
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from langflow.schema.data import Data
|
||||
|
|
@ -9,91 +11,83 @@ class DataSet(pd.DataFrame):
|
|||
This class extends pandas.DataFrame to provide seamless integration between
|
||||
Langflow's Data objects and pandas' powerful data manipulation capabilities.
|
||||
|
||||
Key Features:
|
||||
- Direct initialization from a list of Data objects
|
||||
- Maintains all pandas DataFrame functionality
|
||||
- Conversion back to Data objects when needed
|
||||
|
||||
Notes:
|
||||
- Nested dictionaries within Data objects are preserved in their column representation
|
||||
- All pandas DataFrame operations (groupby, merge, concat, etc.) remain available
|
||||
- Column dtypes are inferred from the Data objects' contents
|
||||
Args:
|
||||
data: Input data in various formats:
|
||||
- List[Data]: List of Data objects
|
||||
- List[Dict]: List of dictionaries
|
||||
- Dict: Dictionary of arrays/lists
|
||||
- pandas.DataFrame: Existing DataFrame
|
||||
- Any format supported by pandas.DataFrame
|
||||
**kwargs: Additional arguments passed to pandas.DataFrame constructor
|
||||
|
||||
Examples:
|
||||
>>> data_objects = [
|
||||
... Data(data={"name": "John", "age": 30}),
|
||||
... Data(data={"name": "Jane", "age": 25})
|
||||
... ]
|
||||
>>> dataset = DataSet.from_data_list(data_objects)
|
||||
>>> dataset['age'].mean()
|
||||
27.5
|
||||
>>> original_data = dataset.to_data_list()
|
||||
>>> # From Data objects
|
||||
>>> dataset = DataSet([Data(data={"name": "John"}), Data(data={"name": "Jane"})])
|
||||
|
||||
Inheritance:
|
||||
This class inherits all functionality from pandas.DataFrame, meaning any
|
||||
operation that works on a DataFrame will work on a DataSet:
|
||||
- Filtering: dataset[dataset['age'] > 25]
|
||||
- Aggregation: dataset.groupby('category').mean()
|
||||
- Statistical operations: dataset.describe()
|
||||
- etc.
|
||||
>>> # From dictionaries
|
||||
>>> dataset = DataSet([{"name": "John"}, {"name": "Jane"}])
|
||||
|
||||
>>> # From dictionary of lists
|
||||
>>> dataset = DataSet({"name": ["John", "Jane"], "age": [30, 25]})
|
||||
"""
|
||||
|
||||
@classmethod
def from_data_list(cls, data_list: list[Data]) -> "DataSet":
    """Create a DataSet from a list of Data objects.

    Backward-compatible alternate constructor. Each Data object's ``data``
    dictionary is handed to the regular constructor as one row.

    Args:
        data_list: Data objects to convert.

    Returns:
        DataSet: A new instance built from the objects' dictionaries.
    """
    data_dicts = [d.data for d in data_list]
    return cls(data_dicts)

def __init__(self, data: None | list[dict | Data] | dict | pd.DataFrame = None, **kwargs):
    """Initialise the DataSet from Data objects, dictionaries, or pandas input.

    Args:
        data: Input data. One of:
            - ``None``: only ``**kwargs`` are passed to the parent constructor.
            - list of Data objects: each object's ``data`` dict becomes a row.
            - list of dictionaries: passed to the parent constructor as-is.
            - anything else (dict of lists, DataFrame, ...): forwarded
              unchanged to the parent constructor.
        **kwargs: Additional arguments passed to the pandas.DataFrame
            constructor (for example ``index``).

    Raises:
        ValueError: If ``data`` is a list whose items are neither all Data
            objects nor all dictionaries.
    """
    if isinstance(data, list):
        if all(isinstance(item, Data) for item in data):
            # Unwrap Data objects; an empty list also lands here and stays [].
            data = [item.data for item in data]
        elif not all(isinstance(item, dict) for item in data):
            msg = "List items must be either all Data objects or all dictionaries"
            raise ValueError(msg)

    # Forward every non-None input. Previously only list, dict and DataFrame
    # inputs were forwarded, so any other pandas-compatible value was
    # silently discarded and produced an empty frame.
    if data is not None:
        kwargs["data"] = data

    super().__init__(**kwargs)
|
||||
|
||||
def to_data_list(self) -> list[Data]:
    """Convert the DataSet back to a list of Data objects.

    Returns:
        list[Data]: One Data object per row, each wrapping that row's
        ``{column: value}`` dictionary as produced by
        ``to_dict(orient="records")``.
    """
    list_of_dicts = self.to_dict(orient="records")
    return [Data(data=row) for row in list_of_dicts]
|
||||
def add_row(self, data: dict | Data) -> "DataSet":
    """Add a single row to the dataset.

    Args:
        data: Either a Data object or a dictionary to add as a new row.
            A Data object is unwrapped to its ``data`` dictionary first.

    Returns:
        DataSet: The result of concatenating this dataset with the new row.
        The index is renumbered because ``ignore_index=True`` is used.

    Example:
        >>> dataset = DataSet([{"name": "John"}])
        >>> dataset = dataset.add_row({"name": "Jane"})
    """
    if isinstance(data, Data):
        data = data.data
    # Build a one-row frame of the same class, then append it via concat.
    new_df = self._constructor([data])
    return cast(DataSet, pd.concat([self, new_df], ignore_index=True))
|
||||
|
||||
def add_rows(self, data: list[dict | Data]) -> "DataSet":
    """Add several rows to the dataset in one call.

    Args:
        data: Data objects and/or dictionaries to append as new rows.
            Data objects are unwrapped to their ``data`` dictionaries;
            every other item is used unchanged.

    Returns:
        DataSet: The result of concatenating this dataset with the new rows
        (``ignore_index=True``).
    """
    row_dicts = [entry.data if isinstance(entry, Data) else entry for entry in data]
    appended = self._constructor(row_dicts)
    combined = pd.concat([self, appended], ignore_index=True)
    return cast(DataSet, combined)
|
||||
|
||||
@property
|
||||
def _constructor(self):
|
||||
|
|
|
|||
|
|
@ -17,13 +17,13 @@ def sample_data_objects() -> list[Data]:
|
|||
@pytest.fixture
def sample_dataset(sample_data_objects) -> DataSet:
    """Fixture providing a sample DataSet built directly from Data objects."""
    return DataSet(sample_data_objects)
|
||||
|
||||
|
||||
def test_from_data_list_basic():
|
||||
"""Test basic functionality of from_data_list."""
|
||||
data_objects = [Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})]
|
||||
dataset = DataSet.from_data_list(data_objects)
|
||||
dataset = DataSet(data_objects)
|
||||
|
||||
assert isinstance(dataset, DataSet)
|
||||
assert isinstance(dataset, pd.DataFrame)
|
||||
|
|
@ -35,7 +35,7 @@ def test_from_data_list_basic():
|
|||
|
||||
def test_from_data_list_empty():
    """Test DataSet construction with empty input."""
    dataset = DataSet([])
    assert isinstance(dataset, DataSet)
    assert len(dataset) == 0
|
||||
|
||||
|
|
@ -46,7 +46,7 @@ def test_from_data_list_missing_fields():
|
|||
Data(data={"name": "John", "age": 30}),
|
||||
Data(data={"name": "Jane", "city": "Boston"}), # Missing age
|
||||
]
|
||||
dataset = DataSet.from_data_list(data_objects)
|
||||
dataset = DataSet(data_objects)
|
||||
|
||||
assert isinstance(dataset, DataSet)
|
||||
assert set(dataset.columns) == {"name", "age", "city"}
|
||||
|
|
@ -60,7 +60,7 @@ def test_from_data_list_nested_data():
|
|||
Data(data={"name": "John", "address": {"city": "New York", "zip": "10001"}}),
|
||||
Data(data={"name": "Jane", "address": {"city": "Boston", "zip": "02108"}}),
|
||||
]
|
||||
dataset = DataSet.from_data_list(data_objects)
|
||||
dataset = DataSet(data_objects)
|
||||
|
||||
assert isinstance(dataset, DataSet)
|
||||
assert isinstance(dataset["address"][0], dict)
|
||||
|
|
@ -123,7 +123,7 @@ def test_dataset_pandas_operations(sample_dataset):
|
|||
def test_dataset_with_null_values():
|
||||
"""Test handling of null values in DataSet."""
|
||||
data_objects = [Data(data={"name": "John", "age": None}), Data(data={"name": None, "age": 25})]
|
||||
dataset = DataSet.from_data_list(data_objects)
|
||||
dataset = DataSet(data_objects)
|
||||
|
||||
assert pd.isna(dataset.iloc[0]["age"])
|
||||
assert pd.isna(dataset.iloc[1]["name"])
|
||||
|
|
@ -148,7 +148,7 @@ def test_dataset_type_preservation():
|
|||
}
|
||||
)
|
||||
]
|
||||
dataset = DataSet.from_data_list(data_objects)
|
||||
dataset = DataSet(data_objects)
|
||||
result = dataset.to_data_list()
|
||||
|
||||
assert isinstance(result[0].data["int_val"], int)
|
||||
|
|
@ -157,3 +157,143 @@ def test_dataset_type_preservation():
|
|||
assert isinstance(result[0].data["bool_val"], bool)
|
||||
assert isinstance(result[0].data["list_val"], list)
|
||||
assert isinstance(result[0].data["dict_val"], dict)
|
||||
|
||||
|
||||
def test_add_row_with_dict(sample_dataset):
    """add_row with a plain dict returns a DataSet with one extra trailing row."""
    row = {"name": "Alice", "age": 28, "city": "Seattle"}
    extended = sample_dataset.add_row(row)

    assert isinstance(extended, DataSet)
    assert len(extended) == len(sample_dataset) + 1

    last = extended.iloc[-1]
    assert last["name"] == "Alice"
    assert last["age"] == 28
    assert last["city"] == "Seattle"
|
||||
|
||||
|
||||
def test_add_row_with_data_object(sample_dataset):
    """add_row with a Data object returns a DataSet with one extra trailing row."""
    row = Data(data={"name": "Alice", "age": 28, "city": "Seattle"})
    extended = sample_dataset.add_row(row)

    assert isinstance(extended, DataSet)
    assert len(extended) == len(sample_dataset) + 1

    last = extended.iloc[-1]
    assert last["name"] == "Alice"
    assert last["age"] == 28
    assert last["city"] == "Seattle"
|
||||
|
||||
|
||||
def test_add_rows_with_dicts(sample_dataset):
    """add_rows with dictionaries appends one row per dict, in order."""
    extra_rows = [
        {"name": "Alice", "age": 28, "city": "Seattle"},
        {"name": "Charlie", "age": 32, "city": "Portland"},
    ]
    extended = sample_dataset.add_rows(extra_rows)

    assert isinstance(extended, DataSet)
    assert len(extended) == len(sample_dataset) + 2
    assert extended.iloc[-2]["name"] == "Alice"
    assert extended.iloc[-1]["name"] == "Charlie"
|
||||
|
||||
|
||||
def test_add_rows_with_data_objects(sample_dataset):
    """add_rows with Data objects appends one row per object, in order."""
    alice = Data(data={"name": "Alice", "age": 28, "city": "Seattle"})
    charlie = Data(data={"name": "Charlie", "age": 32, "city": "Portland"})
    extended = sample_dataset.add_rows([alice, charlie])

    assert isinstance(extended, DataSet)
    assert len(extended) == len(sample_dataset) + 2
    assert extended.iloc[-2]["name"] == "Alice"
    assert extended.iloc[-1]["name"] == "Charlie"
|
||||
|
||||
|
||||
def test_add_rows_mixed_types(sample_dataset):
    """add_rows accepts a list that mixes dictionaries and Data objects."""
    as_dict = {"name": "Alice", "age": 28, "city": "Seattle"}
    as_data = Data(data={"name": "Charlie", "age": 32, "city": "Portland"})
    extended = sample_dataset.add_rows([as_dict, as_data])

    assert isinstance(extended, DataSet)
    assert len(extended) == len(sample_dataset) + 2
    assert extended.iloc[-2]["name"] == "Alice"
    assert extended.iloc[-1]["name"] == "Charlie"
|
||||
|
||||
|
||||
def test_init_with_data_objects():
    """The constructor accepts a list of Data objects directly."""
    john = Data(data={"name": "John", "age": 30})
    jane = Data(data={"name": "Jane", "age": 25})
    dataset = DataSet([john, jane])

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    assert dataset.iloc[0]["name"] == "John"
    assert dataset.iloc[1]["age"] == 25
|
||||
|
||||
|
||||
def test_init_with_dicts():
    """The constructor accepts a list of row dictionaries."""
    rows = [
        {"name": "John", "age": 30},
        {"name": "Jane", "age": 25},
    ]
    dataset = DataSet(rows)

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    assert dataset.iloc[0]["name"] == "John"
    assert dataset.iloc[1]["age"] == 25
|
||||
|
||||
|
||||
def test_init_with_dict_of_lists():
    """The constructor accepts a column-oriented dictionary of lists."""
    columns = {"name": ["John", "Jane"], "age": [30, 25]}
    dataset = DataSet(columns)

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    assert dataset.iloc[0]["name"] == "John"
    assert dataset.iloc[1]["age"] == 25
|
||||
|
||||
|
||||
def test_init_with_pandas_dataframe():
    """The constructor accepts an existing pandas DataFrame."""
    source_frame = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
    dataset = DataSet(source_frame)

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.columns) == ["name", "age"]
    assert dataset.iloc[0]["name"] == "John"
    assert dataset.iloc[1]["age"] == 25
|
||||
|
||||
|
||||
def test_init_with_none():
    """Passing None explicitly yields an empty DataSet."""
    empty = DataSet(None)

    assert isinstance(empty, DataSet)
    assert len(empty) == 0
|
||||
|
||||
|
||||
def test_init_with_invalid_list():
    """A list mixing dictionaries and Data objects is rejected with ValueError."""
    # Mixed item types should fail validation in the constructor.
    mixed_items = [
        {"name": "John", "age": 30},
        Data(data={"name": "Jane", "age": 25}),
    ]
    expected_message = "List items must be either all Data objects or all dictionaries"

    with pytest.raises(ValueError, match=expected_message):
        DataSet(mixed_items)
|
||||
|
||||
|
||||
def test_init_with_kwargs():
    """Extra keyword arguments such as ``index`` reach the DataFrame constructor."""
    columns = {"name": ["John", "Jane"], "age": [30, 25]}
    labels = ["a", "b"]
    dataset = DataSet(data=columns, index=labels)

    assert isinstance(dataset, DataSet)
    assert len(dataset) == 2
    assert list(dataset.index) == ["a", "b"]
    assert dataset.loc["a"]["name"] == "John"
    assert dataset.loc["b"]["age"] == 25
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue