diff --git a/src/backend/base/langflow/schema/__init__.py b/src/backend/base/langflow/schema/__init__.py index 619ab3ec0..38fce1ae6 100644 --- a/src/backend/base/langflow/schema/__init__.py +++ b/src/backend/base/langflow/schema/__init__.py @@ -1,5 +1,6 @@ from .data import Data +from .data_set import DataSet from .dotdict import dotdict from .message import Message -__all__ = ["Data", "dotdict", "Message"] +__all__ = ["Data", "dotdict", "Message", "DataSet"] diff --git a/src/backend/base/langflow/schema/data_set.py b/src/backend/base/langflow/schema/data_set.py index 449ceead0..48c879024 100644 --- a/src/backend/base/langflow/schema/data_set.py +++ b/src/backend/base/langflow/schema/data_set.py @@ -1,3 +1,5 @@ +from typing import cast + import pandas as pd from langflow.schema.data import Data @@ -9,91 +11,83 @@ class DataSet(pd.DataFrame): This class extends pandas.DataFrame to provide seamless integration between Langflow's Data objects and pandas' powerful data manipulation capabilities. - Key Features: - - Direct initialization from a list of Data objects - - Maintains all pandas DataFrame functionality - - Conversion back to Data objects when needed - - Notes: - - Nested dictionaries within Data objects are preserved in their column representation - - All pandas DataFrame operations (groupby, merge, concat, etc.) remain available - - Column dtypes are inferred from the Data objects' contents + Args: + data: Input data in various formats: + - List[Data]: List of Data objects + - List[Dict]: List of dictionaries + - Dict: Dictionary of arrays/lists + - pandas.DataFrame: Existing DataFrame + - Any format supported by pandas.DataFrame + **kwargs: Additional arguments passed to pandas.DataFrame constructor Examples: - >>> data_objects = [ - ... Data(data={"name": "John", "age": 30}), - ... Data(data={"name": "Jane", "age": 25}) - ... 
] - >>> dataset = DataSet.from_data_list(data_objects) - >>> dataset['age'].mean() - 27.5 - >>> original_data = dataset.to_data_list() + >>> # From Data objects + >>> dataset = DataSet([Data(data={"name": "John"}), Data(data={"name": "Jane"})]) - Inheritance: - This class inherits all functionality from pandas.DataFrame, meaning any - operation that works on a DataFrame will work on a DataSet: - - Filtering: dataset[dataset['age'] > 25] - - Aggregation: dataset.groupby('category').mean() - - Statistical operations: dataset.describe() - - etc. + >>> # From dictionaries + >>> dataset = DataSet([{"name": "John"}, {"name": "Jane"}]) + + >>> # From dictionary of lists + >>> dataset = DataSet({"name": ["John", "Jane"], "age": [30, 25]}) """ - @classmethod - def from_data_list(cls, data_list: list[Data]) -> "DataSet": - """Creates a DataSet from a list of Data objects. + def __init__(self, data: None | list[dict | Data] | dict | pd.DataFrame = None, **kwargs): + if data is None: + super().__init__(**kwargs) + return - This method converts a list of Data objects into a DataFrame structure, - preserving all data from the original Data objects. + if isinstance(data, list): + if all(isinstance(x, Data) for x in data): + data = [d.data for d in data] + elif not all(isinstance(x, dict) for x in data): + msg = "List items must be either all Data objects or all dictionaries" + raise ValueError(msg) + kwargs["data"] = data + else: # any other input pandas accepts (ndarray, Series, ...) is forwarded, not silently dropped + kwargs["data"] = data - Args: - data_list (list[Data]): A list of Data objects to convert into a DataFrame. - Each Data object's internal dictionary becomes a row in the DataFrame. - - Returns: - DataSet: A new DataSet instance containing all data from the input list. - - Examples: - >>> data_objects = [ - ... Data(data={"name": "John", "age": 30}), - ... Data(data={"name": "Jane", "age": 25}) - ... 
] - >>> dataset = DataSet.from_data_list(data_objects) - >>> print(dataset.columns) - Index(['name', 'age'], dtype='object') - - Notes: - - Column names are derived from the keys in the Data objects - - If Data objects have different keys, the resulting DataFrame will have - NaN values for missing data - - The original structure of nested data is preserved in the DataFrame - """ - data_dicts = [d.data for d in data_list] - return cls(data_dicts) + super().__init__(**kwargs) def to_data_list(self) -> list[Data]: - """Converts the DataSet back to a list of Data objects. + """Converts the DataSet back to a list of Data objects.""" + list_of_dicts = self.to_dict(orient="records") + return [Data(data=row) for row in list_of_dicts] - This method transforms each row of the DataFrame back into a Data object, - reconstructing the original data structure. + def add_row(self, data: dict | Data) -> "DataSet": + """Adds a single row to the dataset. + + Args: + data: Either a Data object or a dictionary to add as a new row Returns: - list[Data]: A list of Data objects, where each object corresponds to - a row in the DataFrame. 
+ DataSet: A new DataSet with the added row - Examples: - >>> dataset = DataSet({'name': ['John'], 'age': [30]}) - >>> data_objects = dataset.to_data_list() - >>> print(data_objects[0].data) - {'name': 'John', 'age': 30} - - Notes: - - Each row is converted to a dictionary using to_dict() - - The resulting Data objects will contain all columns as keys in their - internal dictionary - - Any modifications made to the DataFrame will be reflected in the - resulting Data objects + Example: + >>> dataset = DataSet([{"name": "John"}]) + >>> dataset = dataset.add_row({"name": "Jane"}) """ - return [Data(data=row.to_dict()) for _, row in self.iterrows()] + if isinstance(data, Data): + data = data.data + new_df = self._constructor([data]) + return cast(DataSet, pd.concat([self, new_df], ignore_index=True)) + + def add_rows(self, data: list[dict | Data]) -> "DataSet": + """Adds multiple rows to the dataset. + + Args: + data: List of Data objects or dictionaries to add as new rows + + Returns: + DataSet: A new DataSet with the added rows + """ + processed_data = [] + for item in data: + if isinstance(item, Data): + processed_data.append(item.data) + else: + processed_data.append(item) + new_df = self._constructor(processed_data) + return cast(DataSet, pd.concat([self, new_df], ignore_index=True)) @property def _constructor(self): diff --git a/src/backend/tests/unit/schema/test_schema_data_set.py b/src/backend/tests/unit/schema/test_schema_data_set.py index a40eee69a..bc63e2aa7 100644 --- a/src/backend/tests/unit/schema/test_schema_data_set.py +++ b/src/backend/tests/unit/schema/test_schema_data_set.py @@ -17,13 +17,13 @@ def sample_data_objects() -> list[Data]: @pytest.fixture def sample_dataset(sample_data_objects) -> DataSet: """Fixture providing a sample DataSet instance.""" - return DataSet.from_data_list(sample_data_objects) + return DataSet(sample_data_objects) def test_from_data_list_basic(): """Test basic functionality of from_data_list.""" data_objects = 
[Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})] - dataset = DataSet.from_data_list(data_objects) + dataset = DataSet(data_objects) assert isinstance(dataset, DataSet) assert isinstance(dataset, pd.DataFrame) @@ -35,7 +35,7 @@ def test_from_data_list_basic(): def test_from_data_list_empty(): """Test from_data_list with empty input.""" - dataset = DataSet.from_data_list([]) + dataset = DataSet([]) assert isinstance(dataset, DataSet) assert len(dataset) == 0 @@ -46,7 +46,7 @@ def test_from_data_list_missing_fields(): Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "city": "Boston"}), # Missing age ] - dataset = DataSet.from_data_list(data_objects) + dataset = DataSet(data_objects) assert isinstance(dataset, DataSet) assert set(dataset.columns) == {"name", "age", "city"} @@ -60,7 +60,7 @@ def test_from_data_list_nested_data(): Data(data={"name": "John", "address": {"city": "New York", "zip": "10001"}}), Data(data={"name": "Jane", "address": {"city": "Boston", "zip": "02108"}}), ] - dataset = DataSet.from_data_list(data_objects) + dataset = DataSet(data_objects) assert isinstance(dataset, DataSet) assert isinstance(dataset["address"][0], dict) @@ -123,7 +123,7 @@ def test_dataset_pandas_operations(sample_dataset): def test_dataset_with_null_values(): """Test handling of null values in DataSet.""" data_objects = [Data(data={"name": "John", "age": None}), Data(data={"name": None, "age": 25})] - dataset = DataSet.from_data_list(data_objects) + dataset = DataSet(data_objects) assert pd.isna(dataset.iloc[0]["age"]) assert pd.isna(dataset.iloc[1]["name"]) @@ -148,7 +148,7 @@ def test_dataset_type_preservation(): } ) ] - dataset = DataSet.from_data_list(data_objects) + dataset = DataSet(data_objects) result = dataset.to_data_list() assert isinstance(result[0].data["int_val"], int) @@ -157,3 +157,143 @@ def test_dataset_type_preservation(): assert isinstance(result[0].data["bool_val"], bool) assert 
isinstance(result[0].data["list_val"], list) assert isinstance(result[0].data["dict_val"], dict) + + +def test_add_row_with_dict(sample_dataset): + """Test adding a single row using a dictionary.""" + new_row = {"name": "Alice", "age": 28, "city": "Seattle"} + result = sample_dataset.add_row(new_row) + + assert isinstance(result, DataSet) + assert len(result) == len(sample_dataset) + 1 + assert result.iloc[-1]["name"] == "Alice" + assert result.iloc[-1]["age"] == 28 + assert result.iloc[-1]["city"] == "Seattle" + + +def test_add_row_with_data_object(sample_dataset): + """Test adding a single row using a Data object.""" + new_row = Data(data={"name": "Alice", "age": 28, "city": "Seattle"}) + result = sample_dataset.add_row(new_row) + + assert isinstance(result, DataSet) + assert len(result) == len(sample_dataset) + 1 + assert result.iloc[-1]["name"] == "Alice" + assert result.iloc[-1]["age"] == 28 + assert result.iloc[-1]["city"] == "Seattle" + + +def test_add_rows_with_dicts(sample_dataset): + """Test adding multiple rows using dictionaries.""" + new_rows = [{"name": "Alice", "age": 28, "city": "Seattle"}, {"name": "Charlie", "age": 32, "city": "Portland"}] + result = sample_dataset.add_rows(new_rows) + + assert isinstance(result, DataSet) + assert len(result) == len(sample_dataset) + 2 + assert result.iloc[-2]["name"] == "Alice" + assert result.iloc[-1]["name"] == "Charlie" + + +def test_add_rows_with_data_objects(sample_dataset): + """Test adding multiple rows using Data objects.""" + new_rows = [ + Data(data={"name": "Alice", "age": 28, "city": "Seattle"}), + Data(data={"name": "Charlie", "age": 32, "city": "Portland"}), + ] + result = sample_dataset.add_rows(new_rows) + + assert isinstance(result, DataSet) + assert len(result) == len(sample_dataset) + 2 + assert result.iloc[-2]["name"] == "Alice" + assert result.iloc[-1]["name"] == "Charlie" + + +def test_add_rows_mixed_types(sample_dataset): + """Test adding multiple rows using a mix of dictionaries and Data 
objects.""" + new_rows = [ + {"name": "Alice", "age": 28, "city": "Seattle"}, + Data(data={"name": "Charlie", "age": 32, "city": "Portland"}), + ] + result = sample_dataset.add_rows(new_rows) + + assert isinstance(result, DataSet) + assert len(result) == len(sample_dataset) + 2 + assert result.iloc[-2]["name"] == "Alice" + assert result.iloc[-1]["name"] == "Charlie" + + +def test_init_with_data_objects(): + """Test initialization with Data objects.""" + data_objects = [Data(data={"name": "John", "age": 30}), Data(data={"name": "Jane", "age": 25})] + dataset = DataSet(data_objects) + + assert isinstance(dataset, DataSet) + assert len(dataset) == 2 + assert list(dataset.columns) == ["name", "age"] + assert dataset.iloc[0]["name"] == "John" + assert dataset.iloc[1]["age"] == 25 + + +def test_init_with_dicts(): + """Test initialization with dictionaries.""" + data_dicts = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}] + dataset = DataSet(data_dicts) + + assert isinstance(dataset, DataSet) + assert len(dataset) == 2 + assert list(dataset.columns) == ["name", "age"] + assert dataset.iloc[0]["name"] == "John" + assert dataset.iloc[1]["age"] == 25 + + +def test_init_with_dict_of_lists(): + """Test initialization with a dictionary of lists.""" + data = {"name": ["John", "Jane"], "age": [30, 25]} + dataset = DataSet(data) + + assert isinstance(dataset, DataSet) + assert len(dataset) == 2 + assert list(dataset.columns) == ["name", "age"] + assert dataset.iloc[0]["name"] == "John" + assert dataset.iloc[1]["age"] == 25 + + +def test_init_with_pandas_dataframe(): + """Test initialization with a pandas DataFrame.""" + test_df = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25]}) + dataset = DataSet(test_df) + + assert isinstance(dataset, DataSet) + assert len(dataset) == 2 + assert list(dataset.columns) == ["name", "age"] + assert dataset.iloc[0]["name"] == "John" + assert dataset.iloc[1]["age"] == 25 + + +def test_init_with_none(): + """Test initialization with 
None.""" + dataset = DataSet(None) + assert isinstance(dataset, DataSet) + assert len(dataset) == 0 + + +def test_init_with_invalid_list(): + """Test initialization with invalid list items.""" + invalid_data = [ + {"name": "John", "age": 30}, + Data(data={"name": "Jane", "age": 25}), # Mixed types should fail + ] + with pytest.raises(ValueError, match="List items must be either all Data objects or all dictionaries"): + DataSet(invalid_data) + + +def test_init_with_kwargs(): + """Test initialization with additional kwargs.""" + data = {"name": ["John", "Jane"], "age": [30, 25]} + dataset = DataSet(data=data, index=["a", "b"]) + + assert isinstance(dataset, DataSet) + assert len(dataset) == 2 + assert list(dataset.index) == ["a", "b"] + assert dataset.loc["a"]["name"] == "John" + assert dataset.loc["b"]["age"] == 25