feat: add dataframe operations component (#5341)

* add dataframe operations component

* populate entire new column with value

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* [autofix.ci] apply automated fixes

* Add unit tests for DataFrame operations in `test_dataframe_operations.py`

* **Import modules**
  - Import `pytest` and `pandas` for testing DataFrame operations

* **Define test cases**
  - Define test cases for edge cases like empty DataFrames and invalid column names
  - Include tests for operations like "Head", "Tail", and "Replace Value"
  - Use `pytest.mark.parametrize` to test multiple operations with different inputs
  - Add detailed assertions to verify the correctness of DataFrame operations

* [autofix.ci] apply automated fixes

* Remove test cases for DataFrame operations from `test_dataframe_operations.py`. This deletion includes all unit tests related to various DataFrame operations such as adding, dropping, filtering, and renaming columns, as well as handling edge cases like empty DataFrames and invalid operations. The removal streamlines the test suite by eliminating outdated or redundant tests.

* Add unit tests for DataFrame operations in `test_dataframe_operations.py`

- Introduced a new test file for organizing test components.
- Updated import paths for `DataFrameOperationsComponent` to reflect the new module structure.
- Refactored test cases to use `pytest.mark.parametrize` for better readability and maintainability.
- Enhanced assertions in tests for various DataFrame operations, including handling of empty DataFrames and invalid operations.
- Improved code formatting for consistency and clarity.

* Refactor DataFrameOperationsComponent for improved readability and maintainability

- Consolidated import statements for clarity.
- Renamed variable `df` to `dataframe_copy` for better understanding.
- Streamlined the `perform_operation` method by replacing `elif` with `if` statements for clearer logic flow.
- Enhanced error message for unsupported operations to improve debugging.

These changes aim to enhance the code structure and make future modifications easier.

* Update unit tests for DataFrame operations in `test_dataframe_operations.py`

- Modified expected values in parameterized tests for various DataFrame operations, including "Add Column", "Filter", "Sort", "Head", "Tail", and "Replace Value" to reflect new test scenarios.
- Adjusted assertions to ensure they correctly validate the output of operations, particularly for lists of expected values.
- Enhanced error handling in the test for invalid operations to provide clearer feedback on unsupported operation types.

These changes improve the accuracy and robustness of the unit tests for DataFrame operations.

* Refactor DataFrameOperationsComponent methods to return DataFrame instances consistently

---------

Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Rodrigo Nader 2024-12-19 13:14:04 -03:00 committed by GitHub
commit 62c13adcfd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 296 additions and 0 deletions

View file

@ -0,0 +1,212 @@
from langflow.custom import Component
from langflow.io import BoolInput, DataFrameInput, DropdownInput, IntInput, MessageTextInput, Output, StrInput
from langflow.schema import DataFrame
class DataFrameOperationsComponent(Component):
    """Component that applies one selected operation to an input DataFrame.

    The operation is chosen through the ``operation`` dropdown. Each operation
    reads its arguments from a small set of inputs that stay hidden until the
    matching operation is selected (see ``update_build_config``). The single
    output is produced by ``perform_operation``, which always works on a copy
    of the input so the caller's DataFrame is left untouched.
    """

    display_name = "DataFrame Operations"
    description = "Perform various operations on a DataFrame."
    icon = "table"

    # Available operations
    OPERATION_CHOICES = [
        "Add Column",
        "Drop Column",
        "Filter",
        "Head",
        "Rename Column",
        "Replace Value",
        "Select Columns",
        "Sort",
        "Tail",
    ]

    # Every input whose visibility depends on the selected operation.
    DYNAMIC_FIELDS = (
        "column_name",
        "filter_value",
        "ascending",
        "new_column_name",
        "new_column_value",
        "columns_to_select",
        "num_rows",
        "replace_value",
        "replacement_value",
    )

    # Inputs that must be visible for each operation.
    OPERATION_FIELDS = {
        "Filter": ("column_name", "filter_value"),
        "Sort": ("column_name", "ascending"),
        "Drop Column": ("column_name",),
        "Rename Column": ("column_name", "new_column_name"),
        "Add Column": ("new_column_name", "new_column_value"),
        "Select Columns": ("columns_to_select",),
        "Head": ("num_rows",),
        "Tail": ("num_rows",),
        "Replace Value": ("column_name", "replace_value", "replacement_value"),
    }

    # Name of the method implementing each operation; resolved with getattr so
    # subclasses overriding a method are still honoured.
    OPERATION_METHODS = {
        "Filter": "filter_rows_by_value",
        "Sort": "sort_by_column",
        "Drop Column": "drop_column",
        "Rename Column": "rename_column",
        "Add Column": "add_column",
        "Select Columns": "select_columns",
        "Head": "head",
        "Tail": "tail",
        "Replace Value": "replace_values",
    }

    inputs = [
        DataFrameInput(
            name="df",
            display_name="DataFrame",
            info="The input DataFrame to operate on.",
        ),
        DropdownInput(
            name="operation",
            display_name="Operation",
            options=OPERATION_CHOICES,
            info="Select the DataFrame operation to perform.",
            real_time_refresh=True,
        ),
        StrInput(
            name="column_name",
            display_name="Column Name",
            info="The column name to use for the operation.",
            dynamic=True,
            show=False,
        ),
        MessageTextInput(
            name="filter_value",
            display_name="Filter Value",
            info="The value to filter rows by.",
            dynamic=True,
            show=False,
        ),
        BoolInput(
            name="ascending",
            display_name="Sort Ascending",
            info="Whether to sort in ascending order.",
            dynamic=True,
            show=False,
            value=True,
        ),
        StrInput(
            name="new_column_name",
            display_name="New Column Name",
            info="The new column name when renaming or adding a column.",
            dynamic=True,
            show=False,
        ),
        MessageTextInput(
            name="new_column_value",
            display_name="New Column Value",
            info="The value to populate the new column with.",
            dynamic=True,
            show=False,
        ),
        StrInput(
            name="columns_to_select",
            display_name="Columns to Select",
            info="The names of the columns to keep in the result.",
            dynamic=True,
            is_list=True,
            show=False,
        ),
        IntInput(
            name="num_rows",
            display_name="Number of Rows",
            info="Number of rows to return (for head/tail).",
            dynamic=True,
            show=False,
            value=5,
        ),
        MessageTextInput(
            name="replace_value",
            display_name="Value to Replace",
            info="The value to replace in the column.",
            dynamic=True,
            show=False,
        ),
        MessageTextInput(
            name="replacement_value",
            display_name="Replacement Value",
            info="The value to replace with.",
            dynamic=True,
            show=False,
        ),
    ]

    outputs = [
        Output(
            display_name="DataFrame",
            name="output",
            method="perform_operation",
            info="The resulting DataFrame after the operation.",
        )
    ]

    def update_build_config(self, build_config, field_value, field_name=None):
        """Show only the inputs that the newly selected operation needs.

        Args:
            build_config: Mapping of field name to its configuration mapping;
                the ``"show"`` entry of each dynamic field is updated in place.
            field_value: The new value of the field that changed.
            field_name: The name of the field that changed.

        Returns:
            The same ``build_config`` object.
        """
        # Only a change of the operation may alter visibility. Resetting on any
        # other field update would hide the inputs of the current operation.
        if field_name != "operation":
            return build_config

        for field in self.DYNAMIC_FIELDS:
            build_config[field]["show"] = False
        # An unknown operation leaves every dynamic field hidden.
        for field in self.OPERATION_FIELDS.get(field_value, ()):
            build_config[field]["show"] = True
        return build_config

    def perform_operation(self) -> DataFrame:
        """Run the selected operation on a copy of the input DataFrame.

        Returns:
            The DataFrame produced by the operation.

        Raises:
            ValueError: If ``self.operation`` is not a supported operation.
        """
        operation = self.operation
        method_name = self.OPERATION_METHODS.get(operation)
        if method_name is None:
            msg = f"Unsupported operation: {operation}"
            raise ValueError(msg)
        # Work on a copy so in-place steps (add/replace) never touch the input.
        return getattr(self, method_name)(self.df.copy())

    # Existing methods
    def filter_rows_by_value(self, df: DataFrame) -> DataFrame:
        """Keep the rows whose ``column_name`` value equals ``filter_value``."""
        return DataFrame(df[df[self.column_name] == self.filter_value])

    def sort_by_column(self, df: DataFrame) -> DataFrame:
        """Sort rows by ``column_name`` in the direction given by ``ascending``."""
        return DataFrame(df.sort_values(by=self.column_name, ascending=self.ascending))

    def drop_column(self, df: DataFrame) -> DataFrame:
        """Remove the column named ``column_name``."""
        return DataFrame(df.drop(columns=[self.column_name]))

    def rename_column(self, df: DataFrame) -> DataFrame:
        """Rename ``column_name`` to ``new_column_name``."""
        return DataFrame(df.rename(columns={self.column_name: self.new_column_name}))

    def add_column(self, df: DataFrame) -> DataFrame:
        """Add ``new_column_name`` with ``new_column_value`` repeated in every row."""
        df[self.new_column_name] = [self.new_column_value] * len(df)
        return DataFrame(df)

    def select_columns(self, df: DataFrame) -> DataFrame:
        """Keep only the columns listed in ``columns_to_select``."""
        # Surrounding whitespace is stripped from each requested name.
        columns = [col.strip() for col in self.columns_to_select]
        return DataFrame(df[columns])

    # New methods
    def head(self, df: DataFrame) -> DataFrame:
        """Return the first ``num_rows`` rows."""
        return DataFrame(df.head(self.num_rows))

    def tail(self, df: DataFrame) -> DataFrame:
        """Return the last ``num_rows`` rows."""
        return DataFrame(df.tail(self.num_rows))

    def replace_values(self, df: DataFrame) -> DataFrame:
        """Replace ``replace_value`` with ``replacement_value`` in ``column_name``."""
        df[self.column_name] = df[self.column_name].replace(self.replace_value, self.replacement_value)
        return DataFrame(df)

View file

@ -0,0 +1,84 @@
import pandas as pd
import pytest
from langflow.components.processing.dataframe_operations import DataFrameOperationsComponent
@pytest.fixture
def sample_dataframe():
    """Provide a five-row frame with integer columns A and B and a string column C."""
    return pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5],
            "B": [5, 4, 3, 2, 1],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
@pytest.mark.parametrize(
    ("operation", "expected_columns", "expected_values"),
    [
        ("Add Column", ["A", "B", "C", "D"], [1, 5, "a", 10]),
        ("Drop Column", ["A", "C"], [1, "a"]),
        ("Filter", ["A", "B", "C"], [3, 3, "c"]),
        ("Sort", ["A", "B", "C"], [5, 1, "e"]),
        ("Rename Column", ["Z", "B", "C"], [1, 5, "a"]),
        ("Select Columns", ["A", "C"], [1, "a"]),
        ("Head", ["A", "B", "C"], [1, 5, "a"]),
        ("Tail", ["A", "B", "C"], [5, 1, "e"]),
        ("Replace Value", ["A", "B", "C"], [1, 5, "z"]),
    ],
)
def test_operations(sample_dataframe, operation, expected_columns, expected_values):
    """Each operation yields the expected columns and first row.

    Every case checks the first row of the result, so operations that only
    change the column set (drop, rename, select) are verified on their data
    as well as on their column names.
    """
    # Attributes each operation reads, applied to the component before running.
    settings_by_operation = {
        "Add Column": {"new_column_name": "D", "new_column_value": 10},
        "Drop Column": {"column_name": "B"},
        "Filter": {"column_name": "A", "filter_value": 3},
        "Sort": {"column_name": "A", "ascending": False},
        "Rename Column": {"column_name": "A", "new_column_name": "Z"},
        "Select Columns": {"columns_to_select": ["A", "C"]},
        "Head": {"num_rows": 1},
        "Tail": {"num_rows": 1},
        "Replace Value": {"column_name": "C", "replace_value": "a", "replacement_value": "z"},
    }

    component = DataFrameOperationsComponent()
    component.df = sample_dataframe
    component.operation = operation
    for attribute, value in settings_by_operation[operation].items():
        setattr(component, attribute, value)

    result = component.perform_operation()

    assert list(result.columns) == expected_columns
    assert list(result.iloc[0]) == expected_values
    # The component works on a copy, so the input frame must be unchanged.
    assert list(sample_dataframe.columns) == ["A", "B", "C"]
    assert list(sample_dataframe.iloc[0]) == [1, 5, "a"]
def test_empty_dataframe():
    """Head on a DataFrame with no rows or columns returns an empty result."""
    head_component = DataFrameOperationsComponent()
    head_component.df = pd.DataFrame()
    head_component.operation = "Head"
    head_component.num_rows = 3

    assert head_component.perform_operation().empty
def test_non_existent_column():
    """Dropping a column that is not in the frame raises KeyError."""
    single_column_frame = pd.DataFrame({"A": [1, 2, 3]})

    dropper = DataFrameOperationsComponent()
    dropper.df = single_column_frame
    dropper.operation = "Drop Column"
    dropper.column_name = "B"

    with pytest.raises(KeyError):
        dropper.perform_operation()
def test_invalid_operation():
    """An operation name outside the supported set raises ValueError."""
    single_column_frame = pd.DataFrame({"A": [1, 2, 3]})

    unsupported = DataFrameOperationsComponent()
    unsupported.df = single_column_frame
    unsupported.operation = "Invalid Operation"

    with pytest.raises(ValueError, match="Unsupported operation: Invalid Operation"):
        unsupported.perform_operation()