feat: Add ParseDataFrameComponent for DataFrame-to-text conversion with tests (#5594)

* add dataframe outputs to vector stores, directory, url, split text

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes (attempt 2/3)

* add parse dataframe

* [autofix.ci] apply automated fixes

* Refactor: Update DataFrame handling in components

- Added import of DataFrame in directory and url components.
- Renamed variable 'df' to 'dataframe' in ParseDataFrameComponent for clarity.
- Updated method _clean_args and parse_data to use 'dataframe' instead of 'df' for consistency.

These changes enhance code readability and maintainability by standardizing the terminology used for DataFrame objects.

* [autofix.ci] apply automated fixes

* remove parse dataframe

* feat: add parse dataframe component

* [autofix.ci] apply automated fixes

* Refactor: Remove duplicate as_dataframe method in LCVectorStoreComponent

This commit eliminates the redundant as_dataframe method in the LCVectorStoreComponent class, streamlining the code and improving maintainability. The method was previously defined twice, and this change enhances clarity by ensuring only one implementation exists.

* [autofix.ci] apply automated fixes

* Refactor: Standardize DataFrame variable naming in ParseDataFrameComponent

This commit renames the variable 'df' to 'dataframe' in the ParseDataFrameComponent class to improve clarity and consistency. The changes are reflected in the _clean_args and parse_data methods, enhancing code readability and maintainability.

* test: add unit tests for ParseDataFrameComponent

This commit introduces a comprehensive suite of unit tests for the ParseDataFrameComponent, covering various scenarios including successful parsing with default and custom templates, handling of empty dataframes, invalid template keys, and performance on large dataframes. The tests ensure that the component behaves correctly with different data types and separators, and validate its functionality in both synchronous and asynchronous contexts. These additions enhance the reliability and maintainability of the component.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
This commit is contained in:
Rodrigo Nader 2025-01-17 20:41:42 -03:00 committed by GitHub
commit 8d902e6c74
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 210 additions and 0 deletions

View file

@ -24,6 +24,7 @@ __all__ = [
"MergeDataComponent",
"MessageToDataComponent",
"ParseDataComponent",
"ParseDataFrameComponent",
"ParseJSONDataComponent",
"SelectDataComponent",
"SplitTextComponent",

View file

@ -0,0 +1,67 @@
from langflow.custom import Component
from langflow.io import DataFrameInput, MultilineInput, Output, StrInput
from langflow.schema.message import Message
class ParseDataFrameComponent(Component):
    # Renders every row of the input DataFrame through a str.format template
    # and joins the rendered rows into one text Message.
    display_name = "Parse DataFrame"
    description = (
        "Convert a DataFrame into plain text following a specified template. "
        "Each column in the DataFrame is treated as a possible template key, e.g. {col_name}."
    )
    icon = "braces"
    name = "ParseDataFrame"

    inputs = [
        DataFrameInput(
            name="df",
            display_name="DataFrame",
            info="The DataFrame to convert to text rows.",
        ),
        MultilineInput(
            name="template",
            display_name="Template",
            info=(
                "The template for formatting each row. "
                "Use placeholders matching column names in the DataFrame, for example '{col1}', '{col2}'."
            ),
            value="{text}",
        ),
        StrInput(
            name="sep",
            display_name="Separator",
            advanced=True,
            value="\n",
            info="String that joins all row texts when building the single Text output.",
        ),
    ]

    outputs = [
        Output(
            display_name="Text",
            name="text",
            info="All rows combined into a single text, each row formatted by the template and separated by `sep`.",
            method="parse_data",
        ),
    ]

    def _clean_args(self):
        """Collect the component inputs as a ``(dataframe, template, sep)`` tuple.

        A falsy template falls back to ``"{text}"`` and a falsy separator
        falls back to a newline; the DataFrame is passed through untouched.
        """
        frame = self.df
        row_template = self.template or "{text}"
        joiner = self.sep or "\n"
        return frame, row_template, joiner

    def parse_data(self) -> Message:
        """Format each DataFrame row with the template and join the rows with ``sep``.

        Every row is turned into a dict keyed by column name and expanded into
        ``str.format``, so a placeholder naming a column that does not exist
        raises ``KeyError``. The combined text is also stored in
        ``self.status``.

        Returns:
            Message: a message whose ``text`` is the joined row strings.
        """
        frame, row_template, joiner = self._clean_args()

        # One formatted string per row, e.g. template "{text}" with row {"text": "Hello"} -> "Hello".
        rendered_rows = [row_template.format(**series.to_dict()) for _, series in frame.iterrows()]

        combined_text = joiner.join(rendered_rows)
        self.status = combined_text  # store in self.status for UI logs
        return Message(text=combined_text)

View file

@ -0,0 +1,142 @@
import asyncio
import pandas as pd
import pytest
from langflow.components.processing.parse_dataframe import ParseDataFrameComponent
from langflow.schema import DataFrame
from langflow.schema.message import Message
from tests.base import ComponentTestBaseWithoutClient
class TestParseDataFrameComponent(ComponentTestBaseWithoutClient):
    """Tests for ParseDataFrameComponent: templates, separators, edge cases and larger inputs."""

    @pytest.fixture
    def component_class(self):
        """Provide the component class under test."""
        return ParseDataFrameComponent

    @pytest.fixture
    def default_kwargs(self):
        """Provide default constructor kwargs: a one-row frame, the "{text}" template, newline separator."""
        single_row_frame = DataFrame({"text": ["Hello"]})
        return {"df": single_row_frame, "template": "{text}", "sep": "\n"}

    @pytest.fixture
    def file_names_mapping(self):
        """Provide an empty list, as this component has no version-specific files."""
        return []

    @staticmethod
    def _parse_to_message(component):
        """Call ``parse_data`` on the component and assert the result is a ``Message``."""
        message = component.parse_data()
        assert isinstance(message, Message)
        return message

    def test_successful_parse_with_default_template(self):
        """Three rows rendered with "{text}" are joined by newlines and mirrored in ``status``."""
        expected_text = "Hello\nWorld\nTest"
        frame = DataFrame({"text": ["Hello", "World", "Test"]})
        component = ParseDataFrameComponent(df=frame, template="{text}", sep="\n")

        message = self._parse_to_message(component)

        assert message.text == expected_text
        assert component.status == expected_text

    def test_parse_with_custom_template(self):
        """A template with two placeholders pulls values from both columns."""
        frame = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
        component = ParseDataFrameComponent(df=frame, template="Name: {name}, Age: {age}", sep=" | ")

        message = self._parse_to_message(component)

        assert message.text == "Name: John, Age: 30 | Name: Jane, Age: 25"

    def test_parse_with_custom_separator(self):
        """The supplied separator is placed between the rendered rows."""
        frame = DataFrame({"text": ["Hello", "World"]})
        component = ParseDataFrameComponent(df=frame, template="{text}", sep=" --- ")

        message = self._parse_to_message(component)

        assert message.text == "Hello --- World"

    def test_empty_dataframe(self):
        """A frame without rows produces an empty text."""
        empty_frame = DataFrame({"text": []})
        component = ParseDataFrameComponent(df=empty_frame, template="{text}", sep="\n")

        message = self._parse_to_message(component)

        assert message.text == ""

    def test_invalid_template_keys(self):
        """A placeholder that names no column raises ``KeyError``."""
        frame = DataFrame({"text": ["Hello"]})
        component = ParseDataFrameComponent(df=frame, template="{nonexistent_column}", sep="\n")

        with pytest.raises(KeyError):
            component.parse_data()

    def test_multiple_column_template(self):
        """Three columns can be combined in a single template."""
        frame = DataFrame({"col1": ["A", "B"], "col2": [1, 2], "col3": ["X", "Y"]})
        component = ParseDataFrameComponent(df=frame, template="{col1}-{col2}-{col3}", sep=", ")

        message = self._parse_to_message(component)

        assert message.text == "A-1-X, B-2-Y"

    @pytest.mark.asyncio
    async def test_async_invocation(self, component_class, default_kwargs):
        """``parse_data`` can be awaited through ``asyncio.to_thread`` from async code."""
        component = component_class(**default_kwargs)

        # parse_data is synchronous, so it is dispatched to a worker thread.
        message = await asyncio.to_thread(component.parse_data)

        assert isinstance(message, Message)

    def test_various_data_types(self, component_class):
        """String, integer, boolean and datetime columns are all rendered into the text."""
        columns = {
            "string_col": ["A", "B"],
            "int_col": [1, 2],
            "bool_col": [True, False],
            "time_col": pd.to_datetime(["2023-01-01", "2023-01-02"]),
        }
        component = component_class(
            df=DataFrame(columns),
            template="{string_col}-{int_col}-{bool_col}-{time_col}",
            sep=" | ",
        )

        message = self._parse_to_message(component)

        # Substring check only: the first row's values must appear in column order.
        assert "A-1-True-2023-01-01" in message.text

    def test_nan_values(self, component_class):
        """Rows containing None/NaN do not break parsing of the other rows."""
        columns = {
            "col1": ["Hello", None],
            "col2": [10, float("nan")],
        }
        component = component_class(df=DataFrame(columns), template="{col1}-{col2}", sep="\n")

        message = self._parse_to_message(component)

        # The exact rendering of None/NaN depends on pandas, so only the
        # fully populated row is checked, and only as a substring.
        assert "Hello-10" in message.text

    def test_large_dataframe(self, component_class):
        """A 10k-row frame is parsed and includes a row taken from the middle."""
        row_labels = [f"Row{i}" for i in range(10000)]  # 10k rows
        component = component_class(df=DataFrame({"col": row_labels}), template="{col}", sep=", ")

        message = self._parse_to_message(component)

        # Non-empty output shows the run completed; the middle row shows the content is there.
        assert len(message.text) > 0
        assert "Row5000" in message.text