feat: New parser component with multiple input types and stringify add on (#6652)
* update to parser * error handling * solve lint error and added tests * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes * Update parser.py * fix format errors * [autofix.ci] apply automated fixes * refactor: Remove hardcoded name attribute from ParserComponent * Update src/backend/base/langflow/components/processing/parser.py Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> * error fix * [autofix.ci] apply automated fixes * feat: mark ParserComponent as beta Added a beta flag to the ParserComponent to indicate its experimental status. --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> Co-authored-by: Ítalo Johnny <italojohnnydosanjos@gmail.com> Co-authored-by: Rodrigo <rodrigosilvanader@gmail.com>
This commit is contained in:
parent
94bc8dbc7d
commit
e3cf852307
3 changed files with 387 additions and 0 deletions
|
|
@ -9,6 +9,7 @@ from .merge_data import MergeDataComponent
|
|||
from .message_to_data import MessageToDataComponent
|
||||
from .parse_data import ParseDataComponent
|
||||
from .parse_json_data import ParseJSONDataComponent
|
||||
from .parser import ParserComponent
|
||||
from .regex import RegexExtractorComponent
|
||||
from .select_data import SelectDataComponent
|
||||
from .split_text import SplitTextComponent
|
||||
|
|
@ -27,6 +28,7 @@ __all__ = [
|
|||
"ParseDataComponent",
|
||||
"ParseDataFrameComponent",
|
||||
"ParseJSONDataComponent",
|
||||
"ParserComponent",
|
||||
"RegexExtractorComponent",
|
||||
"SelectDataComponent",
|
||||
"SplitTextComponent",
|
||||
|
|
|
|||
173
src/backend/base/langflow/components/processing/parser.py
Normal file
173
src/backend/base/langflow/components/processing/parser.py
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
from typing import Any
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import (
|
||||
BoolInput,
|
||||
HandleInput,
|
||||
MessageTextInput,
|
||||
MultilineInput,
|
||||
Output,
|
||||
)
|
||||
from langflow.schema import Data, DataFrame
|
||||
from langflow.schema.message import Message
|
||||
|
||||
|
||||
class ParserComponent(Component):
|
||||
display_name = "Parser"
|
||||
description = (
|
||||
"Format a DataFrame or Data object into text using a template. "
|
||||
"Enable 'Stringify' to convert input into a readable string instead."
|
||||
)
|
||||
icon = "braces"
|
||||
beta = True
|
||||
|
||||
inputs = [
|
||||
BoolInput(
|
||||
name="stringify",
|
||||
display_name="Stringify",
|
||||
info="Enable to convert input to a string instead of using a template.",
|
||||
value=False,
|
||||
real_time_refresh=True,
|
||||
),
|
||||
MultilineInput(
|
||||
name="template",
|
||||
display_name="Template",
|
||||
info=(
|
||||
"Use variables within curly brackets to extract column values for DataFrames "
|
||||
"or key values for Data."
|
||||
"For example: `Name: {Name}, Age: {Age}, Country: {Country}`"
|
||||
),
|
||||
value="Text: {text}", # Example default
|
||||
dynamic=True,
|
||||
show=True,
|
||||
required=True,
|
||||
),
|
||||
HandleInput(
|
||||
name="input_data",
|
||||
display_name="Data or DataFrame",
|
||||
input_types=["DataFrame", "Data"],
|
||||
info="Accepts either a DataFrame or a Data object.",
|
||||
required=True,
|
||||
),
|
||||
MessageTextInput(
|
||||
name="sep",
|
||||
display_name="Separator",
|
||||
advanced=True,
|
||||
value="\n",
|
||||
info="String used to separate rows/items.",
|
||||
),
|
||||
]
|
||||
|
||||
outputs = [
|
||||
Output(
|
||||
display_name="Parsed Text",
|
||||
name="parsed_text",
|
||||
info="Formatted text output.",
|
||||
method="parse_combined_text",
|
||||
),
|
||||
]
|
||||
|
||||
def update_build_config(self, build_config, field_value, field_name=None):
|
||||
"""Dynamically hide/show `template` and enforce requirement based on `stringify`."""
|
||||
if field_name == "stringify":
|
||||
build_config["template"]["show"] = not field_value
|
||||
build_config["template"]["required"] = not field_value
|
||||
if field_value:
|
||||
clean_data = BoolInput(
|
||||
name="clean_data",
|
||||
display_name="Clean Data",
|
||||
info=(
|
||||
"Enable to clean the data by removing empty rows and lines "
|
||||
"in each cell of the DataFrame/ Data object."
|
||||
),
|
||||
value=True,
|
||||
advanced=True,
|
||||
required=False,
|
||||
)
|
||||
build_config["clean_data"] = clean_data.to_dict()
|
||||
else:
|
||||
build_config.pop("clean_data", None)
|
||||
|
||||
return build_config
|
||||
|
||||
def _clean_args(self):
|
||||
"""Prepare arguments based on input type."""
|
||||
input_data = self.input_data
|
||||
|
||||
match input_data:
|
||||
case list() if all(isinstance(item, Data) for item in input_data):
|
||||
msg = "List of Data objects is not supported."
|
||||
raise ValueError(msg)
|
||||
case DataFrame():
|
||||
return input_data, None
|
||||
case Data():
|
||||
return None, input_data
|
||||
case dict() if "data" in input_data:
|
||||
try:
|
||||
if "columns" in input_data: # Likely a DataFrame
|
||||
return DataFrame.from_dict(input_data), None
|
||||
# Likely a Data object
|
||||
return None, Data(**input_data)
|
||||
except (TypeError, ValueError, KeyError) as e:
|
||||
msg = f"Invalid structured input provided: {e!s}"
|
||||
raise ValueError(msg) from e
|
||||
case _:
|
||||
msg = f"Unsupported input type: {type(input_data)}. Expected DataFrame or Data."
|
||||
raise ValueError(msg)
|
||||
|
||||
def parse_combined_text(self) -> Message:
|
||||
"""Parse all rows/items into a single text or convert input to string if `stringify` is enabled."""
|
||||
# Early return for stringify option
|
||||
if self.stringify:
|
||||
return self.convert_to_string()
|
||||
|
||||
df, data = self._clean_args()
|
||||
|
||||
lines = []
|
||||
if df is not None:
|
||||
for _, row in df.iterrows():
|
||||
formatted_text = self.template.format(**row.to_dict())
|
||||
lines.append(formatted_text)
|
||||
elif data is not None:
|
||||
formatted_text = self.template.format(text=data.get_text())
|
||||
lines.append(formatted_text)
|
||||
|
||||
combined_text = self.sep.join(lines)
|
||||
self.status = combined_text
|
||||
return Message(text=combined_text)
|
||||
|
||||
def _safe_convert(self, data: Any) -> str:
|
||||
"""Safely convert input data to string."""
|
||||
try:
|
||||
if isinstance(data, str):
|
||||
return data
|
||||
if isinstance(data, Message):
|
||||
return data.get_text()
|
||||
if isinstance(data, Data):
|
||||
if data.get_text() is None:
|
||||
msg = "Empty Data object"
|
||||
raise ValueError(msg)
|
||||
return data.get_text()
|
||||
if isinstance(data, DataFrame):
|
||||
if hasattr(self, "clean_data") and self.clean_data:
|
||||
# Remove empty rows
|
||||
data = data.dropna(how="all")
|
||||
# Remove empty lines in each cell
|
||||
data = data.replace(r"^\s*$", "", regex=True)
|
||||
# Replace multiple newlines with a single newline
|
||||
data = data.replace(r"\n+", "\n", regex=True)
|
||||
return data.to_markdown(index=False)
|
||||
return str(data)
|
||||
except (ValueError, TypeError, AttributeError) as e:
|
||||
msg = f"Error converting data: {e!s}"
|
||||
raise ValueError(msg) from e
|
||||
|
||||
def convert_to_string(self) -> Message:
|
||||
"""Convert input data to string with proper error handling."""
|
||||
result = ""
|
||||
if isinstance(self.input_data, list):
|
||||
result = "\n".join([self._safe_convert(item) for item in self.input_data])
|
||||
else:
|
||||
result = self._safe_convert(self.input_data)
|
||||
self.log(f"Converted to string with length: {len(result)}")
|
||||
return Message(text=result)
|
||||
|
|
@ -0,0 +1,212 @@
|
|||
import pytest
|
||||
from langflow.components.processing.parser import ParserComponent
|
||||
from langflow.schema import Data, DataFrame
|
||||
from langflow.schema.message import Message
|
||||
|
||||
from tests.base import ComponentTestBaseWithoutClient
|
||||
|
||||
|
||||
class TestParserComponent(ComponentTestBaseWithoutClient):
    """Unit tests for ``ParserComponent`` in template and stringify modes."""

    @pytest.fixture
    def component_class(self):
        """Return the component class to test."""
        return ParserComponent

    @pytest.fixture
    def default_kwargs(self):
        """Return the default kwargs for the component."""
        return {
            "input_data": DataFrame({"Name": ["John"], "Age": [30], "Country": ["USA"]}),
            "template": "Name: {Name}, Age: {Age}, Country: {Country}",
            "sep": "\n",
            "stringify": False,
            "clean_data": False,
        }

    @pytest.fixture
    def file_names_mapping(self):
        """Return an empty list since this component doesn't have version-specific files."""
        return []

    def test_parse_dataframe(self, component_class, default_kwargs):
        """A single-row DataFrame is rendered through the template."""
        # Arrange
        component = component_class(**default_kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert result.text == "Name: John, Age: 30, Country: USA"

    def test_parse_data_object(self, component_class):
        """A Data object's text is available to the template as ``{text}``."""
        # Arrange
        data = Data(text="Hello World")
        kwargs = {
            "input_data": data,
            "template": "text: {text}",
            "sep": "\n",
            "stringify": False,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert result.text == "text: Hello World"

    def test_stringify_dataframe(self, component_class):
        """Stringify mode renders a DataFrame as a Markdown table."""
        # Arrange
        data_frame = DataFrame({"Name": ["John", "Jane"], "Age": [30, 25]})
        kwargs = {
            "input_data": data_frame,
            "stringify": True,
            "clean_data": False,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert "| Name   |   Age |" in result.text
        assert "| John   |    30 |" in result.text
        assert "| Jane   |    25 |" in result.text

    def test_stringify_data_object(self, component_class):
        """Stringify mode returns the text of a Data object."""
        # Arrange
        data = Data(text="Hello\nWorld\nMultiline\nText")
        kwargs = {
            "input_data": data,
            "stringify": True,
            "clean_data": True,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert "Hello" in result.text
        assert "World" in result.text
        assert "Multiline" in result.text
        assert "Text" in result.text

    def test_stringify_message_object(self, component_class):
        """Stringify mode returns a Message's text unchanged."""
        # Arrange
        message = Message(text="Test message content")
        kwargs = {
            "input_data": message,
            "stringify": True,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert result.text == "Test message content"

    def test_clean_data_with_stringify(self, component_class):
        """With ``clean_data`` on, cells are cleaned before the table is rendered."""
        # Arrange
        data_frame = DataFrame(
            {"Name": ["John", "Jane\n", "\nBob"], "Age": [30, None, 25], "Notes": ["Good\n\nPerson", "", "Nice\n"]}
        )
        kwargs = {
            "input_data": data_frame,
            "stringify": True,
            "clean_data": True,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        # Check for table structure
        assert "| Name" in result.text
        assert "| Age" in result.text
        assert "| Notes" in result.text
        # Check for cleaned data
        assert "| John" in result.text
        assert "| Jane" in result.text
        assert "| Bob" in result.text
        assert "| Good" in result.text
        assert "| Person" in result.text
        assert "| Nice" in result.text
        # Verify data is cleaned
        assert "Jane\n" not in result.text
        assert "\nBob" not in result.text
        assert "Good\n\nPerson" not in result.text
        assert "Nice\n" not in result.text

    def test_invalid_input_type(self, component_class):
        """An unsupported input type raises ValueError naming the type."""
        # Arrange
        kwargs = {
            "input_data": 123,  # Invalid input type
            "template": "{value}",
            "sep": "\n",
        }
        component = component_class(**kwargs)

        # Act & Assert
        # ``match`` is a regex: escape the dots so the literal message is pinned.
        with pytest.raises(ValueError, match=r"Unsupported input type: <class 'int'>\. Expected DataFrame or Data\."):
            component.parse_combined_text()

    def test_none_input(self, component_class):
        """``None`` input raises ValueError naming ``NoneType``."""
        # Arrange
        kwargs = {
            "input_data": None,
            "template": "{value}",
            "sep": "\n",
        }
        component = component_class(**kwargs)

        # Act & Assert
        # ``match`` is a regex: escape the dots so the literal message is pinned.
        with pytest.raises(
            ValueError, match=r"Unsupported input type: <class 'NoneType'>\. Expected DataFrame or Data\."
        ):
            component.parse_combined_text()

    def test_invalid_template(self, component_class):
        """A template that names a missing column raises KeyError."""
        # Arrange
        data_frame = DataFrame({"Name": ["John"]})
        kwargs = {
            "input_data": data_frame,
            "template": "{InvalidColumn}",  # Invalid column name
            "sep": "\n",
            "stringify": False,
        }
        component = component_class(**kwargs)

        # Act & Assert
        with pytest.raises(KeyError):
            component.parse_combined_text()

    def test_multiple_rows_with_custom_separator(self, component_class):
        """Rendered rows are joined with the configured separator."""
        # Arrange
        data_frame = DataFrame(
            {
                "Name": ["John", "Jane", "Bob"],
                "Age": [30, 25, 35],
            }
        )
        kwargs = {
            "input_data": data_frame,
            "template": "{Name} is {Age} years old",
            "sep": " | ",
            "stringify": False,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        expected = "John is 30 years old | Jane is 25 years old | Bob is 35 years old"
        assert result.text == expected
|
||||
Loading…
Add table
Add a link
Reference in a new issue