feat: New Parser component with multiple input types and a stringify add-on (#6652)

* update to parser

* error handling

* solve lint error and added tests

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes

* Update parser.py

* fix format errors

* [autofix.ci] apply automated fixes

* refactor: Remove hardcoded name attribute from ParserComponent

* Update src/backend/base/langflow/components/processing/parser.py

Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>

* error fix

* [autofix.ci] apply automated fixes

* feat: mark ParserComponent as beta

Added a beta flag to the ParserComponent to indicate its experimental status.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: Ítalo Johnny <italojohnnydosanjos@gmail.com>
Co-authored-by: Rodrigo <rodrigosilvanader@gmail.com>
This commit is contained in:
Edwin Jose 2025-03-14 08:19:00 -04:00 committed by GitHub
commit e3cf852307
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 387 additions and 0 deletions

View file

@ -9,6 +9,7 @@ from .merge_data import MergeDataComponent
from .message_to_data import MessageToDataComponent
from .parse_data import ParseDataComponent
from .parse_json_data import ParseJSONDataComponent
from .parser import ParserComponent
from .regex import RegexExtractorComponent
from .select_data import SelectDataComponent
from .split_text import SplitTextComponent
@ -27,6 +28,7 @@ __all__ = [
"ParseDataComponent",
"ParseDataFrameComponent",
"ParseJSONDataComponent",
"ParserComponent",
"RegexExtractorComponent",
"SelectDataComponent",
"SplitTextComponent",

View file

@ -0,0 +1,173 @@
from typing import Any
from langflow.custom import Component
from langflow.io import (
BoolInput,
HandleInput,
MessageTextInput,
MultilineInput,
Output,
)
from langflow.schema import Data, DataFrame
from langflow.schema.message import Message
class ParserComponent(Component):
display_name = "Parser"
description = (
"Format a DataFrame or Data object into text using a template. "
"Enable 'Stringify' to convert input into a readable string instead."
)
icon = "braces"
beta = True
inputs = [
BoolInput(
name="stringify",
display_name="Stringify",
info="Enable to convert input to a string instead of using a template.",
value=False,
real_time_refresh=True,
),
MultilineInput(
name="template",
display_name="Template",
info=(
"Use variables within curly brackets to extract column values for DataFrames "
"or key values for Data."
"For example: `Name: {Name}, Age: {Age}, Country: {Country}`"
),
value="Text: {text}", # Example default
dynamic=True,
show=True,
required=True,
),
HandleInput(
name="input_data",
display_name="Data or DataFrame",
input_types=["DataFrame", "Data"],
info="Accepts either a DataFrame or a Data object.",
required=True,
),
MessageTextInput(
name="sep",
display_name="Separator",
advanced=True,
value="\n",
info="String used to separate rows/items.",
),
]
outputs = [
Output(
display_name="Parsed Text",
name="parsed_text",
info="Formatted text output.",
method="parse_combined_text",
),
]
def update_build_config(self, build_config, field_value, field_name=None):
"""Dynamically hide/show `template` and enforce requirement based on `stringify`."""
if field_name == "stringify":
build_config["template"]["show"] = not field_value
build_config["template"]["required"] = not field_value
if field_value:
clean_data = BoolInput(
name="clean_data",
display_name="Clean Data",
info=(
"Enable to clean the data by removing empty rows and lines "
"in each cell of the DataFrame/ Data object."
),
value=True,
advanced=True,
required=False,
)
build_config["clean_data"] = clean_data.to_dict()
else:
build_config.pop("clean_data", None)
return build_config
def _clean_args(self):
"""Prepare arguments based on input type."""
input_data = self.input_data
match input_data:
case list() if all(isinstance(item, Data) for item in input_data):
msg = "List of Data objects is not supported."
raise ValueError(msg)
case DataFrame():
return input_data, None
case Data():
return None, input_data
case dict() if "data" in input_data:
try:
if "columns" in input_data: # Likely a DataFrame
return DataFrame.from_dict(input_data), None
# Likely a Data object
return None, Data(**input_data)
except (TypeError, ValueError, KeyError) as e:
msg = f"Invalid structured input provided: {e!s}"
raise ValueError(msg) from e
case _:
msg = f"Unsupported input type: {type(input_data)}. Expected DataFrame or Data."
raise ValueError(msg)
def parse_combined_text(self) -> Message:
"""Parse all rows/items into a single text or convert input to string if `stringify` is enabled."""
# Early return for stringify option
if self.stringify:
return self.convert_to_string()
df, data = self._clean_args()
lines = []
if df is not None:
for _, row in df.iterrows():
formatted_text = self.template.format(**row.to_dict())
lines.append(formatted_text)
elif data is not None:
formatted_text = self.template.format(text=data.get_text())
lines.append(formatted_text)
combined_text = self.sep.join(lines)
self.status = combined_text
return Message(text=combined_text)
def _safe_convert(self, data: Any) -> str:
"""Safely convert input data to string."""
try:
if isinstance(data, str):
return data
if isinstance(data, Message):
return data.get_text()
if isinstance(data, Data):
if data.get_text() is None:
msg = "Empty Data object"
raise ValueError(msg)
return data.get_text()
if isinstance(data, DataFrame):
if hasattr(self, "clean_data") and self.clean_data:
# Remove empty rows
data = data.dropna(how="all")
# Remove empty lines in each cell
data = data.replace(r"^\s*$", "", regex=True)
# Replace multiple newlines with a single newline
data = data.replace(r"\n+", "\n", regex=True)
return data.to_markdown(index=False)
return str(data)
except (ValueError, TypeError, AttributeError) as e:
msg = f"Error converting data: {e!s}"
raise ValueError(msg) from e
def convert_to_string(self) -> Message:
"""Convert input data to string with proper error handling."""
result = ""
if isinstance(self.input_data, list):
result = "\n".join([self._safe_convert(item) for item in self.input_data])
else:
result = self._safe_convert(self.input_data)
self.log(f"Converted to string with length: {len(result)}")
return Message(text=result)

View file

@ -0,0 +1,212 @@
import pytest
from langflow.components.processing.parser import ParserComponent
from langflow.schema import Data, DataFrame
from langflow.schema.message import Message
from tests.base import ComponentTestBaseWithoutClient
class TestParserComponent(ComponentTestBaseWithoutClient):
    """Unit tests for ParserComponent in template mode and stringify mode."""

    @pytest.fixture
    def component_class(self):
        """Return the component class to test."""
        return ParserComponent

    @pytest.fixture
    def default_kwargs(self):
        """Return the default kwargs for the component."""
        return {
            "input_data": DataFrame({"Name": ["John"], "Age": [30], "Country": ["USA"]}),
            "template": "Name: {Name}, Age: {Age}, Country: {Country}",
            "sep": "\n",
            "stringify": False,
            "clean_data": False,
        }

    @pytest.fixture
    def file_names_mapping(self):
        """Return an empty list since this component doesn't have version-specific files."""
        return []

    def test_parse_dataframe(self, component_class, default_kwargs):
        """A single-row DataFrame is formatted with its column values."""
        # Arrange
        component = component_class(**default_kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert result.text == "Name: John, Age: 30, Country: USA"

    def test_parse_data_object(self, component_class):
        """A Data object is formatted through the `{text}` variable."""
        # Arrange
        data = Data(text="Hello World")
        kwargs = {
            "input_data": data,
            "template": "text: {text}",
            "sep": "\n",
            "stringify": False,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert result.text == "text: Hello World"

    def test_stringify_dataframe(self, component_class):
        """Stringify renders a DataFrame as a Markdown table."""
        # Arrange
        data_frame = DataFrame({"Name": ["John", "Jane"], "Age": [30, 25]})
        kwargs = {
            "input_data": data_frame,
            "stringify": True,
            "clean_data": False,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert "| Name | Age |" in result.text
        assert "| John | 30 |" in result.text
        assert "| Jane | 25 |" in result.text

    def test_stringify_data_object(self, component_class):
        """Stringify returns the text of a Data object."""
        # Arrange
        data = Data(text="Hello\nWorld\nMultiline\nText")
        kwargs = {
            "input_data": data,
            "stringify": True,
            "clean_data": True,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert "Hello" in result.text
        assert "World" in result.text
        assert "Multiline" in result.text
        assert "Text" in result.text

    def test_stringify_message_object(self, component_class):
        """Stringify returns the text of a Message unchanged."""
        # Arrange
        message = Message(text="Test message content")
        kwargs = {
            "input_data": message,
            "stringify": True,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        assert result.text == "Test message content"

    def test_clean_data_with_stringify(self, component_class):
        """Stringify with `clean_data` enabled normalises newlines in cells."""
        # Arrange
        data_frame = DataFrame(
            {
                "Name": ["John", "Jane\n", "\nBob"],
                "Age": [30, None, 25],
                "Notes": ["Good\n\nPerson", "", "Nice\n"],
            }
        )
        kwargs = {
            "input_data": data_frame,
            "stringify": True,
            "clean_data": True,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        # Check for table structure
        assert "| Name" in result.text
        assert "| Age" in result.text
        assert "| Notes" in result.text
        # Check for cleaned data
        assert "| John" in result.text
        assert "| Jane" in result.text
        assert "| Bob" in result.text
        assert "| Good" in result.text
        assert "| Person" in result.text
        assert "| Nice" in result.text
        # Verify data is cleaned
        assert "Jane\n" not in result.text
        assert "\nBob" not in result.text
        assert "Good\n\nPerson" not in result.text
        assert "Nice\n" not in result.text

    def test_invalid_input_type(self, component_class):
        """An unsupported input type raises ValueError naming the type."""
        # Arrange
        kwargs = {
            "input_data": 123,  # Invalid input type
            "template": "{value}",
            "sep": "\n",
        }
        component = component_class(**kwargs)

        # Act & Assert
        # `match` is a regex: the dots are escaped so the literal message is required.
        with pytest.raises(
            ValueError,
            match=r"Unsupported input type: <class 'int'>\. Expected DataFrame or Data\.",
        ):
            component.parse_combined_text()

    def test_none_input(self, component_class):
        """A None input raises ValueError naming NoneType."""
        # Arrange
        kwargs = {
            "input_data": None,
            "template": "{value}",
            "sep": "\n",
        }
        component = component_class(**kwargs)

        # Act & Assert
        # `match` is a regex: the dots are escaped so the literal message is required.
        with pytest.raises(
            ValueError,
            match=r"Unsupported input type: <class 'NoneType'>\. Expected DataFrame or Data\.",
        ):
            component.parse_combined_text()

    def test_invalid_template(self, component_class):
        """A template variable that is not a column raises KeyError."""
        # Arrange
        data_frame = DataFrame({"Name": ["John"]})
        kwargs = {
            "input_data": data_frame,
            "template": "{InvalidColumn}",  # Invalid column name
            "sep": "\n",
            "stringify": False,
        }
        component = component_class(**kwargs)

        # Act & Assert
        with pytest.raises(KeyError):
            component.parse_combined_text()

    def test_multiple_rows_with_custom_separator(self, component_class):
        """Rows are formatted individually and joined with the custom separator."""
        # Arrange
        data_frame = DataFrame(
            {
                "Name": ["John", "Jane", "Bob"],
                "Age": [30, 25, 35],
            }
        )
        kwargs = {
            "input_data": data_frame,
            "template": "{Name} is {Age} years old",
            "sep": " | ",
            "stringify": False,
        }
        component = component_class(**kwargs)

        # Act
        result = component.parse_combined_text()

        # Assert
        assert isinstance(result, Message)
        expected = "John is 30 years old | Jane is 25 years old | Bob is 35 years old"
        assert result.text == expected