diff --git a/src/backend/base/langflow/components/processing/__init__.py b/src/backend/base/langflow/components/processing/__init__.py
index 1a57c4cb5..d9e7faf1d 100644
--- a/src/backend/base/langflow/components/processing/__init__.py
+++ b/src/backend/base/langflow/components/processing/__init__.py
@@ -9,6 +9,7 @@ from .merge_data import MergeDataComponent
 from .message_to_data import MessageToDataComponent
 from .parse_data import ParseDataComponent
 from .parse_json_data import ParseJSONDataComponent
+from .parser import ParserComponent
 from .regex import RegexExtractorComponent
 from .select_data import SelectDataComponent
 from .split_text import SplitTextComponent
@@ -27,6 +28,7 @@ __all__ = [
     "ParseDataComponent",
     "ParseDataFrameComponent",
     "ParseJSONDataComponent",
+    "ParserComponent",
     "RegexExtractorComponent",
     "SelectDataComponent",
     "SplitTextComponent",
diff --git a/src/backend/base/langflow/components/processing/parser.py b/src/backend/base/langflow/components/processing/parser.py
new file mode 100644
index 000000000..62a9ebb52
--- /dev/null
+++ b/src/backend/base/langflow/components/processing/parser.py
@@ -0,0 +1,217 @@
+from typing import Any
+
+from langflow.custom import Component
+from langflow.io import (
+    BoolInput,
+    HandleInput,
+    MessageTextInput,
+    MultilineInput,
+    Output,
+)
+from langflow.schema import Data, DataFrame
+from langflow.schema.message import Message
+
+
+class ParserComponent(Component):
+    """Render a DataFrame or Data input as text.
+
+    With `stringify` disabled, `template` is formatted once per DataFrame row (or once
+    for a Data object) and the results are joined with `sep`. With `stringify` enabled,
+    the input is converted to a string directly and `template` is not used.
+    """
+
+    display_name = "Parser"
+    description = (
+        "Format a DataFrame or Data object into text using a template. "
+        "Enable 'Stringify' to convert input into a readable string instead."
+    )
+    icon = "braces"
+    beta = True
+
+    inputs = [
+        BoolInput(
+            name="stringify",
+            display_name="Stringify",
+            info="Enable to convert input to a string instead of using a template.",
+            value=False,
+            real_time_refresh=True,
+        ),
+        MultilineInput(
+            name="template",
+            display_name="Template",
+            info=(
+                "Use variables within curly brackets to extract column values for DataFrames "
+                "or key values for Data. "
+                "For example: `Name: {Name}, Age: {Age}, Country: {Country}`"
+            ),
+            value="Text: {text}",  # Example default
+            dynamic=True,
+            show=True,
+            required=True,
+        ),
+        HandleInput(
+            name="input_data",
+            display_name="Data or DataFrame",
+            input_types=["DataFrame", "Data"],
+            info="Accepts either a DataFrame or a Data object.",
+            required=True,
+        ),
+        MessageTextInput(
+            name="sep",
+            display_name="Separator",
+            advanced=True,
+            value="\n",
+            info="String used to separate rows/items.",
+        ),
+    ]
+
+    outputs = [
+        Output(
+            display_name="Parsed Text",
+            name="parsed_text",
+            info="Formatted text output.",
+            method="parse_combined_text",
+        ),
+    ]
+
+    def update_build_config(self, build_config, field_value, field_name=None):
+        """Dynamically hide/show `template` and enforce requirement based on `stringify`.
+
+        Args:
+            build_config: Mapping of field name to field config; modified in place.
+            field_value: New value of the field that changed.
+            field_name: Name of the field that changed.
+
+        Returns:
+            The same `build_config` object that was passed in.
+        """
+        if field_name == "stringify":
+            build_config["template"]["show"] = not field_value
+            build_config["template"]["required"] = not field_value
+            if field_value:
+                # `clean_data` only applies to stringify mode, so the field is added on demand.
+                clean_data = BoolInput(
+                    name="clean_data",
+                    display_name="Clean Data",
+                    info=(
+                        "Enable to clean the data by removing empty rows and lines "
+                        "in each cell of the DataFrame/ Data object."
+                    ),
+                    value=True,
+                    advanced=True,
+                    required=False,
+                )
+                build_config["clean_data"] = clean_data.to_dict()
+            else:
+                build_config.pop("clean_data", None)
+
+        return build_config
+
+    def _clean_args(self):
+        """Normalize `input_data` into a `(DataFrame | None, Data | None)` pair.
+
+        Returns:
+            `(df, None)` for DataFrame input and `(None, data)` for Data input. A dict
+            with a "data" key is rebuilt as a DataFrame when it also has "columns",
+            otherwise as a Data object.
+
+        Raises:
+            ValueError: For a list of Data objects, a dict that cannot be rebuilt, or
+                any other input type.
+        """
+        input_data = self.input_data
+
+        match input_data:
+            case list() if all(isinstance(item, Data) for item in input_data):
+                msg = "List of Data objects is not supported."
+                raise ValueError(msg)
+            case DataFrame():
+                return input_data, None
+            case Data():
+                return None, input_data
+            case dict() if "data" in input_data:
+                try:
+                    if "columns" in input_data:  # Likely a DataFrame
+                        return DataFrame.from_dict(input_data), None
+                    # Likely a Data object
+                    return None, Data(**input_data)
+                except (TypeError, ValueError, KeyError) as e:
+                    msg = f"Invalid structured input provided: {e!s}"
+                    raise ValueError(msg) from e
+            case _:
+                msg = f"Unsupported input type: {type(input_data)}. Expected DataFrame or Data."
+                raise ValueError(msg)
+
+    def parse_combined_text(self) -> Message:
+        """Parse all rows/items into a single text or convert input to string if `stringify` is enabled.
+
+        Raises:
+            KeyError: If the template references a name missing from the row/Data fields.
+            ValueError: If `input_data` is not a supported type.
+        """
+        # Early return for stringify option
+        if self.stringify:
+            return self.convert_to_string()
+
+        df, data = self._clean_args()
+
+        lines = []
+        if df is not None:
+            lines = [self.template.format(**row.to_dict()) for _, row in df.iterrows()]
+        elif data is not None:
+            # Expose every key of the Data payload to the template, as the `template` help
+            # text promises; `text` always resolves via `get_text()` so existing templates
+            # keep working. NOTE(review): assumes `Data.data` is the payload dict - confirm.
+            fields = {**data.data, "text": data.get_text()}
+            lines = [self.template.format_map(fields)]
+
+        combined_text = self.sep.join(lines)
+        self.status = combined_text
+        return Message(text=combined_text)
+
+    def _safe_convert(self, data: Any) -> str:
+        """Safely convert input data to string.
+
+        Strings are returned unchanged, Message/Data yield their text, DataFrames are
+        rendered as a markdown table, and anything else goes through `str()`.
+
+        Raises:
+            ValueError: If the conversion fails or a Data object has no text.
+        """
+        try:
+            if isinstance(data, str):
+                return data
+            if isinstance(data, Message):
+                return data.get_text()
+            if isinstance(data, Data):
+                text = data.get_text()
+                if text is None:
+                    msg = "Empty Data object"
+                    raise ValueError(msg)
+                return text
+            if isinstance(data, DataFrame):
+                # `clean_data` is added dynamically by `update_build_config`; guard for its absence.
+                if hasattr(self, "clean_data") and self.clean_data:
+                    # Remove empty rows
+                    data = data.dropna(how="all")
+                    # Remove empty lines in each cell
+                    data = data.replace(r"^\s*$", "", regex=True)
+                    # Replace multiple newlines with a single newline
+                    data = data.replace(r"\n+", "\n", regex=True)
+                return data.to_markdown(index=False)
+            return str(data)
+        except (ValueError, TypeError, AttributeError) as e:
+            msg = f"Error converting data: {e!s}"
+            raise ValueError(msg) from e
+
+    def convert_to_string(self) -> Message:
+        """Convert input data to string with proper error handling.
+
+        List inputs are converted item by item and joined with newlines.
+        """
+        if isinstance(self.input_data, list):
+            result = "\n".join(self._safe_convert(item) for item in self.input_data)
+        else:
+            result = self._safe_convert(self.input_data)
+        self.log(f"Converted to string with length: {len(result)}")
+        return Message(text=result)
diff --git a/src/backend/tests/unit/components/processing/test_parser_component.py b/src/backend/tests/unit/components/processing/test_parser_component.py
new file mode 100644
index 000000000..e663edb77
--- /dev/null
+++ b/src/backend/tests/unit/components/processing/test_parser_component.py
@@ -0,0 +1,234 @@
+import pytest
+from langflow.components.processing.parser import ParserComponent
+from langflow.schema import Data, DataFrame
+from langflow.schema.message import Message
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestParserComponent(ComponentTestBaseWithoutClient):
+    @pytest.fixture
+    def component_class(self):
+        """Return the component class to test."""
+        return ParserComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        """Return the default kwargs for the component."""
+        return {
+            "input_data": DataFrame({"Name": ["John"], "Age": [30], "Country": ["USA"]}),
+            "template": "Name: {Name}, Age: {Age}, Country: {Country}",
+            "sep": "\n",
+            "stringify": False,
+            "clean_data": False,
+        }
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        """Return an empty list since this component doesn't have version-specific files."""
+        return []
+
+    def test_parse_dataframe(self, component_class, default_kwargs):
+        # Arrange
+        component = component_class(**default_kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert result.text == "Name: John, Age: 30, Country: USA"
+
+    def test_parse_data_object(self, component_class):
+        # Arrange
+        data = Data(text="Hello World")
+        kwargs = {
+            "input_data": data,
+            "template": "text: {text}",
+            "sep": "\n",
+            "stringify": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert result.text == "text: Hello World"
+
+    def test_parse_data_object_with_custom_keys(self, component_class):
+        # Arrange
+        data = Data(data={"text": "Hello World", "Name": "John"})
+        kwargs = {
+            "input_data": data,
+            "template": "{Name}: {text}",
+            "sep": "\n",
+            "stringify": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert result.text == "John: Hello World"
+
+    def test_stringify_dataframe(self, component_class):
+        # Arrange
+        data_frame = DataFrame({"Name": ["John", "Jane"], "Age": [30, 25]})
+        kwargs = {
+            "input_data": data_frame,
+            "stringify": True,
+            "clean_data": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        # Cell padding in the markdown table varies with column width, so compare with whitespace collapsed.
+        table = " ".join(result.text.split())
+        assert "| Name | Age |" in table
+        assert "| John | 30 |" in table
+        assert "| Jane | 25 |" in table
+
+    def test_stringify_data_object(self, component_class):
+        # Arrange
+        data = Data(text="Hello\nWorld\nMultiline\nText")
+        kwargs = {
+            "input_data": data,
+            "stringify": True,
+            "clean_data": True,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert "Hello" in result.text
+        assert "World" in result.text
+        assert "Multiline" in result.text
+        assert "Text" in result.text
+
+    def test_stringify_message_object(self, component_class):
+        # Arrange
+        message = Message(text="Test message content")
+        kwargs = {
+            "input_data": message,
+            "stringify": True,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert result.text == "Test message content"
+
+    def test_clean_data_with_stringify(self, component_class):
+        # Arrange
+        data_frame = DataFrame(
+            {"Name": ["John", "Jane\n", "\nBob"], "Age": [30, None, 25], "Notes": ["Good\n\nPerson", "", "Nice\n"]}
+        )
+        kwargs = {
+            "input_data": data_frame,
+            "stringify": True,
+            "clean_data": True,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        # Cell padding varies with column width, so collapse whitespace for the positive checks.
+        table = " ".join(result.text.split())
+        # Check for table structure
+        assert "| Name" in table
+        assert "| Age" in table
+        assert "| Notes" in table
+        # Check for cleaned data
+        assert "| John" in table
+        assert "| Jane" in table
+        assert "| Bob" in table
+        assert "| Good" in table
+        assert "| Person" in table
+        assert "| Nice" in table
+        # Verify data is cleaned
+        assert "Jane\n" not in result.text
+        assert "\nBob" not in result.text
+        assert "Good\n\nPerson" not in result.text
+        assert "Nice\n" not in result.text
+
+    def test_invalid_input_type(self, component_class):
+        # Arrange
+        kwargs = {
+            "input_data": 123,  # Invalid input type
+            "template": "{value}",
+            "sep": "\n",
+        }
+        component = component_class(**kwargs)
+
+        # Act & Assert
+        with pytest.raises(ValueError, match="Unsupported input type: <class 'int'>. Expected DataFrame or Data."):
+            component.parse_combined_text()
+
+    def test_none_input(self, component_class):
+        # Arrange
+        kwargs = {
+            "input_data": None,
+            "template": "{value}",
+            "sep": "\n",
+        }
+        component = component_class(**kwargs)
+
+        # Act & Assert
+        with pytest.raises(ValueError, match="Unsupported input type: <class 'NoneType'>. Expected DataFrame or Data."):
+            component.parse_combined_text()
+
+    def test_invalid_template(self, component_class):
+        # Arrange
+        data_frame = DataFrame({"Name": ["John"]})
+        kwargs = {
+            "input_data": data_frame,
+            "template": "{InvalidColumn}",  # Invalid column name
+            "sep": "\n",
+            "stringify": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act & Assert
+        with pytest.raises(KeyError):
+            component.parse_combined_text()
+
+    def test_multiple_rows_with_custom_separator(self, component_class):
+        # Arrange
+        data_frame = DataFrame(
+            {
+                "Name": ["John", "Jane", "Bob"],
+                "Age": [30, 25, 35],
+            }
+        )
+        kwargs = {
+            "input_data": data_frame,
+            "template": "{Name} is {Age} years old",
+            "sep": " | ",
+            "stringify": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        expected = "John is 30 years old | Jane is 25 years old | Bob is 35 years old"
+        assert result.text == expected