feat: New parser component with multiple input types and stringify add on (#6652)

* update to parser
* error handling
* solve lint error and added tests
* [autofix.ci] apply automated fixes
* [autofix.ci] apply automated fixes
* Update parser.py
* fix format errors
* [autofix.ci] apply automated fixes
* refactor: Remove hardcoded name attribute from ParserComponent
* Update src/backend/base/langflow/components/processing/parser.py

  Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
* error fix
* [autofix.ci] apply automated fixes
* feat: mark ParserComponent as beta

  Added a beta flag to the ParserComponent to indicate its experimental status.

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: Ítalo Johnny <italojohnnydosanjos@gmail.com>
Co-authored-by: Rodrigo <rodrigosilvanader@gmail.com>
2025-03-14 08:19:00 -04:00 · 2025-03-14 08:19:00 -04:00 · e3cf852307
commit e3cf852307
parent 94bc8dbc7d
3 changed files with 387 additions and 0 deletions
--- a/src/backend/base/langflow/components/processing/__init__.py
+++ b/src/backend/base/langflow/components/processing/__init__.py
@@ -9,6 +9,7 @@ from .merge_data import MergeDataComponent
 from .message_to_data import MessageToDataComponent
 from .parse_data import ParseDataComponent
 from .parse_json_data import ParseJSONDataComponent
+from .parser import ParserComponent
 from .regex import RegexExtractorComponent
 from .select_data import SelectDataComponent
 from .split_text import SplitTextComponent
@@ -27,6 +28,7 @@ __all__ = [
    "ParseDataComponent",
    "ParseDataFrameComponent",
    "ParseJSONDataComponent",
+    "ParserComponent",
    "RegexExtractorComponent",
    "SelectDataComponent",
    "SplitTextComponent",
--- a/src/backend/base/langflow/components/processing/parser.py
+++ b/src/backend/base/langflow/components/processing/parser.py
@@ -0,0 +1,173 @@
+from typing import Any
+
+from langflow.custom import Component
+from langflow.io import (
+    BoolInput,
+    HandleInput,
+    MessageTextInput,
+    MultilineInput,
+    Output,
+)
+from langflow.schema import Data, DataFrame
+from langflow.schema.message import Message
+
+
+class ParserComponent(Component):
+    """Turn a DataFrame or Data input into a single text Message.
+
+    In template mode (``stringify`` off) the ``template`` string is formatted once
+    per DataFrame row, or once for a Data object, and the pieces are joined with
+    ``sep``. In stringify mode the input is converted to a string directly and the
+    template is not used.
+    """
+
+    display_name = "Parser"
+    description = (
+        "Format a DataFrame or Data object into text using a template. "
+        "Enable 'Stringify' to convert input into a readable string instead."
+    )
+    icon = "braces"
+    beta = True
+
+    inputs = [
+        BoolInput(
+            name="stringify",
+            display_name="Stringify",
+            info="Enable to convert input to a string instead of using a template.",
+            value=False,
+            real_time_refresh=True,
+        ),
+        MultilineInput(
+            name="template",
+            display_name="Template",
+            info=(
+                "Use variables within curly brackets to extract column values for DataFrames "
+                "or key values for Data. "
+                "For example: `Name: {Name}, Age: {Age}, Country: {Country}`"
+            ),
+            value="Text: {text}",  # Example default
+            dynamic=True,
+            show=True,
+            required=True,
+        ),
+        HandleInput(
+            name="input_data",
+            display_name="Data or DataFrame",
+            input_types=["DataFrame", "Data"],
+            info="Accepts either a DataFrame or a Data object.",
+            required=True,
+        ),
+        MessageTextInput(
+            name="sep",
+            display_name="Separator",
+            advanced=True,
+            value="\n",
+            info="String used to separate rows/items.",
+        ),
+    ]
+
+    outputs = [
+        Output(
+            display_name="Parsed Text",
+            name="parsed_text",
+            info="Formatted text output.",
+            method="parse_combined_text",
+        ),
+    ]
+
+    def update_build_config(self, build_config, field_value, field_name=None):
+        """Adjust the build config when the `stringify` toggle changes.
+
+        Args:
+            build_config: Mutable mapping of field name to field config; updated in place.
+            field_value: New value of the field that changed.
+            field_name: Name of the field that changed. Only "stringify" is handled.
+
+        Returns:
+            The same ``build_config`` object.
+        """
+        if field_name == "stringify":
+            # The template is only meaningful (and only required) in template mode.
+            build_config["template"]["show"] = not field_value
+            build_config["template"]["required"] = not field_value
+            if field_value:
+                # `clean_data` only exists as a field while stringify is enabled.
+                clean_data = BoolInput(
+                    name="clean_data",
+                    display_name="Clean Data",
+                    info=(
+                        "Enable to clean the data by removing empty rows and lines "
+                        "in each cell of the DataFrame/ Data object."
+                    ),
+                    value=True,
+                    advanced=True,
+                    required=False,
+                )
+                build_config["clean_data"] = clean_data.to_dict()
+            else:
+                build_config.pop("clean_data", None)
+
+        return build_config
+
+    def _clean_args(self):
+        """Classify `input_data` as a DataFrame or a Data object.
+
+        Returns:
+            A ``(df, data)`` tuple in which exactly one element is not ``None``.
+
+        Raises:
+            ValueError: If the input is a list of Data objects, a dict that cannot be
+                converted, or any other unsupported type (including ``None``).
+        """
+        input_data = self.input_data
+
+        match input_data:
+            case list() if all(isinstance(item, Data) for item in input_data):
+                msg = "List of Data objects is not supported."
+                raise ValueError(msg)
+            case DataFrame():
+                return input_data, None
+            case Data():
+                return None, input_data
+            case dict() if "data" in input_data:
+                try:
+                    if "columns" in input_data:  # Likely a DataFrame
+                        return DataFrame.from_dict(input_data), None
+                    # Likely a Data object
+                    return None, Data(**input_data)
+                except (TypeError, ValueError, KeyError) as e:
+                    msg = f"Invalid structured input provided: {e!s}"
+                    raise ValueError(msg) from e
+            case _:
+                msg = f"Unsupported input type: {type(input_data)}. Expected DataFrame or Data."
+                raise ValueError(msg)
+
+    def parse_combined_text(self) -> Message:
+        """Format the input with `template`, or stringify it when `stringify` is enabled.
+
+        Returns:
+            A Message whose text is the formatted rows/items joined with ``sep``
+            (template mode) or the direct string conversion (stringify mode).
+
+        Raises:
+            ValueError: If the input type is unsupported (see ``_clean_args``).
+            KeyError: If the template references a column/key the input does not have.
+        """
+        # Early return for stringify option
+        if self.stringify:
+            return self.convert_to_string()
+
+        df, data = self._clean_args()
+
+        lines = []
+        if df is not None:
+            lines = [self.template.format(**row.to_dict()) for _, row in df.iterrows()]
+        elif data is not None:
+            # Expose every string key of the Data payload to the template, as the
+            # template help text promises. Non-string keys cannot be keyword
+            # arguments, so they are skipped. `text` always resolves to get_text(),
+            # which keeps existing `{text}` templates working unchanged.
+            fields = {key: value for key, value in (data.data or {}).items() if isinstance(key, str)}
+            fields["text"] = data.get_text()
+            lines = [self.template.format(**fields)]
+
+        combined_text = self.sep.join(lines)
+        self.status = combined_text
+        return Message(text=combined_text)
+
+    def _safe_convert(self, data: Any) -> str:
+        """Convert a single input value to a string.
+
+        Strings are returned as-is, Message and Data yield their text, a DataFrame
+        is rendered as a markdown table, and anything else goes through ``str()``.
+
+        Raises:
+            ValueError: If conversion fails, including a Data object whose text is
+                ``None``. The original error is chained as the cause.
+        """
+        try:
+            if isinstance(data, str):
+                return data
+            if isinstance(data, Message):
+                return data.get_text()
+            if isinstance(data, Data):
+                text = data.get_text()
+                if text is None:
+                    msg = "Empty Data object"
+                    raise ValueError(msg)
+                return text
+            if isinstance(data, DataFrame):
+                # `clean_data` is only added to the config while stringify is on,
+                # so the attribute may be absent.
+                if getattr(self, "clean_data", False):
+                    # Remove rows in which every value is missing
+                    data = data.dropna(how="all")
+                    # Blank out cells that contain only whitespace
+                    data = data.replace(r"^\s*$", "", regex=True)
+                    # Replace multiple newlines with a single newline
+                    data = data.replace(r"\n+", "\n", regex=True)
+                return data.to_markdown(index=False)
+            return str(data)
+        except (ValueError, TypeError, AttributeError) as e:
+            msg = f"Error converting data: {e!s}"
+            raise ValueError(msg) from e
+
+    def convert_to_string(self) -> Message:
+        """Convert `input_data` to a string Message.
+
+        A list input is converted item by item and joined with newlines; any other
+        input is converted as a single value.
+
+        Raises:
+            ValueError: Propagated from ``_safe_convert`` when an item cannot be converted.
+        """
+        if isinstance(self.input_data, list):
+            result = "\n".join([self._safe_convert(item) for item in self.input_data])
+        else:
+            result = self._safe_convert(self.input_data)
+        self.log(f"Converted to string with length: {len(result)}")
+        return Message(text=result)
--- a/src/backend/tests/unit/components/processing/test_parser_component.py
+++ b/src/backend/tests/unit/components/processing/test_parser_component.py
@@ -0,0 +1,212 @@
+import pytest
+from langflow.components.processing.parser import ParserComponent
+from langflow.schema import Data, DataFrame
+from langflow.schema.message import Message
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestParserComponent(ComponentTestBaseWithoutClient):
+    """Unit tests for ParserComponent in template mode and stringify mode."""
+
+    @pytest.fixture
+    def component_class(self):
+        """Return the component class to test."""
+        return ParserComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        """Return the default kwargs for the component."""
+        return {
+            "input_data": DataFrame({"Name": ["John"], "Age": [30], "Country": ["USA"]}),
+            "template": "Name: {Name}, Age: {Age}, Country: {Country}",
+            "sep": "\n",
+            "stringify": False,
+            "clean_data": False,
+        }
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        """Return an empty list since this component doesn't have version-specific files."""
+        return []
+
+    def test_parse_dataframe(self, component_class, default_kwargs):
+        """A single-row DataFrame is rendered through the template."""
+        # Arrange
+        component = component_class(**default_kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert result.text == "Name: John, Age: 30, Country: USA"
+
+    def test_parse_data_object(self, component_class):
+        """A Data object fills the `{text}` placeholder with its text."""
+        # Arrange
+        data = Data(text="Hello World")
+        kwargs = {
+            "input_data": data,
+            "template": "text: {text}",
+            "sep": "\n",
+            "stringify": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert result.text == "text: Hello World"
+
+    def test_stringify_dataframe(self, component_class):
+        """Stringifying a DataFrame yields a markdown table."""
+        # Arrange
+        data_frame = DataFrame({"Name": ["John", "Jane"], "Age": [30, 25]})
+        kwargs = {
+            "input_data": data_frame,
+            "stringify": True,
+            "clean_data": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert "| Name   |   Age |" in result.text
+        assert "| John   |    30 |" in result.text
+        assert "| Jane   |    25 |" in result.text
+
+    def test_stringify_data_object(self, component_class):
+        """Stringifying a Data object keeps every line of its text."""
+        # Arrange
+        data = Data(text="Hello\nWorld\nMultiline\nText")
+        kwargs = {
+            "input_data": data,
+            "stringify": True,
+            "clean_data": True,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert "Hello" in result.text
+        assert "World" in result.text
+        assert "Multiline" in result.text
+        assert "Text" in result.text
+
+    def test_stringify_message_object(self, component_class):
+        """Stringifying a Message returns its text unchanged."""
+        # Arrange
+        message = Message(text="Test message content")
+        kwargs = {
+            "input_data": message,
+            "stringify": True,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        assert result.text == "Test message content"
+
+    def test_clean_data_with_stringify(self, component_class):
+        """With `clean_data` enabled, stray newlines are cleaned from the table output."""
+        # Arrange
+        data_frame = DataFrame(
+            {"Name": ["John", "Jane\n", "\nBob"], "Age": [30, None, 25], "Notes": ["Good\n\nPerson", "", "Nice\n"]}
+        )
+        kwargs = {
+            "input_data": data_frame,
+            "stringify": True,
+            "clean_data": True,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        # Check for table structure
+        assert "| Name" in result.text
+        assert "|   Age" in result.text
+        assert "| Notes" in result.text
+        # Check for cleaned data
+        assert "| John" in result.text
+        assert "| Jane" in result.text
+        assert "| Bob" in result.text
+        assert "| Good" in result.text
+        assert "| Person" in result.text
+        assert "| Nice" in result.text
+        # Verify data is cleaned
+        assert "Jane\n" not in result.text
+        assert "\nBob" not in result.text
+        assert "Good\n\nPerson" not in result.text
+        assert "Nice\n" not in result.text
+
+    def test_invalid_input_type(self, component_class):
+        """An input that is neither DataFrame nor Data is rejected in template mode."""
+        # Arrange
+        kwargs = {
+            "input_data": 123,  # Invalid input type
+            "template": "{value}",
+            "sep": "\n",
+        }
+        component = component_class(**kwargs)
+
+        # Act & Assert
+        # `match` is a regular expression, so the literal dots are escaped.
+        with pytest.raises(ValueError, match=r"Unsupported input type: <class 'int'>\. Expected DataFrame or Data\."):
+            component.parse_combined_text()
+
+    def test_none_input(self, component_class):
+        """A `None` input is rejected in template mode."""
+        # Arrange
+        kwargs = {
+            "input_data": None,
+            "template": "{value}",
+            "sep": "\n",
+        }
+        component = component_class(**kwargs)
+
+        # Act & Assert
+        # `match` is a regular expression, so the literal dots are escaped.
+        with pytest.raises(
+            ValueError, match=r"Unsupported input type: <class 'NoneType'>\. Expected DataFrame or Data\."
+        ):
+            component.parse_combined_text()
+
+    def test_invalid_template(self, component_class):
+        """A placeholder that names a missing column raises KeyError."""
+        # Arrange
+        data_frame = DataFrame({"Name": ["John"]})
+        kwargs = {
+            "input_data": data_frame,
+            "template": "{InvalidColumn}",  # Invalid column name
+            "sep": "\n",
+            "stringify": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act & Assert
+        with pytest.raises(KeyError):
+            component.parse_combined_text()
+
+    def test_multiple_rows_with_custom_separator(self, component_class):
+        """Each row is formatted separately and the results are joined with `sep`."""
+        # Arrange
+        data_frame = DataFrame(
+            {
+                "Name": ["John", "Jane", "Bob"],
+                "Age": [30, 25, 35],
+            }
+        )
+        kwargs = {
+            "input_data": data_frame,
+            "template": "{Name} is {Age} years old",
+            "sep": " | ",
+            "stringify": False,
+        }
+        component = component_class(**kwargs)
+
+        # Act
+        result = component.parse_combined_text()
+
+        # Assert
+        assert isinstance(result, Message)
+        expected = "John is 30 years old | Jane is 25 years old | Bob is 35 years old"
+        assert result.text == expected