From 2f122393ee73a08827ba329ab47841436ab773c5 Mon Sep 17 00:00:00 2001 From: Raphael Valdetaro <79842132+raphaelchristi@users.noreply.github.com> Date: Thu, 13 Mar 2025 17:57:04 -0300 Subject: [PATCH] feat: add regex pattern extractor component (#6015) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add regex pattern extractor component * [autofix.ci] apply automated fixes * fix: consistent schema and cleaner code style * fix: type annotation in regex.py * [autofix.ci] apply automated fixes * Fix: regex component unit tests to match implementation behavior --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Ítalo Johnny --- .../components/processing/__init__.py | 2 + .../langflow/components/processing/regex.py | 81 ++++++++++++++++ .../processing/test_regex_component.py | 95 +++++++++++++++++++ 3 files changed, 178 insertions(+) create mode 100644 src/backend/base/langflow/components/processing/regex.py create mode 100644 src/backend/tests/unit/components/processing/test_regex_component.py diff --git a/src/backend/base/langflow/components/processing/__init__.py b/src/backend/base/langflow/components/processing/__init__.py index 489554618..1a57c4cb5 100644 --- a/src/backend/base/langflow/components/processing/__init__.py +++ b/src/backend/base/langflow/components/processing/__init__.py @@ -9,6 +9,7 @@ from .merge_data import MergeDataComponent from .message_to_data import MessageToDataComponent from .parse_data import ParseDataComponent from .parse_json_data import ParseJSONDataComponent +from .regex import RegexExtractorComponent from .select_data import SelectDataComponent from .split_text import SplitTextComponent from .update_data import UpdateDataComponent @@ -26,6 +27,7 @@ __all__ = [ "ParseDataComponent", "ParseDataFrameComponent", "ParseJSONDataComponent", + "RegexExtractorComponent", "SelectDataComponent", "SplitTextComponent", "UpdateDataComponent", 
diff --git a/src/backend/base/langflow/components/processing/regex.py b/src/backend/base/langflow/components/processing/regex.py
new file mode 100644
--- /dev/null
+++ b/src/backend/base/langflow/components/processing/regex.py
@@ -0,0 +1,92 @@
+import re
+
+from langflow.custom import Component
+from langflow.io import MessageTextInput, Output
+from langflow.schema import Data
+from langflow.schema.message import Message
+
+
+class RegexExtractorComponent(Component):
+    """Extract every match of a regular expression from a piece of text.
+
+    The matches are exposed two ways: as a list of ``Data`` objects
+    (``extract_matches``) and as one newline-separated ``Message``
+    (``get_matches_text``).
+    """
+
+    display_name = "Regex Extractor"
+    description = "Extract patterns from text using regular expressions."
+    icon = "regex"
+
+    inputs = [
+        MessageTextInput(
+            name="input_text",
+            display_name="Input Text",
+            info="The text to analyze",
+            required=True,
+        ),
+        MessageTextInput(
+            name="pattern",
+            display_name="Regex Pattern",
+            info="The regular expression pattern to match",
+            value=r"",
+            required=True,
+            tool_mode=True,
+        ),
+    ]
+
+    outputs = [
+        Output(display_name="Data", name="data", method="extract_matches"),
+        Output(display_name="Message", name="text", method="get_matches_text"),
+    ]
+
+    def extract_matches(self) -> list[Data]:
+        """Return one ``Data`` per non-empty match of ``pattern`` in ``input_text``.
+
+        Each entry is ``{"match": value}`` where ``value`` is what ``re.findall``
+        yields: the whole match for a pattern without capture groups, the group's
+        text for a single group, or a tuple of group texts for several groups.
+
+        Returns an empty list when either input is empty or nothing matches, and
+        a single ``{"error": message}`` entry when the pattern cannot be used.
+        The result is also stored in ``self.status``.
+        """
+        if not self.pattern or not self.input_text:
+            self.status = []
+            return []
+
+        try:
+            matches = re.compile(self.pattern).findall(self.input_text)
+            # Drop empty matches (e.g. zero-width ones); they carry no text.
+            result: list[Data] = [Data(data={"match": match}) for match in matches if match]
+        except re.error as e:
+            result = [Data(data={"error": f"Invalid regex pattern: {e!s}"})]
+        except ValueError as e:
+            result = [Data(data={"error": f"Error extracting matches: {e!s}"})]
+
+        self.status = result
+        return result
+
+    def get_matches_text(self) -> Message:
+        """Return the matches as one message with a match per line.
+
+        The text is "No matches found" when there are no matches, or the error
+        message when the pattern is invalid. For patterns with several capture
+        groups, the groups of each match are joined with a single space.
+        The message is also stored in ``self.status``.
+        """
+        matches = self.extract_matches()
+
+        if not matches:
+            text = "No matches found"
+        elif "error" in matches[0].data:
+            text = matches[0].data["error"]
+        else:
+            # ``re.findall`` yields tuples rather than strings when the pattern has
+            # two or more capture groups; flatten them so ``str.join`` cannot fail.
+            values = (match.data["match"] for match in matches)
+            text = "\n".join(value if isinstance(value, str) else " ".join(value) for value in values)
+
+        message = Message(text=text)
+        self.status = message
+        return message
diff --git a/src/backend/tests/unit/components/processing/test_regex_component.py b/src/backend/tests/unit/components/processing/test_regex_component.py
new file mode 100644
--- /dev/null
+++ b/src/backend/tests/unit/components/processing/test_regex_component.py
@@ -0,0 +1,103 @@
+import pytest
+from langflow.components.processing.regex import RegexExtractorComponent
+from langflow.schema import Data
+from langflow.schema.message import Message
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestRegexExtractorComponent(ComponentTestBaseWithoutClient):
+    @pytest.fixture
+    def component_class(self):
+        """Return the component class to test."""
+        return RegexExtractorComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        """Return the default kwargs for the component."""
+        return {
+            "input_text": "Contact us at test@example.com",
+            "pattern": r"\b\w+@\w+\.\w+\b",
+        }
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        """Return an empty list since this component doesn't have version-specific files."""
+        return []
+
+    def test_successful_regex_extraction(self):
+        # Test with email pattern
+        component = RegexExtractorComponent(
+            input_text="Contact us at test@example.com or support@test.com", pattern=r"\b\w+@\w+\.\w+\b"
+        )
+
+        result = component.extract_matches()
+        assert isinstance(result, list)
+        assert all(isinstance(item, Data) for item in result)
+        assert len(result) == 2
+        assert result[0].data["match"] == "test@example.com"
+        assert result[1].data["match"] == "support@test.com"
+
+    def test_no_matches_found(self):
+        # Test with pattern that won't match
+        component = RegexExtractorComponent(input_text="No email addresses here", pattern=r"\b\w+@\w+\.\w+\b")
+
+        result = component.extract_matches()
+        assert isinstance(result, list)
+        assert len(result) == 0  # The implementation returns an empty list when no matches are found
+
+    def test_invalid_regex_pattern(self):
+        # Test with invalid regex pattern
+        component = RegexExtractorComponent(
+            input_text="Some text",
+            pattern="[",  # Invalid regex pattern
+        )
+
+        result = component.extract_matches()
+        assert isinstance(result, list)
+        assert len(result) == 1
+        assert "error" in result[0].data
+        assert "Invalid regex pattern" in result[0].data["error"]
+
+    def test_empty_input_text(self):
+        # Test with empty input
+        component = RegexExtractorComponent(input_text="", pattern=r"\b\w+@\w+\.\w+\b")
+
+        result = component.extract_matches()
+        assert isinstance(result, list)
+        assert len(result) == 0  # The implementation returns an empty list when input is empty
+
+    def test_get_matches_text_output(self):
+        # Test the text output method
+        component = RegexExtractorComponent(input_text="Contact: test@example.com", pattern=r"\b\w+@\w+\.\w+\b")
+
+        result = component.get_matches_text()
+        assert isinstance(result, Message)
+        assert result.text == "test@example.com"
+
+    def test_get_matches_text_no_matches(self):
+        # Test text output with no matches
+        component = RegexExtractorComponent(input_text="No email addresses", pattern=r"\b\w+@\w+\.\w+\b")
+
+        result = component.get_matches_text()
+        assert isinstance(result, Message)
+        assert result.text == "No matches found"
+
+    def test_get_matches_text_invalid_pattern(self):
+        # Test text output with invalid pattern
+        component = RegexExtractorComponent(
+            input_text="Some text",
+            pattern="[",  # Invalid regex pattern
+        )
+
+        result = component.get_matches_text()
+        assert isinstance(result, Message)
+        assert "Invalid regex pattern" in result.text
+
+    def test_get_matches_text_multiple_groups(self):
+        # Several capture groups make re.findall return tuples instead of strings
+        component = RegexExtractorComponent(input_text="a=1 b=2", pattern=r"(\w)=(\d)")
+
+        result = component.get_matches_text()
+        assert isinstance(result, Message)
+        assert result.text == "a 1\nb 2"