feat: add regex pattern extractor component (#6015)

* feat: add regex pattern extractor component

* [autofix.ci] apply automated fixes

* fix: consistent schema and cleaner code style

* fix: type annotation in regex.py

* [autofix.ci] apply automated fixes

* fix: update regex component unit tests to match implementation behavior

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Ítalo Johnny <italojohnnydosanjos@gmail.com>
This commit is contained in:
Raphael Valdetaro 2025-03-13 17:57:04 -03:00 committed by GitHub
commit 2f122393ee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 178 additions and 0 deletions

View file

@ -9,6 +9,7 @@ from .merge_data import MergeDataComponent
from .message_to_data import MessageToDataComponent
from .parse_data import ParseDataComponent
from .parse_json_data import ParseJSONDataComponent
from .regex import RegexExtractorComponent
from .select_data import SelectDataComponent
from .split_text import SplitTextComponent
from .update_data import UpdateDataComponent
@ -26,6 +27,7 @@ __all__ = [
"ParseDataComponent",
"ParseDataFrameComponent",
"ParseJSONDataComponent",
"RegexExtractorComponent",
"SelectDataComponent",
"SplitTextComponent",
"UpdateDataComponent",

View file

@ -0,0 +1,81 @@
import re
from langflow.custom import Component
from langflow.io import MessageTextInput, Output
from langflow.schema import Data
from langflow.schema.message import Message
class RegexExtractorComponent(Component):
    """Component that extracts regular-expression matches from a text input.

    It exposes two outputs computed from the same inputs:

    * ``data`` (``extract_matches``): one ``Data`` object per non-empty match.
    * ``text`` (``get_matches_text``): a ``Message`` with one match per line.
    """

    display_name = "Regex Extractor"
    description = "Extract patterns from text using regular expressions."
    icon = "regex"

    inputs = [
        MessageTextInput(
            name="input_text",
            display_name="Input Text",
            info="The text to analyze",
            required=True,
        ),
        MessageTextInput(
            name="pattern",
            display_name="Regex Pattern",
            info="The regular expression pattern to match",
            value=r"",
            required=True,
            tool_mode=True,
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="extract_matches"),
        Output(display_name="Message", name="text", method="get_matches_text"),
    ]

    @staticmethod
    def _match_to_text(match) -> str:
        """Render a single ``findall`` result as text.

        ``re.findall`` yields plain strings for patterns with zero or one
        capture group, and tuples of strings for patterns with two or more
        groups. Tuples are rendered as their groups joined by ``", "`` so a
        multi-group match still occupies a single output line.
        """
        if isinstance(match, str):
            return match
        return ", ".join(str(group) for group in match)

    def extract_matches(self) -> list[Data]:
        """Find every non-empty match of ``self.pattern`` in ``self.input_text``.

        Returns:
            A list of ``Data`` objects, also stored in ``self.status``:

            * ``[]`` when the pattern or the input text is empty, or when
              nothing matches;
            * one ``Data(data={"match": ...})`` per non-empty match otherwise
              (the value is whatever ``re.findall`` produced: a string, or a
              tuple of strings for multi-group patterns);
            * a single ``Data(data={"error": ...})`` when the pattern is not a
              valid regular expression or matching raises ``ValueError``.
        """
        if not self.pattern or not self.input_text:
            self.status = []
            return []

        result: list[Data]
        try:
            # Only compiling and matching can raise, so keep the try body minimal.
            found = re.compile(self.pattern).findall(self.input_text)
        except re.error as e:
            result = [Data(data={"error": f"Invalid regex pattern: {e!s}"})]
        except ValueError as e:
            result = [Data(data={"error": f"Error extracting matches: {e!s}"})]
        else:
            # Patterns that can match the empty string produce "" entries; drop them.
            result = [Data(data={"match": match}) for match in found if match]

        self.status = result
        return result

    def get_matches_text(self) -> Message:
        """Get matches as a formatted text message.

        Returns:
            A ``Message``, also stored in ``self.status``, whose text is:

            * ``"No matches found"`` when ``extract_matches`` returns nothing;
            * the error text when ``extract_matches`` reports an error;
            * otherwise the matches joined by newlines, one match per line.
        """
        matches = self.extract_matches()

        if not matches:
            text = "No matches found"
        elif "error" in matches[0].data:
            text = matches[0].data["error"]
        else:
            # Multi-group patterns give tuple matches, which str.join would
            # reject with a TypeError; convert every match to text first.
            text = "\n".join(self._match_to_text(item.data["match"]) for item in matches)

        message = Message(text=text)
        self.status = message
        return message

View file

@ -0,0 +1,95 @@
import pytest
from langflow.components.processing.regex import RegexExtractorComponent
from langflow.schema import Data
from langflow.schema.message import Message
from tests.base import ComponentTestBaseWithoutClient
class TestRegexExtractorComponent(ComponentTestBaseWithoutClient):
    """Unit tests covering both outputs of ``RegexExtractorComponent``."""

    @pytest.fixture
    def component_class(self):
        """Provide the component class under test."""
        return RegexExtractorComponent

    @pytest.fixture
    def default_kwargs(self):
        """Provide the default keyword arguments used to build the component."""
        kwargs = {
            "input_text": "Contact us at test@example.com",
            "pattern": r"\b\w+@\w+\.\w+\b",
        }
        return kwargs

    @pytest.fixture
    def file_names_mapping(self):
        """Provide an empty mapping: this component has no version-specific files."""
        return []

    def test_successful_regex_extraction(self):
        # Two e-mail addresses in the text should yield two Data items, in order.
        email_pattern = r"\b\w+@\w+\.\w+\b"
        text = "Contact us at test@example.com or support@test.com"
        extractor = RegexExtractorComponent(input_text=text, pattern=email_pattern)

        found = extractor.extract_matches()

        assert isinstance(found, list)
        assert all(isinstance(entry, Data) for entry in found)
        assert [entry.data["match"] for entry in found] == ["test@example.com", "support@test.com"]

    def test_no_matches_found(self):
        # A pattern that matches nothing produces an empty list.
        email_pattern = r"\b\w+@\w+\.\w+\b"
        extractor = RegexExtractorComponent(input_text="No email addresses here", pattern=email_pattern)

        found = extractor.extract_matches()

        assert isinstance(found, list)
        assert found == []

    def test_invalid_regex_pattern(self):
        # An unterminated character class is reported as a single error entry.
        broken_pattern = "["
        extractor = RegexExtractorComponent(input_text="Some text", pattern=broken_pattern)

        found = extractor.extract_matches()

        assert isinstance(found, list)
        assert len(found) == 1
        payload = found[0].data
        assert "error" in payload
        assert "Invalid regex pattern" in payload["error"]

    def test_empty_input_text(self):
        # Empty input text short-circuits to an empty list.
        email_pattern = r"\b\w+@\w+\.\w+\b"
        extractor = RegexExtractorComponent(input_text="", pattern=email_pattern)

        found = extractor.extract_matches()

        assert isinstance(found, list)
        assert found == []

    def test_get_matches_text_output(self):
        # The text output carries the matched address.
        email_pattern = r"\b\w+@\w+\.\w+\b"
        extractor = RegexExtractorComponent(input_text="Contact: test@example.com", pattern=email_pattern)

        message = extractor.get_matches_text()

        assert isinstance(message, Message)
        assert message.text == "test@example.com"

    def test_get_matches_text_no_matches(self):
        # With nothing to match, the text output is the fixed placeholder.
        email_pattern = r"\b\w+@\w+\.\w+\b"
        extractor = RegexExtractorComponent(input_text="No email addresses", pattern=email_pattern)

        message = extractor.get_matches_text()

        assert isinstance(message, Message)
        assert message.text == "No matches found"

    def test_get_matches_text_invalid_pattern(self):
        # An invalid pattern surfaces its error through the text output.
        broken_pattern = "["
        extractor = RegexExtractorComponent(input_text="Some text", pattern=broken_pattern)

        message = extractor.get_matches_text()

        assert isinstance(message, Message)
        assert "Invalid regex pattern" in message.text