feat: add regex pattern extractor component (#6015)
* feat: add regex pattern extractor component
* [autofix.ci] apply automated fixes
* fix: consistent schema and cleaner code style
* fix: type annotation in regex.py
* [autofix.ci] apply automated fixes
* Fix: regex component unit tests to match implementation behavior

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Ítalo Johnny <italojohnnydosanjos@gmail.com>
This commit is contained in:
parent
0e069ab861
commit
2f122393ee
3 changed files with 178 additions and 0 deletions
|
|
@ -9,6 +9,7 @@ from .merge_data import MergeDataComponent
|
|||
from .message_to_data import MessageToDataComponent
|
||||
from .parse_data import ParseDataComponent
|
||||
from .parse_json_data import ParseJSONDataComponent
|
||||
from .regex import RegexExtractorComponent
|
||||
from .select_data import SelectDataComponent
|
||||
from .split_text import SplitTextComponent
|
||||
from .update_data import UpdateDataComponent
|
||||
|
|
@ -26,6 +27,7 @@ __all__ = [
|
|||
"ParseDataComponent",
|
||||
"ParseDataFrameComponent",
|
||||
"ParseJSONDataComponent",
|
||||
"RegexExtractorComponent",
|
||||
"SelectDataComponent",
|
||||
"SplitTextComponent",
|
||||
"UpdateDataComponent",
|
||||
|
|
|
|||
81
src/backend/base/langflow/components/processing/regex.py
Normal file
81
src/backend/base/langflow/components/processing/regex.py
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
import re
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import MessageTextInput, Output
|
||||
from langflow.schema import Data
|
||||
from langflow.schema.message import Message
|
||||
|
||||
|
||||
class RegexExtractorComponent(Component):
    """Extract every match of a regular expression from a piece of text.

    The component exposes two outputs computed from the same inputs:

    * ``data`` (``extract_matches``): a list of ``Data`` records, one per match.
    * ``text`` (``get_matches_text``): a single ``Message`` with one match per line.
    """

    display_name = "Regex Extractor"
    description = "Extract patterns from text using regular expressions."
    icon = "regex"

    inputs = [
        MessageTextInput(
            name="input_text",
            display_name="Input Text",
            info="The text to analyze",
            required=True,
        ),
        MessageTextInput(
            name="pattern",
            display_name="Regex Pattern",
            info="The regular expression pattern to match",
            value=r"",
            required=True,
            tool_mode=True,
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="extract_matches"),
        Output(display_name="Message", name="text", method="get_matches_text"),
    ]

    def extract_matches(self) -> list[Data]:
        """Return one ``Data`` record per non-empty match of ``pattern`` in ``input_text``.

        Returns:
            * ``[]`` when either input is empty or when nothing matches.
            * ``[Data(data={"match": m}), ...]`` for each truthy item produced by
              ``re.findall``. ``m`` is a string for patterns with zero or one
              capture group and a tuple of strings for patterns with several.
            * ``[Data(data={"error": message})]`` when the pattern does not compile
              or matching raises ``ValueError``.

        The returned list is also stored in ``self.status``.
        """
        if not self.pattern or not self.input_text:
            self.status = []
            return []

        result: list[Data]
        try:
            compiled = re.compile(self.pattern)
            found = compiled.findall(self.input_text)
            # Drop empty matches (e.g. the "" produced by zero-width matches).
            result = [Data(data={"match": match}) for match in found if match]
        except re.error as e:
            result = [Data(data={"error": f"Invalid regex pattern: {e!s}"})]
        except ValueError as e:
            result = [Data(data={"error": f"Error extracting matches: {e!s}"})]

        self.status = result
        return result

    def get_matches_text(self) -> Message:
        """Return the matches as a newline-separated text ``Message``.

        Returns:
            * ``"No matches found"`` when ``extract_matches`` yields nothing.
            * The error text when ``extract_matches`` reports an error record.
            * Otherwise one line per match. A match that is a tuple of capture
              groups (patterns with two or more groups) is rendered as its groups
              joined by ``", "``.

        The returned message is also stored in ``self.status``.
        """
        matches = self.extract_matches()

        if not matches:
            text = "No matches found"
        elif "error" in matches[0].data:
            text = matches[0].data["error"]
        else:
            lines = []
            for item in matches:
                value = item.data["match"]
                # re.findall yields tuples for multi-group patterns; str.join only
                # accepts strings, so flatten those before building the message.
                lines.append(value if isinstance(value, str) else ", ".join(value))
            text = "\n".join(lines)

        message = Message(text=text)
        self.status = message
        return message
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
import pytest
|
||||
from langflow.components.processing.regex import RegexExtractorComponent
|
||||
from langflow.schema import Data
|
||||
from langflow.schema.message import Message
|
||||
|
||||
from tests.base import ComponentTestBaseWithoutClient
|
||||
|
||||
|
||||
class TestRegexExtractorComponent(ComponentTestBaseWithoutClient):
    """Unit tests covering both outputs of ``RegexExtractorComponent``."""

    @pytest.fixture
    def component_class(self):
        """Provide the component class under test."""
        return RegexExtractorComponent

    @pytest.fixture
    def default_kwargs(self):
        """Provide default constructor kwargs for the component."""
        return {"input_text": "Contact us at test@example.com", "pattern": r"\b\w+@\w+\.\w+\b"}

    @pytest.fixture
    def file_names_mapping(self):
        """Provide an empty mapping: the component has no version-specific files."""
        return []

    def test_successful_regex_extraction(self):
        # Two e-mail addresses in the text -> two Data records, in order.
        extractor = RegexExtractorComponent(
            input_text="Contact us at test@example.com or support@test.com",
            pattern=r"\b\w+@\w+\.\w+\b",
        )

        records = extractor.extract_matches()

        assert isinstance(records, list)
        assert all(isinstance(record, Data) for record in records)
        assert len(records) == 2
        first, second = records
        assert first.data["match"] == "test@example.com"
        assert second.data["match"] == "support@test.com"

    def test_no_matches_found(self):
        # The pattern cannot match this text, so the result is an empty list.
        extractor = RegexExtractorComponent(
            input_text="No email addresses here",
            pattern=r"\b\w+@\w+\.\w+\b",
        )

        records = extractor.extract_matches()

        assert isinstance(records, list)
        assert len(records) == 0

    def test_invalid_regex_pattern(self):
        # "[" does not compile; the component reports it as a single error record.
        extractor = RegexExtractorComponent(input_text="Some text", pattern="[")

        records = extractor.extract_matches()

        assert isinstance(records, list)
        assert len(records) == 1
        payload = records[0].data
        assert "error" in payload
        assert "Invalid regex pattern" in payload["error"]

    def test_empty_input_text(self):
        # Empty input short-circuits to an empty list.
        extractor = RegexExtractorComponent(input_text="", pattern=r"\b\w+@\w+\.\w+\b")

        records = extractor.extract_matches()

        assert isinstance(records, list)
        assert len(records) == 0

    def test_get_matches_text_output(self):
        # The text output carries the single match verbatim.
        extractor = RegexExtractorComponent(
            input_text="Contact: test@example.com",
            pattern=r"\b\w+@\w+\.\w+\b",
        )

        message = extractor.get_matches_text()

        assert isinstance(message, Message)
        assert message.text == "test@example.com"

    def test_get_matches_text_no_matches(self):
        # With nothing to report, the text output is a fixed placeholder.
        extractor = RegexExtractorComponent(
            input_text="No email addresses",
            pattern=r"\b\w+@\w+\.\w+\b",
        )

        message = extractor.get_matches_text()

        assert isinstance(message, Message)
        assert message.text == "No matches found"

    def test_get_matches_text_invalid_pattern(self):
        # An invalid pattern surfaces its error text through the message output.
        extractor = RegexExtractorComponent(input_text="Some text", pattern="[")

        message = extractor.get_matches_text()

        assert isinstance(message, Message)
        assert "Invalid regex pattern" in message.text
|
||||
Loading…
Add table
Add a link
Reference in a new issue