feat: improve gitloader component (#5351)
* feat: improve gitloader component * [autofix.ci] apply automated fixes * feat(git.py): refactor GitLoaderComponent to support asynchronous operations and improve temporary directory management - Convert methods to async to enhance performance with file operations. - Implement async context manager for handling temporary clone directories. - Update binary file check and content filtering to be asynchronous. * fix(git.py): enhance GitLoaderComponent with improved file filtering and binary check - Refactor file filtering logic to utilize fnmatch for pattern matching. - Introduce a new method to check for binary files based on null byte detection. - Update content filtering to handle exceptions more gracefully. - Modify temporary directory cleanup to use rmdir instead of remove for better directory management. - Adjust load_documents method to utilize asyncio.to_thread for lazy loading of documents. * refactor(git.py): enhance GitLoaderComponent with improved file filtering and binary check - Refactor binary file check to be synchronous and handle exceptions more gracefully. - Introduce new methods for checking file patterns and content patterns, allowing for more flexible filtering. - Consolidate filtering logic into a single method for better maintainability. - Update load_documents method to run lazy loading in a separate thread for improved performance. * feat(tests): add unit tests for GitLoaderComponent file filtering and binary detection - Introduced new test suite for GitLoaderComponent, including tests for binary file detection and file pattern matching. - Implemented temporary file creation for testing various file types and permissions. - Added tests for combined filtering functionality, ensuring robust handling of file and content patterns. * refactor(git.py): improve pattern handling and content filtering in GitLoaderComponent - Refactored pattern handling to use a more descriptive variable name `pattern_list` for clarity. - Enhanced content filtering by ensuring proper encoding when reading file content. - Updated regex validation to include a test string check for better error handling. - Removed unnecessary comments to streamline the code and improve readability. * fix: exception TypeError TypeError: object async_generator can't be used in 'await' expression --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org> Co-authored-by: italojohnny <italojohnnydosanjos@gmail.com>
This commit is contained in:
parent
8ea4405010
commit
70eaacfd36
3 changed files with 332 additions and 49 deletions
|
|
@ -1,33 +1,50 @@
|
|||
import re
|
||||
import tempfile
|
||||
from contextlib import asynccontextmanager
|
||||
from fnmatch import fnmatch
|
||||
from pathlib import Path
|
||||
|
||||
import anyio
|
||||
from langchain_community.document_loaders.git import GitLoader
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import MessageTextInput, Output
|
||||
from langflow.io import DropdownInput, MessageTextInput, Output
|
||||
from langflow.schema import Data
|
||||
|
||||
|
||||
class GitLoaderComponent(Component):
|
||||
display_name = "GitLoader"
|
||||
description = "Load files from a Git repository"
|
||||
documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
|
||||
display_name = "Git"
|
||||
description = (
|
||||
"Load and filter documents from a local or remote Git repository. "
|
||||
"Use a local repo path or clone from a remote URL."
|
||||
)
|
||||
trace_type = "tool"
|
||||
icon = "GitLoader"
|
||||
name = "GitLoader"
|
||||
|
||||
inputs = [
|
||||
DropdownInput(
|
||||
name="repo_source",
|
||||
display_name="Repository Source",
|
||||
options=["Local", "Remote"],
|
||||
required=True,
|
||||
info="Select whether to use a local repo path or clone from a remote URL.",
|
||||
real_time_refresh=True,
|
||||
),
|
||||
MessageTextInput(
|
||||
name="repo_path",
|
||||
display_name="Repository Path",
|
||||
display_name="Local Repository Path",
|
||||
required=False,
|
||||
info="The local path to the Git repository.",
|
||||
info="The local path to the existing Git repository (used if 'Local' is selected).",
|
||||
dynamic=True,
|
||||
show=False,
|
||||
),
|
||||
MessageTextInput(
|
||||
name="clone_url",
|
||||
display_name="Clone URL",
|
||||
required=False,
|
||||
info="The URL to clone the Git repository from.",
|
||||
info="The URL of the Git repository to clone (used if 'Clone' is selected).",
|
||||
dynamic=True,
|
||||
show=False,
|
||||
),
|
||||
MessageTextInput(
|
||||
name="branch",
|
||||
|
|
@ -41,8 +58,12 @@ class GitLoaderComponent(Component):
|
|||
display_name="File Filter",
|
||||
required=False,
|
||||
advanced=True,
|
||||
info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
|
||||
"Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
|
||||
info=(
|
||||
"Patterns to filter files. For example:\n"
|
||||
"Include only .py files: '*.py'\n"
|
||||
"Exclude .py files: '!*.py'\n"
|
||||
"Multiple patterns can be separated by commas."
|
||||
),
|
||||
),
|
||||
MessageTextInput(
|
||||
name="content_filter",
|
||||
|
|
@ -58,57 +79,184 @@ class GitLoaderComponent(Component):
|
|||
]
|
||||
|
||||
@staticmethod
|
||||
def is_binary(file_path: str) -> bool:
|
||||
"""Check if a file is binary by looking for null bytes.
|
||||
def is_binary(file_path: str | Path) -> bool:
|
||||
"""Check if a file is binary by looking for null bytes."""
|
||||
try:
|
||||
with Path(file_path).open("rb") as file:
|
||||
content = file.read(1024)
|
||||
return b"\x00" in content
|
||||
except Exception: # noqa: BLE001
|
||||
return True
|
||||
|
||||
This is necessary because when searches are performed using
|
||||
the content_filter, binary files need to be ignored.
|
||||
@staticmethod
|
||||
def check_file_patterns(file_path: str | Path, patterns: str) -> bool:
|
||||
"""Check if a file matches the given patterns.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to check
|
||||
patterns: Comma-separated list of glob patterns
|
||||
|
||||
Returns:
|
||||
bool: True if file should be included, False if excluded
|
||||
"""
|
||||
with Path(file_path).open("rb") as file:
|
||||
return b"\x00" in file.read(1024)
|
||||
# Handle empty or whitespace-only patterns
|
||||
if not patterns or patterns.isspace():
|
||||
return True
|
||||
|
||||
def build_gitloader(self) -> GitLoader:
|
||||
path_str = str(file_path)
|
||||
file_name = Path(path_str).name
|
||||
pattern_list: list[str] = [pattern.strip() for pattern in patterns.split(",") if pattern.strip()]
|
||||
|
||||
# If no valid patterns after stripping, treat as include all
|
||||
if not pattern_list:
|
||||
return True
|
||||
|
||||
# Process exclusion patterns first
|
||||
for pattern in pattern_list:
|
||||
if pattern.startswith("!"):
|
||||
# For exclusions, match against both full path and filename
|
||||
exclude_pattern = pattern[1:]
|
||||
if fnmatch(path_str, exclude_pattern) or fnmatch(file_name, exclude_pattern):
|
||||
return False
|
||||
|
||||
# Then check inclusion patterns
|
||||
include_patterns = [p for p in pattern_list if not p.startswith("!")]
|
||||
# If no include patterns, treat as include all
|
||||
if not include_patterns:
|
||||
return True
|
||||
|
||||
# For inclusions, match against both full path and filename
|
||||
return any(fnmatch(path_str, pattern) or fnmatch(file_name, pattern) for pattern in include_patterns)
|
||||
|
||||
@staticmethod
|
||||
def check_content_pattern(file_path: str | Path, pattern: str) -> bool:
|
||||
"""Check if file content matches the given regex pattern.
|
||||
|
||||
Args:
|
||||
file_path: Path to the file to check
|
||||
pattern: Regex pattern to match against content
|
||||
|
||||
Returns:
|
||||
bool: True if content matches, False otherwise
|
||||
"""
|
||||
try:
|
||||
# Check if file is binary
|
||||
with Path(file_path).open("rb") as file:
|
||||
content = file.read(1024)
|
||||
if b"\x00" in content:
|
||||
return False
|
||||
|
||||
# Try to compile the regex pattern first
|
||||
try:
|
||||
# Use the MULTILINE flag to better handle text content
|
||||
content_regex = re.compile(pattern, re.MULTILINE)
|
||||
# Test the pattern with a simple string to catch syntax errors
|
||||
test_str = "test\nstring"
|
||||
if not content_regex.search(test_str):
|
||||
# Pattern is valid but doesn't match test string
|
||||
pass
|
||||
except (re.error, TypeError, ValueError):
|
||||
return False
|
||||
|
||||
# If not binary and regex is valid, check content
|
||||
with Path(file_path).open(encoding="utf-8") as file:
|
||||
file_content = file.read()
|
||||
return bool(content_regex.search(file_content))
|
||||
except (OSError, UnicodeDecodeError):
|
||||
return False
|
||||
|
||||
def build_combined_filter(self, file_filter_patterns: str | None = None, content_filter_pattern: str | None = None):
|
||||
"""Build a combined filter function from file and content patterns.
|
||||
|
||||
Args:
|
||||
file_filter_patterns: Comma-separated glob patterns
|
||||
content_filter_pattern: Regex pattern for content
|
||||
|
||||
Returns:
|
||||
callable: Filter function that takes a file path and returns bool
|
||||
"""
|
||||
|
||||
def combined_filter(file_path: str) -> bool:
|
||||
try:
|
||||
path = Path(file_path)
|
||||
|
||||
# Check if file exists and is readable
|
||||
if not path.exists():
|
||||
return False
|
||||
|
||||
# Check if file is binary
|
||||
if self.is_binary(path):
|
||||
return False
|
||||
|
||||
# Apply file pattern filters
|
||||
if file_filter_patterns and not self.check_file_patterns(path, file_filter_patterns):
|
||||
return False
|
||||
|
||||
# Apply content filter
|
||||
return not (content_filter_pattern and not self.check_content_pattern(path, content_filter_pattern))
|
||||
except Exception: # noqa: BLE001
|
||||
return False
|
||||
|
||||
return combined_filter
|
||||
|
||||
@asynccontextmanager
|
||||
async def temp_clone_dir(self):
|
||||
"""Context manager for handling temporary clone directory."""
|
||||
temp_dir = None
|
||||
try:
|
||||
temp_dir = tempfile.mkdtemp(prefix="langflow_clone_")
|
||||
yield temp_dir
|
||||
finally:
|
||||
if temp_dir:
|
||||
await anyio.Path(temp_dir).rmdir()
|
||||
|
||||
def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:
|
||||
# Hide fields by default
|
||||
build_config["repo_path"]["show"] = False
|
||||
build_config["clone_url"]["show"] = False
|
||||
|
||||
if field_name == "repo_source":
|
||||
if field_value == "Local":
|
||||
build_config["repo_path"]["show"] = True
|
||||
build_config["repo_path"]["required"] = True
|
||||
build_config["clone_url"]["required"] = False
|
||||
elif field_value == "Remote":
|
||||
build_config["clone_url"]["show"] = True
|
||||
build_config["clone_url"]["required"] = True
|
||||
build_config["repo_path"]["required"] = False
|
||||
|
||||
return build_config
|
||||
|
||||
async def build_gitloader(self) -> GitLoader:
|
||||
file_filter_patterns = getattr(self, "file_filter", None)
|
||||
content_filter_pattern = getattr(self, "content_filter", None)
|
||||
|
||||
file_filters = []
|
||||
if file_filter_patterns:
|
||||
patterns = [pattern.strip() for pattern in file_filter_patterns.split(",")]
|
||||
combined_filter = self.build_combined_filter(file_filter_patterns, content_filter_pattern)
|
||||
|
||||
def file_filter(file_path: Path) -> bool:
|
||||
if len(patterns) == 1 and patterns[0].startswith("!"):
|
||||
return not file_path.match(patterns[0][1:])
|
||||
included = any(file_path.match(pattern) for pattern in patterns if not pattern.startswith("!"))
|
||||
excluded = any(file_path.match(pattern[1:]) for pattern in patterns if pattern.startswith("!"))
|
||||
return included and not excluded
|
||||
repo_source = getattr(self, "repo_source", None)
|
||||
if repo_source == "Local":
|
||||
repo_path = self.repo_path
|
||||
clone_url = None
|
||||
else:
|
||||
# Clone source
|
||||
clone_url = self.clone_url
|
||||
async with self.temp_clone_dir() as temp_dir:
|
||||
repo_path = temp_dir
|
||||
|
||||
file_filters.append(file_filter)
|
||||
|
||||
if content_filter_pattern:
|
||||
content_regex = re.compile(content_filter_pattern)
|
||||
|
||||
def content_filter(file_path: Path) -> bool:
|
||||
content = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||
return bool(content_regex.search(content))
|
||||
|
||||
file_filters.append(content_filter)
|
||||
|
||||
def combined_filter(file_path: str) -> bool:
|
||||
path = Path(file_path)
|
||||
if self.is_binary(file_path):
|
||||
return False
|
||||
return all(f(path) for f in file_filters)
|
||||
# Only pass branch if it's explicitly set
|
||||
branch = getattr(self, "branch", None)
|
||||
if not branch:
|
||||
branch = None
|
||||
|
||||
return GitLoader(
|
||||
repo_path=self.repo_path,
|
||||
clone_url=self.clone_url,
|
||||
branch=self.branch,
|
||||
repo_path=repo_path,
|
||||
clone_url=clone_url if repo_source == "Remote" else None,
|
||||
branch=branch,
|
||||
file_filter=combined_filter,
|
||||
)
|
||||
|
||||
def load_documents(self) -> list[Data]:
|
||||
gitloader = self.build_gitloader()
|
||||
documents = list(gitloader.lazy_load())
|
||||
data = [Data.from_document(doc) for doc in documents]
|
||||
async def load_documents(self) -> list[Data]:
|
||||
gitloader = await self.build_gitloader()
|
||||
data = [Data.from_document(doc) async for doc in gitloader.alazy_load()]
|
||||
self.status = data
|
||||
return data
|
||||
|
|
|
|||
1
src/backend/tests/unit/components/git/__init__.py
Normal file
1
src/backend/tests/unit/components/git/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
"""Git component tests."""
|
||||
134
src/backend/tests/unit/components/git/test_git_component.py
Normal file
134
src/backend/tests/unit/components/git/test_git_component.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from langflow.components.git import GitLoaderComponent
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def git_component():
|
||||
return GitLoaderComponent()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_files():
|
||||
"""Create temporary test files for filtering."""
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Create a Python file
|
||||
python_file = Path(temp_dir) / "test.py"
|
||||
python_file.write_text("import langchain\nclass TestComponent:\n pass")
|
||||
|
||||
# Create a text file
|
||||
text_file = Path(temp_dir) / "test.txt"
|
||||
text_file.write_text("This is a test file")
|
||||
|
||||
# Create a binary file
|
||||
binary_file = Path(temp_dir) / "test.bin"
|
||||
binary_file.write_bytes(b"Binary\x00Content")
|
||||
|
||||
# Create a directory for permission tests
|
||||
no_access_dir = Path(temp_dir) / "no_access"
|
||||
no_access_dir.mkdir()
|
||||
no_access_file = no_access_dir / "secret.txt"
|
||||
no_access_file.write_text("secret")
|
||||
no_access_file.chmod(0o000) # Remove all permissions
|
||||
|
||||
yield temp_dir
|
||||
|
||||
|
||||
def test_is_binary(git_component, test_files):
|
||||
"""Test binary file detection."""
|
||||
temp_dir = Path(test_files)
|
||||
|
||||
# Test regular files
|
||||
assert not git_component.is_binary(temp_dir / "test.py")
|
||||
assert not git_component.is_binary(temp_dir / "test.txt")
|
||||
assert git_component.is_binary(temp_dir / "test.bin")
|
||||
|
||||
# Test error cases
|
||||
assert git_component.is_binary(temp_dir / "nonexistent.txt") # Non-existent file
|
||||
assert git_component.is_binary(temp_dir / "no_access" / "secret.txt") # No permission
|
||||
|
||||
|
||||
def test_check_file_patterns(git_component, test_files):
|
||||
"""Test file pattern matching."""
|
||||
temp_dir = Path(test_files)
|
||||
|
||||
# Test single pattern
|
||||
assert git_component.check_file_patterns(temp_dir / "test.py", "*.py")
|
||||
assert not git_component.check_file_patterns(temp_dir / "test.txt", "*.py")
|
||||
|
||||
# Test exclusion pattern
|
||||
assert not git_component.check_file_patterns(temp_dir / "test.py", "!*.py")
|
||||
|
||||
# Test multiple patterns
|
||||
assert git_component.check_file_patterns(temp_dir / "test.py", "*.py,*.txt")
|
||||
assert git_component.check_file_patterns(temp_dir / "test.txt", "*.py,*.txt")
|
||||
|
||||
# Test mixed include/exclude
|
||||
assert not git_component.check_file_patterns(temp_dir / "test.py", "*.py,!test.py")
|
||||
assert git_component.check_file_patterns(temp_dir / "other.py", "*.py,!test.py")
|
||||
|
||||
# Test empty pattern (should include all)
|
||||
assert git_component.check_file_patterns(temp_dir / "test.py", "")
|
||||
assert git_component.check_file_patterns(temp_dir / "test.txt", " ")
|
||||
|
||||
# Test invalid pattern (should treat as literal string)
|
||||
assert not git_component.check_file_patterns(temp_dir / "test.py", "[")
|
||||
|
||||
|
||||
def test_check_content_pattern(git_component, test_files):
|
||||
"""Test content pattern matching."""
|
||||
temp_dir = Path(test_files)
|
||||
|
||||
# Test simple content match
|
||||
assert git_component.check_content_pattern(temp_dir / "test.py", r"import langchain")
|
||||
assert not git_component.check_content_pattern(temp_dir / "test.txt", r"import langchain")
|
||||
|
||||
# Test regex pattern
|
||||
assert git_component.check_content_pattern(temp_dir / "test.py", r"class.*Component")
|
||||
|
||||
# Test binary file
|
||||
assert not git_component.check_content_pattern(temp_dir / "test.bin", r"Binary")
|
||||
|
||||
# Test invalid regex patterns
|
||||
assert not git_component.check_content_pattern(temp_dir / "test.py", r"[") # Unclosed bracket
|
||||
assert not git_component.check_content_pattern(temp_dir / "test.py", r"*") # Invalid quantifier
|
||||
assert not git_component.check_content_pattern(temp_dir / "test.py", r"(?<)") # Invalid lookbehind
|
||||
assert not git_component.check_content_pattern(temp_dir / "test.py", r"\1") # Invalid backreference
|
||||
|
||||
|
||||
def test_combined_filter(git_component, test_files):
|
||||
"""Test the combined filter function."""
|
||||
temp_dir = Path(test_files)
|
||||
|
||||
# Test with both patterns
|
||||
filter_func = git_component.build_combined_filter(
|
||||
file_filter_patterns="*.py", content_filter_pattern=r"class.*Component"
|
||||
)
|
||||
assert filter_func(str(temp_dir / "test.py"))
|
||||
assert not filter_func(str(temp_dir / "test.txt"))
|
||||
assert not filter_func(str(temp_dir / "test.bin"))
|
||||
|
||||
# Test with only file pattern
|
||||
filter_func = git_component.build_combined_filter(file_filter_patterns="*.py")
|
||||
assert filter_func(str(temp_dir / "test.py"))
|
||||
assert not filter_func(str(temp_dir / "test.txt"))
|
||||
|
||||
# Test with only content pattern
|
||||
filter_func = git_component.build_combined_filter(content_filter_pattern=r"class.*Component")
|
||||
assert filter_func(str(temp_dir / "test.py"))
|
||||
assert not filter_func(str(temp_dir / "test.txt"))
|
||||
|
||||
# Test with empty patterns
|
||||
filter_func = git_component.build_combined_filter()
|
||||
assert filter_func(str(temp_dir / "test.py"))
|
||||
assert filter_func(str(temp_dir / "test.txt"))
|
||||
assert not filter_func(str(temp_dir / "test.bin")) # Binary files still excluded
|
||||
|
||||
# Test error cases
|
||||
filter_func = git_component.build_combined_filter(
|
||||
file_filter_patterns="*.py", content_filter_pattern=r"class.*Component"
|
||||
)
|
||||
assert not filter_func(str(temp_dir / "nonexistent.txt")) # Non-existent file
|
||||
assert not filter_func(str(temp_dir / "no_access" / "secret.txt")) # No permission
|
||||
Loading…
Add table
Add a link
Reference in a new issue