feat: improve gitloader component (#5351)

* feat: improve gitloader component

* [autofix.ci] apply automated fixes

* feat(git.py): refactor GitLoaderComponent to support asynchronous operations and improve temporary directory management

- Convert methods to async to enhance performance with file operations.
- Implement async context manager for handling temporary clone directories.
- Update binary file check and content filtering to be asynchronous.

* fix(git.py): enhance GitLoaderComponent with improved file filtering and binary check

- Refactor file filtering logic to utilize fnmatch for pattern matching.
- Introduce a new method to check for binary files based on null byte detection.
- Update content filtering to handle exceptions more gracefully.
- Modify temporary directory cleanup to use rmdir instead of remove for better directory management.
- Adjust load_documents method to utilize asyncio.to_thread for lazy loading of documents.

* refactor(git.py): enhance GitLoaderComponent with improved file filtering and binary check

- Refactor binary file check to be synchronous and handle exceptions more gracefully.
- Introduce new methods for checking file patterns and content patterns, allowing for more flexible filtering.
- Consolidate filtering logic into a single method for better maintainability.
- Update load_documents method to run lazy loading in a separate thread for improved performance.

* feat(tests): add unit tests for GitLoaderComponent file filtering and binary detection

- Introduced new test suite for GitLoaderComponent, including tests for binary file detection and file pattern matching.
- Implemented temporary file creation for testing various file types and permissions.
- Added tests for combined filtering functionality, ensuring robust handling of file and content patterns.

* refactor(git.py): improve pattern handling and content filtering in GitLoaderComponent

- Refactored pattern handling to use a more descriptive variable name `pattern_list` for clarity.
- Enhanced content filtering by ensuring proper encoding when reading file content.
- Updated regex validation to include a test string check for better error handling.
- Removed unnecessary comments to streamline the code and improve readability.

* fix: exception TypeError

TypeError: object async_generator can't be used in 'await' expression

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
Co-authored-by: italojohnny <italojohnnydosanjos@gmail.com>
This commit is contained in:
Rodrigo Nader 2025-01-02 13:39:16 -03:00 committed by GitHub
commit 70eaacfd36
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 332 additions and 49 deletions

View file

@ -1,33 +1,50 @@
import re
import tempfile
from contextlib import asynccontextmanager
from fnmatch import fnmatch
from pathlib import Path
import anyio
from langchain_community.document_loaders.git import GitLoader
from langflow.custom import Component
from langflow.io import MessageTextInput, Output
from langflow.io import DropdownInput, MessageTextInput, Output
from langflow.schema import Data
class GitLoaderComponent(Component):
display_name = "GitLoader"
description = "Load files from a Git repository"
documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
display_name = "Git"
description = (
"Load and filter documents from a local or remote Git repository. "
"Use a local repo path or clone from a remote URL."
)
trace_type = "tool"
icon = "GitLoader"
name = "GitLoader"
inputs = [
DropdownInput(
name="repo_source",
display_name="Repository Source",
options=["Local", "Remote"],
required=True,
info="Select whether to use a local repo path or clone from a remote URL.",
real_time_refresh=True,
),
MessageTextInput(
name="repo_path",
display_name="Repository Path",
display_name="Local Repository Path",
required=False,
info="The local path to the Git repository.",
info="The local path to the existing Git repository (used if 'Local' is selected).",
dynamic=True,
show=False,
),
MessageTextInput(
name="clone_url",
display_name="Clone URL",
required=False,
info="The URL to clone the Git repository from.",
info="The URL of the Git repository to clone (used if 'Clone' is selected).",
dynamic=True,
show=False,
),
MessageTextInput(
name="branch",
@ -41,8 +58,12 @@ class GitLoaderComponent(Component):
display_name="File Filter",
required=False,
advanced=True,
info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
"Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
info=(
"Patterns to filter files. For example:\n"
"Include only .py files: '*.py'\n"
"Exclude .py files: '!*.py'\n"
"Multiple patterns can be separated by commas."
),
),
MessageTextInput(
name="content_filter",
@ -58,57 +79,184 @@ class GitLoaderComponent(Component):
]
@staticmethod
def is_binary(file_path: str) -> bool:
"""Check if a file is binary by looking for null bytes.
def is_binary(file_path: str | Path) -> bool:
"""Check if a file is binary by looking for null bytes."""
try:
with Path(file_path).open("rb") as file:
content = file.read(1024)
return b"\x00" in content
except Exception: # noqa: BLE001
return True
This is necessary because when searches are performed using
the content_filter, binary files need to be ignored.
@staticmethod
def check_file_patterns(file_path: str | Path, patterns: str) -> bool:
"""Check if a file matches the given patterns.
Args:
file_path: Path to the file to check
patterns: Comma-separated list of glob patterns
Returns:
bool: True if file should be included, False if excluded
"""
with Path(file_path).open("rb") as file:
return b"\x00" in file.read(1024)
# Handle empty or whitespace-only patterns
if not patterns or patterns.isspace():
return True
def build_gitloader(self) -> GitLoader:
path_str = str(file_path)
file_name = Path(path_str).name
pattern_list: list[str] = [pattern.strip() for pattern in patterns.split(",") if pattern.strip()]
# If no valid patterns after stripping, treat as include all
if not pattern_list:
return True
# Process exclusion patterns first
for pattern in pattern_list:
if pattern.startswith("!"):
# For exclusions, match against both full path and filename
exclude_pattern = pattern[1:]
if fnmatch(path_str, exclude_pattern) or fnmatch(file_name, exclude_pattern):
return False
# Then check inclusion patterns
include_patterns = [p for p in pattern_list if not p.startswith("!")]
# If no include patterns, treat as include all
if not include_patterns:
return True
# For inclusions, match against both full path and filename
return any(fnmatch(path_str, pattern) or fnmatch(file_name, pattern) for pattern in include_patterns)
@staticmethod
def check_content_pattern(file_path: str | Path, pattern: str) -> bool:
"""Check if file content matches the given regex pattern.
Args:
file_path: Path to the file to check
pattern: Regex pattern to match against content
Returns:
bool: True if content matches, False otherwise
"""
try:
# Check if file is binary
with Path(file_path).open("rb") as file:
content = file.read(1024)
if b"\x00" in content:
return False
# Try to compile the regex pattern first
try:
# Use the MULTILINE flag to better handle text content
content_regex = re.compile(pattern, re.MULTILINE)
# Test the pattern with a simple string to catch syntax errors
test_str = "test\nstring"
if not content_regex.search(test_str):
# Pattern is valid but doesn't match test string
pass
except (re.error, TypeError, ValueError):
return False
# If not binary and regex is valid, check content
with Path(file_path).open(encoding="utf-8") as file:
file_content = file.read()
return bool(content_regex.search(file_content))
except (OSError, UnicodeDecodeError):
return False
def build_combined_filter(self, file_filter_patterns: str | None = None, content_filter_pattern: str | None = None):
"""Build a combined filter function from file and content patterns.
Args:
file_filter_patterns: Comma-separated glob patterns
content_filter_pattern: Regex pattern for content
Returns:
callable: Filter function that takes a file path and returns bool
"""
def combined_filter(file_path: str) -> bool:
try:
path = Path(file_path)
# Check if file exists and is readable
if not path.exists():
return False
# Check if file is binary
if self.is_binary(path):
return False
# Apply file pattern filters
if file_filter_patterns and not self.check_file_patterns(path, file_filter_patterns):
return False
# Apply content filter
return not (content_filter_pattern and not self.check_content_pattern(path, content_filter_pattern))
except Exception: # noqa: BLE001
return False
return combined_filter
@asynccontextmanager
async def temp_clone_dir(self):
"""Context manager for handling temporary clone directory."""
temp_dir = None
try:
temp_dir = tempfile.mkdtemp(prefix="langflow_clone_")
yield temp_dir
finally:
if temp_dir:
await anyio.Path(temp_dir).rmdir()
def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:
# Hide fields by default
build_config["repo_path"]["show"] = False
build_config["clone_url"]["show"] = False
if field_name == "repo_source":
if field_value == "Local":
build_config["repo_path"]["show"] = True
build_config["repo_path"]["required"] = True
build_config["clone_url"]["required"] = False
elif field_value == "Remote":
build_config["clone_url"]["show"] = True
build_config["clone_url"]["required"] = True
build_config["repo_path"]["required"] = False
return build_config
async def build_gitloader(self) -> GitLoader:
file_filter_patterns = getattr(self, "file_filter", None)
content_filter_pattern = getattr(self, "content_filter", None)
file_filters = []
if file_filter_patterns:
patterns = [pattern.strip() for pattern in file_filter_patterns.split(",")]
combined_filter = self.build_combined_filter(file_filter_patterns, content_filter_pattern)
def file_filter(file_path: Path) -> bool:
if len(patterns) == 1 and patterns[0].startswith("!"):
return not file_path.match(patterns[0][1:])
included = any(file_path.match(pattern) for pattern in patterns if not pattern.startswith("!"))
excluded = any(file_path.match(pattern[1:]) for pattern in patterns if pattern.startswith("!"))
return included and not excluded
repo_source = getattr(self, "repo_source", None)
if repo_source == "Local":
repo_path = self.repo_path
clone_url = None
else:
# Clone source
clone_url = self.clone_url
async with self.temp_clone_dir() as temp_dir:
repo_path = temp_dir
file_filters.append(file_filter)
if content_filter_pattern:
content_regex = re.compile(content_filter_pattern)
def content_filter(file_path: Path) -> bool:
content = file_path.read_text(encoding="utf-8", errors="ignore")
return bool(content_regex.search(content))
file_filters.append(content_filter)
def combined_filter(file_path: str) -> bool:
path = Path(file_path)
if self.is_binary(file_path):
return False
return all(f(path) for f in file_filters)
# Only pass branch if it's explicitly set
branch = getattr(self, "branch", None)
if not branch:
branch = None
return GitLoader(
repo_path=self.repo_path,
clone_url=self.clone_url,
branch=self.branch,
repo_path=repo_path,
clone_url=clone_url if repo_source == "Remote" else None,
branch=branch,
file_filter=combined_filter,
)
def load_documents(self) -> list[Data]:
gitloader = self.build_gitloader()
documents = list(gitloader.lazy_load())
data = [Data.from_document(doc) for doc in documents]
async def load_documents(self) -> list[Data]:
gitloader = await self.build_gitloader()
data = [Data.from_document(doc) async for doc in gitloader.alazy_load()]
self.status = data
return data

View file

@ -0,0 +1 @@
"""Git component tests."""

View file

@ -0,0 +1,134 @@
import tempfile
from pathlib import Path
import pytest
from langflow.components.git import GitLoaderComponent
@pytest.fixture
def git_component():
return GitLoaderComponent()
@pytest.fixture
def test_files():
"""Create temporary test files for filtering."""
with tempfile.TemporaryDirectory() as temp_dir:
# Create a Python file
python_file = Path(temp_dir) / "test.py"
python_file.write_text("import langchain\nclass TestComponent:\n pass")
# Create a text file
text_file = Path(temp_dir) / "test.txt"
text_file.write_text("This is a test file")
# Create a binary file
binary_file = Path(temp_dir) / "test.bin"
binary_file.write_bytes(b"Binary\x00Content")
# Create a directory for permission tests
no_access_dir = Path(temp_dir) / "no_access"
no_access_dir.mkdir()
no_access_file = no_access_dir / "secret.txt"
no_access_file.write_text("secret")
no_access_file.chmod(0o000) # Remove all permissions
yield temp_dir
def test_is_binary(git_component, test_files):
"""Test binary file detection."""
temp_dir = Path(test_files)
# Test regular files
assert not git_component.is_binary(temp_dir / "test.py")
assert not git_component.is_binary(temp_dir / "test.txt")
assert git_component.is_binary(temp_dir / "test.bin")
# Test error cases
assert git_component.is_binary(temp_dir / "nonexistent.txt") # Non-existent file
assert git_component.is_binary(temp_dir / "no_access" / "secret.txt") # No permission
def test_check_file_patterns(git_component, test_files):
"""Test file pattern matching."""
temp_dir = Path(test_files)
# Test single pattern
assert git_component.check_file_patterns(temp_dir / "test.py", "*.py")
assert not git_component.check_file_patterns(temp_dir / "test.txt", "*.py")
# Test exclusion pattern
assert not git_component.check_file_patterns(temp_dir / "test.py", "!*.py")
# Test multiple patterns
assert git_component.check_file_patterns(temp_dir / "test.py", "*.py,*.txt")
assert git_component.check_file_patterns(temp_dir / "test.txt", "*.py,*.txt")
# Test mixed include/exclude
assert not git_component.check_file_patterns(temp_dir / "test.py", "*.py,!test.py")
assert git_component.check_file_patterns(temp_dir / "other.py", "*.py,!test.py")
# Test empty pattern (should include all)
assert git_component.check_file_patterns(temp_dir / "test.py", "")
assert git_component.check_file_patterns(temp_dir / "test.txt", " ")
# Test invalid pattern (should treat as literal string)
assert not git_component.check_file_patterns(temp_dir / "test.py", "[")
def test_check_content_pattern(git_component, test_files):
"""Test content pattern matching."""
temp_dir = Path(test_files)
# Test simple content match
assert git_component.check_content_pattern(temp_dir / "test.py", r"import langchain")
assert not git_component.check_content_pattern(temp_dir / "test.txt", r"import langchain")
# Test regex pattern
assert git_component.check_content_pattern(temp_dir / "test.py", r"class.*Component")
# Test binary file
assert not git_component.check_content_pattern(temp_dir / "test.bin", r"Binary")
# Test invalid regex patterns
assert not git_component.check_content_pattern(temp_dir / "test.py", r"[") # Unclosed bracket
assert not git_component.check_content_pattern(temp_dir / "test.py", r"*") # Invalid quantifier
assert not git_component.check_content_pattern(temp_dir / "test.py", r"(?<)") # Invalid lookbehind
assert not git_component.check_content_pattern(temp_dir / "test.py", r"\1") # Invalid backreference
def test_combined_filter(git_component, test_files):
"""Test the combined filter function."""
temp_dir = Path(test_files)
# Test with both patterns
filter_func = git_component.build_combined_filter(
file_filter_patterns="*.py", content_filter_pattern=r"class.*Component"
)
assert filter_func(str(temp_dir / "test.py"))
assert not filter_func(str(temp_dir / "test.txt"))
assert not filter_func(str(temp_dir / "test.bin"))
# Test with only file pattern
filter_func = git_component.build_combined_filter(file_filter_patterns="*.py")
assert filter_func(str(temp_dir / "test.py"))
assert not filter_func(str(temp_dir / "test.txt"))
# Test with only content pattern
filter_func = git_component.build_combined_filter(content_filter_pattern=r"class.*Component")
assert filter_func(str(temp_dir / "test.py"))
assert not filter_func(str(temp_dir / "test.txt"))
# Test with empty patterns
filter_func = git_component.build_combined_filter()
assert filter_func(str(temp_dir / "test.py"))
assert filter_func(str(temp_dir / "test.txt"))
assert not filter_func(str(temp_dir / "test.bin")) # Binary files still excluded
# Test error cases
filter_func = git_component.build_combined_filter(
file_filter_patterns="*.py", content_filter_pattern=r"class.*Component"
)
assert not filter_func(str(temp_dir / "nonexistent.txt")) # Non-existent file
assert not filter_func(str(temp_dir / "no_access" / "secret.txt")) # No permission