From 70eaacfd36c69ec710a22235fae2647908bd77f5 Mon Sep 17 00:00:00 2001 From: Rodrigo Nader Date: Thu, 2 Jan 2025 13:39:16 -0300 Subject: [PATCH] feat: improve gitloader component (#5351) * feat: improve gitloader component * [autofix.ci] apply automated fixes * feat(git.py): refactor GitLoaderComponent to support asynchronous operations and improve temporary directory management - Convert methods to async to enhance performance with file operations. - Implement async context manager for handling temporary clone directories. - Update binary file check and content filtering to be asynchronous. * fix(git.py): enhance GitLoaderComponent with improved file filtering and binary check - Refactor file filtering logic to utilize fnmatch for pattern matching. - Introduce a new method to check for binary files based on null byte detection. - Update content filtering to handle exceptions more gracefully. - Modify temporary directory cleanup to use rmdir instead of remove for better directory management. - Adjust load_documents method to utilize asyncio.to_thread for lazy loading of documents. * refactor(git.py): enhance GitLoaderComponent with improved file filtering and binary check - Refactor binary file check to be synchronous and handle exceptions more gracefully. - Introduce new methods for checking file patterns and content patterns, allowing for more flexible filtering. - Consolidate filtering logic into a single method for better maintainability. - Update load_documents method to run lazy loading in a separate thread for improved performance. * feat(tests): add unit tests for GitLoaderComponent file filtering and binary detection - Introduced new test suite for GitLoaderComponent, including tests for binary file detection and file pattern matching. - Implemented temporary file creation for testing various file types and permissions. - Added tests for combined filtering functionality, ensuring robust handling of file and content patterns. * refactor(git.py): improve pattern handling and content filtering in GitLoaderComponent - Refactored pattern handling to use a more descriptive variable name `pattern_list` for clarity. - Enhanced content filtering by ensuring proper encoding when reading file content. - Updated regex validation to include a test string check for better error handling. - Removed unnecessary comments to streamline the code and improve readability. * fix: exception TypeError TypeError: object async_generator can't be used in 'await' expression --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Gabriel Luiz Freitas Almeida Co-authored-by: italojohnny --- .../base/langflow/components/git/git.py | 246 ++++++++++++++---- .../tests/unit/components/git/__init__.py | 1 + .../unit/components/git/test_git_component.py | 134 ++++++++++ 3 files changed, 332 insertions(+), 49 deletions(-) create mode 100644 src/backend/tests/unit/components/git/__init__.py create mode 100644 src/backend/tests/unit/components/git/test_git_component.py diff --git a/src/backend/base/langflow/components/git/git.py b/src/backend/base/langflow/components/git/git.py index b06ecc96c..368062ab1 100644 --- a/src/backend/base/langflow/components/git/git.py +++ b/src/backend/base/langflow/components/git/git.py @@ -1,33 +1,50 @@ import re +import tempfile +from contextlib import asynccontextmanager +from fnmatch import fnmatch from pathlib import Path +import anyio from langchain_community.document_loaders.git import GitLoader from langflow.custom import Component -from langflow.io import MessageTextInput, Output +from langflow.io import DropdownInput, MessageTextInput, Output from langflow.schema import Data class GitLoaderComponent(Component): - display_name = "GitLoader" - description = "Load files from a Git repository" - documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/" + display_name = "Git" + description = ( + "Load and filter documents from a local or remote Git repository. " + "Use a local repo path or clone from a remote URL." + ) trace_type = "tool" icon = "GitLoader" - name = "GitLoader" inputs = [ + DropdownInput( + name="repo_source", + display_name="Repository Source", + options=["Local", "Remote"], + required=True, + info="Select whether to use a local repo path or clone from a remote URL.", + real_time_refresh=True, + ), MessageTextInput( name="repo_path", - display_name="Repository Path", + display_name="Local Repository Path", required=False, - info="The local path to the Git repository.", + info="The local path to the existing Git repository (used if 'Local' is selected).", + dynamic=True, + show=False, ), MessageTextInput( name="clone_url", display_name="Clone URL", required=False, - info="The URL to clone the Git repository from.", + info="The URL of the Git repository to clone (used if 'Clone' is selected).", + dynamic=True, + show=False, ), MessageTextInput( name="branch", @@ -41,8 +58,12 @@ class GitLoaderComponent(Component): display_name="File Filter", required=False, advanced=True, - info="A list of patterns to filter files. Example to include only .py files: '*.py'. " - "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.", + info=( + "Patterns to filter files. For example:\n" + "Include only .py files: '*.py'\n" + "Exclude .py files: '!*.py'\n" + "Multiple patterns can be separated by commas." + ), ), MessageTextInput( name="content_filter", @@ -58,57 +79,184 @@ class GitLoaderComponent(Component): ] @staticmethod - def is_binary(file_path: str) -> bool: - """Check if a file is binary by looking for null bytes. + def is_binary(file_path: str | Path) -> bool: + """Check if a file is binary by looking for null bytes.""" + try: + with Path(file_path).open("rb") as file: + content = file.read(1024) + return b"\x00" in content + except Exception: # noqa: BLE001 + return True - This is necessary because when searches are performed using - the content_filter, binary files need to be ignored. + @staticmethod + def check_file_patterns(file_path: str | Path, patterns: str) -> bool: + """Check if a file matches the given patterns. + + Args: + file_path: Path to the file to check + patterns: Comma-separated list of glob patterns + + Returns: + bool: True if file should be included, False if excluded """ - with Path(file_path).open("rb") as file: - return b"\x00" in file.read(1024) + # Handle empty or whitespace-only patterns + if not patterns or patterns.isspace(): + return True - def build_gitloader(self) -> GitLoader: + path_str = str(file_path) + file_name = Path(path_str).name + pattern_list: list[str] = [pattern.strip() for pattern in patterns.split(",") if pattern.strip()] + + # If no valid patterns after stripping, treat as include all + if not pattern_list: + return True + + # Process exclusion patterns first + for pattern in pattern_list: + if pattern.startswith("!"): + # For exclusions, match against both full path and filename + exclude_pattern = pattern[1:] + if fnmatch(path_str, exclude_pattern) or fnmatch(file_name, exclude_pattern): + return False + + # Then check inclusion patterns + include_patterns = [p for p in pattern_list if not p.startswith("!")] + # If no include patterns, treat as include all + if not include_patterns: + return True + + # For inclusions, match against both full path and filename + return any(fnmatch(path_str, pattern) or fnmatch(file_name, pattern) for pattern in include_patterns) + + @staticmethod + def check_content_pattern(file_path: str | Path, pattern: str) -> bool: + """Check if file content matches the given regex pattern. + + Args: + file_path: Path to the file to check + pattern: Regex pattern to match against content + + Returns: + bool: True if content matches, False otherwise + """ + try: + # Check if file is binary + with Path(file_path).open("rb") as file: + content = file.read(1024) + if b"\x00" in content: + return False + + # Try to compile the regex pattern first + try: + # Use the MULTILINE flag to better handle text content + content_regex = re.compile(pattern, re.MULTILINE) + # Test the pattern with a simple string to catch syntax errors + test_str = "test\nstring" + if not content_regex.search(test_str): + # Pattern is valid but doesn't match test string + pass + except (re.error, TypeError, ValueError): + return False + + # If not binary and regex is valid, check content + with Path(file_path).open(encoding="utf-8") as file: + file_content = file.read() + return bool(content_regex.search(file_content)) + except (OSError, UnicodeDecodeError): + return False + + def build_combined_filter(self, file_filter_patterns: str | None = None, content_filter_pattern: str | None = None): + """Build a combined filter function from file and content patterns. + + Args: + file_filter_patterns: Comma-separated glob patterns + content_filter_pattern: Regex pattern for content + + Returns: + callable: Filter function that takes a file path and returns bool + """ + + def combined_filter(file_path: str) -> bool: + try: + path = Path(file_path) + + # Check if file exists and is readable + if not path.exists(): + return False + + # Check if file is binary + if self.is_binary(path): + return False + + # Apply file pattern filters + if file_filter_patterns and not self.check_file_patterns(path, file_filter_patterns): + return False + + # Apply content filter + return not (content_filter_pattern and not self.check_content_pattern(path, content_filter_pattern)) + except Exception: # noqa: BLE001 + return False + + return combined_filter + + @asynccontextmanager + async def temp_clone_dir(self): + """Context manager for handling temporary clone directory.""" + temp_dir = None + try: + temp_dir = tempfile.mkdtemp(prefix="langflow_clone_") + yield temp_dir + finally: + if temp_dir: + await anyio.Path(temp_dir).rmdir() + + def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict: + # Hide fields by default + build_config["repo_path"]["show"] = False + build_config["clone_url"]["show"] = False + + if field_name == "repo_source": + if field_value == "Local": + build_config["repo_path"]["show"] = True + build_config["repo_path"]["required"] = True + build_config["clone_url"]["required"] = False + elif field_value == "Remote": + build_config["clone_url"]["show"] = True + build_config["clone_url"]["required"] = True + build_config["repo_path"]["required"] = False + + return build_config + + async def build_gitloader(self) -> GitLoader: file_filter_patterns = getattr(self, "file_filter", None) content_filter_pattern = getattr(self, "content_filter", None) - file_filters = [] - if file_filter_patterns: - patterns = [pattern.strip() for pattern in file_filter_patterns.split(",")] + combined_filter = self.build_combined_filter(file_filter_patterns, content_filter_pattern) - def file_filter(file_path: Path) -> bool: - if len(patterns) == 1 and patterns[0].startswith("!"): - return not file_path.match(patterns[0][1:]) - included = any(file_path.match(pattern) for pattern in patterns if not pattern.startswith("!")) - excluded = any(file_path.match(pattern[1:]) for pattern in patterns if pattern.startswith("!")) - return included and not excluded + repo_source = getattr(self, "repo_source", None) + if repo_source == "Local": + repo_path = self.repo_path + clone_url = None + else: + # Clone source + clone_url = self.clone_url + async with self.temp_clone_dir() as temp_dir: + repo_path = temp_dir - file_filters.append(file_filter) - - if content_filter_pattern: - content_regex = re.compile(content_filter_pattern) - - def content_filter(file_path: Path) -> bool: - content = file_path.read_text(encoding="utf-8", errors="ignore") - return bool(content_regex.search(content)) - - file_filters.append(content_filter) - - def combined_filter(file_path: str) -> bool: - path = Path(file_path) - if self.is_binary(file_path): - return False - return all(f(path) for f in file_filters) + # Only pass branch if it's explicitly set + branch = getattr(self, "branch", None) + if not branch: + branch = None return GitLoader( - repo_path=self.repo_path, - clone_url=self.clone_url, - branch=self.branch, + repo_path=repo_path, + clone_url=clone_url if repo_source == "Remote" else None, + branch=branch, file_filter=combined_filter, ) - def load_documents(self) -> list[Data]: - gitloader = self.build_gitloader() - documents = list(gitloader.lazy_load()) - data = [Data.from_document(doc) for doc in documents] + async def load_documents(self) -> list[Data]: + gitloader = await self.build_gitloader() + data = [Data.from_document(doc) async for doc in gitloader.alazy_load()] self.status = data return data diff --git a/src/backend/tests/unit/components/git/__init__.py b/src/backend/tests/unit/components/git/__init__.py new file mode 100644 index 000000000..88101da31 --- /dev/null +++ b/src/backend/tests/unit/components/git/__init__.py @@ -0,0 +1 @@ +"""Git component tests.""" diff --git a/src/backend/tests/unit/components/git/test_git_component.py b/src/backend/tests/unit/components/git/test_git_component.py new file mode 100644 index 000000000..338cb48d3 --- /dev/null +++ b/src/backend/tests/unit/components/git/test_git_component.py @@ -0,0 +1,134 @@ +import tempfile +from pathlib import Path + +import pytest +from langflow.components.git import GitLoaderComponent + + +@pytest.fixture +def git_component(): + return GitLoaderComponent() + + +@pytest.fixture +def test_files(): + """Create temporary test files for filtering.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create a Python file + python_file = Path(temp_dir) / "test.py" + python_file.write_text("import langchain\nclass TestComponent:\n pass") + + # Create a text file + text_file = Path(temp_dir) / "test.txt" + text_file.write_text("This is a test file") + + # Create a binary file + binary_file = Path(temp_dir) / "test.bin" + binary_file.write_bytes(b"Binary\x00Content") + + # Create a directory for permission tests + no_access_dir = Path(temp_dir) / "no_access" + no_access_dir.mkdir() + no_access_file = no_access_dir / "secret.txt" + no_access_file.write_text("secret") + no_access_file.chmod(0o000) # Remove all permissions + + yield temp_dir + + +def test_is_binary(git_component, test_files): + """Test binary file detection.""" + temp_dir = Path(test_files) + + # Test regular files + assert not git_component.is_binary(temp_dir / "test.py") + assert not git_component.is_binary(temp_dir / "test.txt") + assert git_component.is_binary(temp_dir / "test.bin") + + # Test error cases + assert git_component.is_binary(temp_dir / "nonexistent.txt") # Non-existent file + assert git_component.is_binary(temp_dir / "no_access" / "secret.txt") # No permission + + +def test_check_file_patterns(git_component, test_files): + """Test file pattern matching.""" + temp_dir = Path(test_files) + + # Test single pattern + assert git_component.check_file_patterns(temp_dir / "test.py", "*.py") + assert not git_component.check_file_patterns(temp_dir / "test.txt", "*.py") + + # Test exclusion pattern + assert not git_component.check_file_patterns(temp_dir / "test.py", "!*.py") + + # Test multiple patterns + assert git_component.check_file_patterns(temp_dir / "test.py", "*.py,*.txt") + assert git_component.check_file_patterns(temp_dir / "test.txt", "*.py,*.txt") + + # Test mixed include/exclude + assert not git_component.check_file_patterns(temp_dir / "test.py", "*.py,!test.py") + assert git_component.check_file_patterns(temp_dir / "other.py", "*.py,!test.py") + + # Test empty pattern (should include all) + assert git_component.check_file_patterns(temp_dir / "test.py", "") + assert git_component.check_file_patterns(temp_dir / "test.txt", " ") + + # Test invalid pattern (should treat as literal string) + assert not git_component.check_file_patterns(temp_dir / "test.py", "[") + + +def test_check_content_pattern(git_component, test_files): + """Test content pattern matching.""" + temp_dir = Path(test_files) + + # Test simple content match + assert git_component.check_content_pattern(temp_dir / "test.py", r"import langchain") + assert not git_component.check_content_pattern(temp_dir / "test.txt", r"import langchain") + + # Test regex pattern + assert git_component.check_content_pattern(temp_dir / "test.py", r"class.*Component") + + # Test binary file + assert not git_component.check_content_pattern(temp_dir / "test.bin", r"Binary") + + # Test invalid regex patterns + assert not git_component.check_content_pattern(temp_dir / "test.py", r"[") # Unclosed bracket + assert not git_component.check_content_pattern(temp_dir / "test.py", r"*") # Invalid quantifier + assert not git_component.check_content_pattern(temp_dir / "test.py", r"(?<)") # Invalid lookbehind + assert not git_component.check_content_pattern(temp_dir / "test.py", r"\1") # Invalid backreference + + +def test_combined_filter(git_component, test_files): + """Test the combined filter function.""" + temp_dir = Path(test_files) + + # Test with both patterns + filter_func = git_component.build_combined_filter( + file_filter_patterns="*.py", content_filter_pattern=r"class.*Component" + ) + assert filter_func(str(temp_dir / "test.py")) + assert not filter_func(str(temp_dir / "test.txt")) + assert not filter_func(str(temp_dir / "test.bin")) + + # Test with only file pattern + filter_func = git_component.build_combined_filter(file_filter_patterns="*.py") + assert filter_func(str(temp_dir / "test.py")) + assert not filter_func(str(temp_dir / "test.txt")) + + # Test with only content pattern + filter_func = git_component.build_combined_filter(content_filter_pattern=r"class.*Component") + assert filter_func(str(temp_dir / "test.py")) + assert not filter_func(str(temp_dir / "test.txt")) + + # Test with empty patterns + filter_func = git_component.build_combined_filter() + assert filter_func(str(temp_dir / "test.py")) + assert filter_func(str(temp_dir / "test.txt")) + assert not filter_func(str(temp_dir / "test.bin")) # Binary files still excluded + + # Test error cases + filter_func = git_component.build_combined_filter( + file_filter_patterns="*.py", content_filter_pattern=r"class.*Component" + ) + assert not filter_func(str(temp_dir / "nonexistent.txt")) # Non-existent file + assert not filter_func(str(temp_dir / "no_access" / "secret.txt")) # No permission