From d108ca11c59d37dd2fad05f48869352192d55597 Mon Sep 17 00:00:00 2001 From: Daniel Gines Date: Mon, 29 Jul 2024 23:00:53 -0300 Subject: [PATCH] feat: Add GitLoader Component with advanced filtering options (#2850) * feat: Add GitLoader Component with advanced filtering options This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options. GitLoader Component: - Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module. - Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria. Examples of `file_filter` usage: - Include only .py files: `lambda file_path: file_path.endswith('.py')` - Exclude .py files: `lambda file_path: not file_path.endswith('.py')` This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities. Features: - Support for loading documents from Git repositories. - Advanced file filtering options to include or exclude specific files. * feat: Add GitLoader Component with advanced filtering options This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options. GitLoader Component: - Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module. - Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria. 
Examples of `file_filter` usage: - Include only .py files: `lambda file_path: file_path.endswith('.py')` - Exclude .py files: `lambda file_path: not file_path.endswith('.py')` This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities. Features: - Support for loading documents from Git repositories. - Advanced file filtering options to include or exclude specific files. * fix: Ensure proper evaluation and validation of file_filter in GitLoaderComponent This commit fixes the issue where the GitLoaderComponent would fail if the file_filter input was not evaluated correctly. Changes include: - Added a check to ensure that file_filter is a valid string before calling eval. - Ensured that the evaluated file_filter is callable, otherwise it defaults to None. * [autofix.ci] apply automated fixes * feat: Enhance GitLoaderComponent with dynamic inputs, content filtering - Changed inputs from `StrInput` to `MessageTextInput` to enable dynamic use with agents. - Added `content_filter` field to allow additional content filtering using regex. - Updated `file_filter` to support glob format, simplifying usage for users. - Implemented binary file removal filter to exclude binary files from queries, aligning with the agent's purpose. 
from pathlib import Path
from typing import List
import re

from langchain_community.document_loaders.git import GitLoader
from langflow.custom import Component
from langflow.io import MessageTextInput, Output
from langflow.schema import Data


class GitLoaderComponent(Component):
    """Load files from a Git repository as ``Data`` objects.

    Files can be narrowed down with two optional filters:

    * ``file_filter`` -- comma-separated glob patterns; a leading ``!``
      marks a pattern as an exclusion.
    * ``content_filter`` -- a regular expression that must match somewhere
      in the file's text.

    Files that look binary (a null byte in the first 1024 bytes) are always
    skipped.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=True,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """Return True if the file contains a null byte in its first 1024 bytes.

        This is necessary because when searches are performed using the
        content_filter, binary files need to be ignored.

        Args:
            file_path: Path of the file to inspect.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(file_path, "rb") as file:
            return b"\x00" in file.read(1024)

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` configured from this component's inputs.

        Returns:
            A loader whose ``file_filter`` rejects binary files and applies
            the optional glob and content filters.

        Raises:
            re.error: If ``content_filter`` is not a valid regular expression.
        """
        file_filter_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        file_filters = []
        if file_filter_patterns:
            # Split the patterns into includes and excludes once, up front,
            # instead of re-partitioning them for every file. Empty entries
            # (trailing comma, bare "!") are dropped because Path.match("")
            # raises ValueError.
            include_patterns: List[str] = []
            exclude_patterns: List[str] = []
            for raw_pattern in file_filter_patterns.split(","):
                pattern = raw_pattern.strip()
                if pattern.startswith("!"):
                    pattern = pattern[1:].strip()
                    bucket = exclude_patterns
                else:
                    bucket = include_patterns
                if pattern:
                    bucket.append(pattern)

            if include_patterns or exclude_patterns:

                def file_filter(file_path: Path) -> bool:
                    if any(file_path.match(pattern) for pattern in exclude_patterns):
                        return False
                    # With no include patterns, everything not excluded is
                    # accepted; otherwise "!a, !b" would reject every file.
                    if not include_patterns:
                        return True
                    return any(file_path.match(pattern) for pattern in include_patterns)

                file_filters.append(file_filter)

        if content_filter_pattern:
            content_regex = re.compile(content_filter_pattern)

            def content_filter(file_path: Path) -> bool:
                with file_path.open("r", encoding="utf-8", errors="ignore") as file:
                    content = file.read()
                return bool(content_regex.search(content))

            file_filters.append(content_filter)

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)
            try:
                if self.is_binary(file_path):
                    return False
                return all(f(path) for f in file_filters)
            except OSError:
                # An entry that cannot be read (for example a tracked file
                # that is missing from the working tree) is skipped rather
                # than aborting the whole load.
                return False

        loader = GitLoader(
            repo_path=self.repo_path,
            # An empty text input means "not provided"; normalise it so the
            # loader sees None / the documented default instead of "".
            clone_url=self.clone_url or None,
            branch=self.branch or "main",
            file_filter=combined_filter,
        )
        return loader

    def load_documents(self) -> List[Data]:
        """Load the repository's documents and convert them to ``Data``.

        The resulting list is also stored in ``self.status``.

        Returns:
            One ``Data`` object per document yielded by the loader.
        """
        gitloader = self.build_gitloader()
        data = [Data.from_document(doc) for doc in gitloader.lazy_load()]
        self.status = data
        return data
forwardRef< + SVGSVGElement, + React.PropsWithChildren<{}> +>((props, ref) => { + return ; +}); diff --git a/src/frontend/src/utils/styleUtils.ts b/src/frontend/src/utils/styleUtils.ts index 3bde56e9a..244e68fa6 100644 --- a/src/frontend/src/utils/styleUtils.ts +++ b/src/frontend/src/utils/styleUtils.ts @@ -179,6 +179,7 @@ import { EvernoteIcon } from "../icons/Evernote"; import { FBIcon } from "../icons/FacebookMessenger"; import { FirecrawlIcon } from "../icons/Firecrawl"; import { GitBookIcon } from "../icons/GitBook"; +import { GitLoaderIcon } from "../icons/GitLoader"; import { GoogleIcon } from "../icons/Google"; import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI"; import { @@ -588,4 +589,5 @@ export const nodeIconsLucide: iconsType = { Table: Table, AIML: AIMLIcon, "AI/ML": AIMLIcon, + GitLoader: GitLoaderIcon, };