# Reconstructed from the collapsed diff of
# src/backend/base/langflow/components/documentloaders/GitLoader.py
from pathlib import Path
from typing import List
import re

from langchain_community.document_loaders.git import GitLoader
from langflow.custom import Component
from langflow.io import MessageTextInput, Output
from langflow.schema import Data


class GitLoaderComponent(Component):
    """Load files from a Git repository and expose them as ``Data`` objects.

    The component wraps LangChain's ``GitLoader``. Files can be narrowed down
    by path patterns (``file_filter``) and by a regular expression applied to
    their content (``content_filter``). Files that look binary are always
    skipped.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=True,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """Return True when the first 1024 bytes of the file contain a null byte.

        Binary files must be ignored when ``content_filter`` searches are
        performed, so this heuristic is applied to every candidate file.

        Raises:
            OSError: if the file cannot be opened or read.
        """
        with open(file_path, "rb") as file:
            return b"\x00" in file.read(1024)

    @staticmethod
    def _parse_patterns(raw_patterns: str):
        """Split a comma-separated pattern string into (include, exclude) lists.

        Patterns prefixed with ``!`` are exclusions. Empty entries (produced by
        stray or trailing commas) are dropped because ``Path.match`` rejects an
        empty pattern.
        """
        include: List[str] = []
        exclude: List[str] = []
        for chunk in raw_patterns.split(","):
            pattern = chunk.strip()
            if pattern.startswith("!"):
                pattern = pattern[1:].strip()
                target = exclude
            else:
                target = include
            if pattern:
                target.append(pattern)
        return include, exclude

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` configured from the component inputs.

        The loader receives a single combined filter that rejects binary files
        and then requires every configured filter (path patterns, content
        regex) to accept the file.

        Returns:
            A ``GitLoader`` for ``repo_path``; ``clone_url`` and ``branch`` are
            forwarded, with blank values replaced by ``None`` and ``"main"``.

        Raises:
            re.error: if ``content_filter`` is not a valid regular expression.
        """
        file_filter_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        file_filters = []

        if file_filter_patterns:
            include, exclude = self._parse_patterns(file_filter_patterns)

            def file_filter(file_path: Path) -> bool:
                if any(file_path.match(pattern) for pattern in exclude):
                    return False
                # With no include patterns, everything not excluded is kept;
                # otherwise an exclusion-only list would reject every file.
                return not include or any(file_path.match(pattern) for pattern in include)

            # Skip the filter entirely when the input held only separators.
            if include or exclude:
                file_filters.append(file_filter)

        if content_filter_pattern:
            content_regex = re.compile(content_filter_pattern)

            def content_filter(file_path: Path) -> bool:
                with file_path.open("r", encoding="utf-8", errors="ignore") as file:
                    content = file.read()
                return bool(content_regex.search(content))

            file_filters.append(content_filter)

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)
            try:
                if self.is_binary(file_path):
                    return False
                return all(f(path) for f in file_filters)
            except OSError:
                # A candidate file may be missing or unreadable on disk;
                # skip it instead of aborting the whole load.
                return False

        return GitLoader(
            repo_path=self.repo_path,
            # A blank optional input should behave like "not provided".
            clone_url=self.clone_url or None,
            # The input advertises 'main' as its default branch.
            branch=self.branch or "main",
            file_filter=combined_filter,
        )

    def load_documents(self) -> List[Data]:
        """Load the repository files and return them as a list of ``Data``.

        The resulting list is also stored in ``self.status``.
        """
        gitloader = self.build_gitloader()
        # Convert while iterating; no intermediate document list is needed.
        data = [Data.from_document(doc) for doc in gitloader.lazy_load()]
        self.status = data
        return data
a/src/backend/base/langflow/components/documentloaders/__init__.py +++ b/src/backend/base/langflow/components/documentloaders/__init__.py @@ -1,3 +1,4 @@ from .Confluence import ConfluenceComponent +from .GitLoader import GitLoaderComponent -__all__ = ["ConfluenceComponent"] +__all__ = ["ConfluenceComponent", "GitLoaderComponent"] diff --git a/src/frontend/src/icons/GitLoader/Git.svg b/src/frontend/src/icons/GitLoader/Git.svg new file mode 100644 index 000000000..5bf444b9b --- /dev/null +++ b/src/frontend/src/icons/GitLoader/Git.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/src/frontend/src/icons/GitLoader/GitLoader.jsx b/src/frontend/src/icons/GitLoader/GitLoader.jsx new file mode 100644 index 000000000..f1b291643 --- /dev/null +++ b/src/frontend/src/icons/GitLoader/GitLoader.jsx @@ -0,0 +1,22 @@ +const GitLoaderIcon = (props) => ( + + + +); + +export default GitLoaderIcon; diff --git a/src/frontend/src/icons/GitLoader/index.tsx b/src/frontend/src/icons/GitLoader/index.tsx new file mode 100644 index 000000000..51e82f4f8 --- /dev/null +++ b/src/frontend/src/icons/GitLoader/index.tsx @@ -0,0 +1,9 @@ +import React, { forwardRef } from "react"; +import SvgGitLoader from "./GitLoader"; + +export const GitLoaderIcon = forwardRef< + SVGSVGElement, + React.PropsWithChildren<{}> +>((props, ref) => { + return ; +}); diff --git a/src/frontend/src/utils/styleUtils.ts b/src/frontend/src/utils/styleUtils.ts index 3bde56e9a..244e68fa6 100644 --- a/src/frontend/src/utils/styleUtils.ts +++ b/src/frontend/src/utils/styleUtils.ts @@ -179,6 +179,7 @@ import { EvernoteIcon } from "../icons/Evernote"; import { FBIcon } from "../icons/FacebookMessenger"; import { FirecrawlIcon } from "../icons/Firecrawl"; import { GitBookIcon } from "../icons/GitBook"; +import { GitLoaderIcon } from "../icons/GitLoader"; import { GoogleIcon } from "../icons/Google"; import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI"; import { @@ -588,4 +589,5 @@ export const 
nodeIconsLucide: iconsType = { Table: Table, AIML: AIMLIcon, "AI/ML": AIMLIcon, + GitLoader: GitLoaderIcon, };