feat: Add GitLoader Component with advanced filtering options (#2850)

* feat: Add GitLoader Component with advanced filtering options

This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options.

GitLoader Component:

- Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module.
- Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria.

Examples of `file_filter` usage:
- Include only .py files: `lambda file_path: file_path.endswith('.py')`
- Exclude .py files: `lambda file_path: not file_path.endswith('.py')`

This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities.

Features:

- Support for loading documents from Git repositories.
- Advanced file filtering options to include or exclude specific files.

* feat: Add GitLoader Component with advanced filtering options

This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options.

GitLoader Component:

- Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module.
- Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria.

Examples of `file_filter` usage:
- Include only .py files: `lambda file_path: file_path.endswith('.py')`
- Exclude .py files: `lambda file_path: not file_path.endswith('.py')`

This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities.

Features:

- Support for loading documents from Git repositories.
- Advanced file filtering options to include or exclude specific files.

* fix: Ensure proper evaluation and validation of file_filter in GitLoaderComponent

This commit fixes the issue where the GitLoaderComponent would fail if the file_filter input was not evaluated correctly. Changes include:

- Added a check to ensure that file_filter is a valid string before calling eval.
- Ensured that the evaluated file_filter is callable, otherwise it defaults to None.

* [autofix.ci] apply automated fixes

* feat: Enhance GitLoaderComponent with dynamic inputs, content filtering

- Changed inputs from `StrInput` to `MessageTextInput` to enable dynamic use with agents.
- Added `content_filter` field to allow additional content filtering using regex.
- Updated `file_filter` to support glob format, simplifying usage for users.
- Implemented binary file removal filter to exclude binary files from queries, aligning with the agent's purpose.

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes (attempt 2/3)

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Daniel Gines 2024-07-29 23:00:53 -03:00 committed by GitHub
commit d108ca11c5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 152 additions and 1 deletions

View file

@ -0,0 +1,116 @@
from pathlib import Path
from typing import List
import re
from langchain_community.document_loaders.git import GitLoader
from langflow.custom import Component
from langflow.io import MessageTextInput, Output
from langflow.schema import Data
class GitLoaderComponent(Component):
    """Load the text files of a Git repository as ``Data`` objects.

    Thin wrapper around ``langchain_community``'s ``GitLoader`` that adds
    optional glob-based path filtering and regex-based content filtering.
    Files that look binary are always skipped.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=True,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """Return True if the first 1024 bytes of *file_path* contain a null byte.

        Binary files are excluded from loading so that the regex content
        filter is only ever applied to text.
        """
        with open(file_path, "rb") as file:
            return b"\x00" in file.read(1024)

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` configured from this component's inputs.

        ``file_filter`` is a comma-separated list of glob patterns; a pattern
        prefixed with ``!`` excludes matching files. A file is accepted when it
        matches no exclusion pattern and, if any inclusion patterns are given,
        at least one of them. ``content_filter`` is a regex that must be found
        somewhere in the file's text.

        Returns:
            A ``GitLoader`` whose ``file_filter`` combines the path patterns,
            the binary-file check and the content regex.

        Raises:
            re.error: If ``content_filter`` is not a valid regular expression.
        """
        raw_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        include_patterns: List[str] = []
        exclude_patterns: List[str] = []
        if raw_patterns:
            for pattern in raw_patterns.split(","):
                pattern = pattern.strip()
                if pattern.startswith("!"):
                    target, pattern = exclude_patterns, pattern[1:].strip()
                else:
                    target = include_patterns
                # Drop empty entries (trailing comma, bare "!"): Path.match("")
                # raises ValueError("empty pattern").
                if pattern:
                    target.append(pattern)

        # Compile once here so an invalid regex fails before any file is visited.
        content_regex = re.compile(content_filter_pattern) if content_filter_pattern else None

        def matches_file_patterns(path: Path) -> bool:
            if any(path.match(pattern) for pattern in exclude_patterns):
                return False
            # With exclusions only (or no patterns at all) every remaining file
            # is accepted; otherwise at least one inclusion pattern must match.
            return not include_patterns or any(path.match(pattern) for pattern in include_patterns)

        def matches_content(path: Path) -> bool:
            if content_regex is None:
                return True
            with path.open("r", encoding="utf-8", errors="ignore") as file:
                return bool(content_regex.search(file.read()))

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)
            # Cheapest check first: the path patterns need no file I/O, so files
            # they reject are never opened for the binary sniff.
            if not matches_file_patterns(path):
                return False
            if self.is_binary(file_path):
                return False
            return matches_content(path)

        return GitLoader(
            repo_path=self.repo_path,
            clone_url=self.clone_url,
            branch=self.branch,
            file_filter=combined_filter,
        )

    def load_documents(self) -> List[Data]:
        """Load the filtered repository files and convert each one to ``Data``.

        The resulting list is also stored in ``self.status``.
        """
        gitloader = self.build_gitloader()
        data = [Data.from_document(doc) for doc in gitloader.lazy_load()]
        self.status = data
        return data

View file

@ -1,3 +1,4 @@
from .Confluence import ConfluenceComponent
from .GitLoader import GitLoaderComponent
__all__ = ["ConfluenceComponent"]
__all__ = ["ConfluenceComponent", "GitLoaderComponent"]

View file

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="92pt" height="92pt" viewBox="0 0 92 92"><defs><clipPath id="a"><path d="M0 .113h91.887V92H0Zm0 0"/></clipPath></defs><g clip-path="url(#a)"><path style="stroke:none;fill-rule:nonzero;fill:#f03c2e;fill-opacity:1" d="M90.156 41.965 50.036 1.848a5.918 5.918 0 0 0-8.372 0l-8.328 8.332 10.566 10.566a7.03 7.03 0 0 1 7.23 1.684 7.034 7.034 0 0 1 1.669 7.277l10.187 10.184a7.028 7.028 0 0 1 7.278 1.672 7.04 7.04 0 0 1 0 9.957 7.05 7.05 0 0 1-9.965 0 7.044 7.044 0 0 1-1.528-7.66l-9.5-9.497V59.36a7.04 7.04 0 0 1 1.86 11.29 7.04 7.04 0 0 1-9.957 0 7.04 7.04 0 0 1 0-9.958 7.06 7.06 0 0 1 2.304-1.539V33.926a7.049 7.049 0 0 1-3.82-9.234L29.242 14.272 1.73 41.777a5.925 5.925 0 0 0 0 8.371L41.852 90.27a5.925 5.925 0 0 0 8.37 0l39.934-39.934a5.925 5.925 0 0 0 0-8.371"/></g></svg>

After

Width:  |  Height:  |  Size: 819 B

View file

@ -0,0 +1,22 @@
/**
 * GitLoader icon rendered as an inline 32x32 SVG.
 *
 * All received props are spread onto the root <svg> after the default
 * attributes, so callers can override width, height, fill, etc.
 * The single <path> is filled with #f03c2e via its inline style.
 */
const GitLoaderIcon = (props) => (
<svg
xmlns="http://www.w3.org/2000/svg"
width="32"
height="32"
viewBox="0 0 32 32"
fill="none"
{...props}
>
<path
d="M31.349 14.191L17.451.293a1.938 1.938 0 0 0-2.738 0L11.618 3.39l3.47 3.47a2.311 2.311 0 0 1 2.377.554 2.31 2.31 0 0 1 .549 2.392l3.36 3.359a2.31 2.31 0 0 1 2.393.55 2.311 2.311 0 0 1 0 3.27 2.312 2.312 0 0 1-3.271 0 2.309 2.309 0 0 1-.501-2.511l-3.12-3.12V20.24a2.31 2.31 0 0 1 .611 3.701 2.31 2.31 0 0 1-3.27 0 2.31 2.31 0 0 1 0-3.27 2.324 2.324 0 0 1 .759-.509V11.925a2.35 2.35 0 0 1-1.27-3.082L9.747 4.741 1.73 12.758a1.938 1.938 0 0 0 0 2.737L14.628 28.393a1.938 1.938 0 0 0 2.737 0l13.372-13.371a1.938 1.938 0 0 0 0-2.738"
style={{
stroke: "none",
fillRule: "nonzero",
fill: "#f03c2e",
fillOpacity: 1,
}}
/>
</svg>
);
export default GitLoaderIcon;

View file

@ -0,0 +1,9 @@
import React, { forwardRef } from "react";
import SvgGitLoader from "./GitLoader";
/**
 * Ref-forwarding wrapper around the SvgGitLoader component imported from
 * "./GitLoader"; passes the received ref and all props straight through.
 *
 * NOTE(review): the ref only reaches the underlying <svg> if SvgGitLoader
 * itself accepts a ref — confirm against ./GitLoader.
 */
export const GitLoaderIcon = forwardRef<
SVGSVGElement,
React.PropsWithChildren<{}>
>((props, ref) => {
return <SvgGitLoader ref={ref} {...props} />;
});

View file

@ -179,6 +179,7 @@ import { EvernoteIcon } from "../icons/Evernote";
import { FBIcon } from "../icons/FacebookMessenger";
import { FirecrawlIcon } from "../icons/Firecrawl";
import { GitBookIcon } from "../icons/GitBook";
import { GitLoaderIcon } from "../icons/GitLoader";
import { GoogleIcon } from "../icons/Google";
import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI";
import {
@ -588,4 +589,5 @@ export const nodeIconsLucide: iconsType = {
Table: Table,
AIML: AIMLIcon,
"AI/ML": AIMLIcon,
GitLoader: GitLoaderIcon,
};