feat: Add GitLoader Component with advanced filtering options (#2850)
* feat: Add GitLoader Component with advanced filtering options
This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options.
GitLoader Component:
- Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module.
- Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria.
Examples of `file_filter` usage:
- Include only .py files: `lambda file_path: file_path.endswith('.py')`
- Exclude .py files: `lambda file_path: not file_path.endswith('.py')`
This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities.
Features:
- Support for loading documents from Git repositories.
- Advanced file filtering options to include or exclude specific files.
* feat: Add GitLoader Component with advanced filtering options
This commit introduces the GitLoaderComponent, enabling users to load files from a Git repository with advanced filtering options.
GitLoader Component:
- Implementation of the GitLoaderComponent to load files from a Git repository using the `langchain_community.document_loaders.git.GitLoader` module.
- Advanced filtering option using `file_filter` to include or exclude specific files based on their extensions or other criteria.
Examples of `file_filter` usage:
- Include only .py files: `lambda file_path: file_path.endswith('.py')`
- Exclude .py files: `lambda file_path: not file_path.endswith('.py')`
This component ensures a flexible and customizable approach for loading documents from Git repositories, enhancing the user experience with advanced filtering capabilities.
Features:
- Support for loading documents from Git repositories.
- Advanced file filtering options to include or exclude specific files.
* fix: Ensure proper evaluation and validation of file_filter in GitLoaderComponent
This commit fixes the issue where the GitLoaderComponent would fail if the file_filter input was not evaluated correctly. Changes include:
- Added a check to ensure that file_filter is a valid string before calling eval.
- Ensured that the evaluated file_filter is callable, otherwise it defaults to None.
* [autofix.ci] apply automated fixes
* feat: Enhance GitLoaderComponent with dynamic inputs, content filtering
- Changed inputs from `StrInput` to `MessageTextInput` to enable dynamic use with agents.
- Added `content_filter` field to allow additional content filtering using regex.
- Updated `file_filter` to support glob format, simplifying usage for users.
- Implemented a filter that removes binary files so they are excluded from queries, aligning with the agent's purpose.
* [autofix.ci] apply automated fixes
* [autofix.ci] apply automated fixes (attempt 2/3)
---------
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
12e3b51980
commit
d108ca11c5
6 changed files with 152 additions and 1 deletions
|
|
@ -0,0 +1,116 @@
|
|||
from pathlib import Path
|
||||
from typing import List
|
||||
import re
|
||||
|
||||
from langchain_community.document_loaders.git import GitLoader
|
||||
from langflow.custom import Component
|
||||
from langflow.io import MessageTextInput, Output
|
||||
from langflow.schema import Data
|
||||
|
||||
|
||||
class GitLoaderComponent(Component):
    """Component that loads files from a Git repository through ``GitLoader``.

    Files can be narrowed down with comma-separated glob patterns
    (``file_filter``) and with a regular expression that is searched in the
    file content (``content_filter``). Files that look binary are always
    skipped.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=True,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """
        Check if a file is binary by looking for null bytes.

        Only the first 1024 bytes are inspected. This is necessary because
        when searches are performed using the content_filter, binary files
        need to be ignored.
        """
        with open(file_path, "rb") as file:
            return b"\x00" in file.read(1024)

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` whose ``file_filter`` combines all configured filters.

        The combined filter rejects binary files first, then requires every
        configured filter (glob patterns, content regex) to accept the file.

        Returns:
            A ``GitLoader`` built from ``repo_path``, ``clone_url`` and ``branch``.

        Raises:
            re.error: If ``content_filter`` is not a valid regular expression.
        """
        file_filter_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        file_filters = []

        if file_filter_patterns:
            # Split the comma-separated list into inclusion and exclusion
            # ('!'-prefixed) globs. Empty entries (trailing comma, bare '!')
            # are dropped because Path.match() raises ValueError on "".
            include_patterns = []
            exclude_patterns = []
            for raw_pattern in file_filter_patterns.split(","):
                pattern = raw_pattern.strip()
                if pattern.startswith("!"):
                    negated = pattern[1:]
                    if negated:
                        exclude_patterns.append(negated)
                elif pattern:
                    include_patterns.append(pattern)

            def file_filter(file_path: Path) -> bool:
                if any(file_path.match(pattern) for pattern in exclude_patterns):
                    return False
                # When only exclusion patterns are given, everything that is
                # not excluded is accepted; otherwise at least one inclusion
                # pattern has to match.
                if not include_patterns:
                    return True
                return any(file_path.match(pattern) for pattern in include_patterns)

            if include_patterns or exclude_patterns:
                file_filters.append(file_filter)

        if content_filter_pattern:
            content_regex = re.compile(content_filter_pattern)

            def content_filter(file_path: Path) -> bool:
                # Undecodable bytes are ignored so that a stray non-UTF-8
                # sequence does not abort the search.
                with file_path.open("r", encoding="utf-8", errors="ignore") as file:
                    content = file.read()
                return bool(content_regex.search(content))

            file_filters.append(content_filter)

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)
            if self.is_binary(file_path):
                return False
            return all(f(path) for f in file_filters)

        loader = GitLoader(
            repo_path=self.repo_path,
            clone_url=self.clone_url,
            branch=self.branch,
            file_filter=combined_filter,
        )
        return loader

    def load_documents(self) -> List[Data]:
        """Load the filtered repository files and convert each document to ``Data``.

        The resulting list is also stored in ``self.status``.
        """
        gitloader = self.build_gitloader()
        documents = list(gitloader.lazy_load())
        data = [Data.from_document(doc) for doc in documents]
        self.status = data
        return data
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
from .Confluence import ConfluenceComponent
|
||||
from .GitLoader import GitLoaderComponent
|
||||
|
||||
__all__ = ["ConfluenceComponent"]
|
||||
__all__ = ["ConfluenceComponent", "GitLoaderComponent"]
|
||||
|
|
|
|||
1
src/frontend/src/icons/GitLoader/Git.svg
Normal file
1
src/frontend/src/icons/GitLoader/Git.svg
Normal file
|
|
@ -0,0 +1 @@
|
|||
<svg xmlns="http://www.w3.org/2000/svg" width="92pt" height="92pt" viewBox="0 0 92 92"><defs><clipPath id="a"><path d="M0 .113h91.887V92H0Zm0 0"/></clipPath></defs><g clip-path="url(#a)"><path style="stroke:none;fill-rule:nonzero;fill:#f03c2e;fill-opacity:1" d="M90.156 41.965 50.036 1.848a5.918 5.918 0 0 0-8.372 0l-8.328 8.332 10.566 10.566a7.03 7.03 0 0 1 7.23 1.684 7.034 7.034 0 0 1 1.669 7.277l10.187 10.184a7.028 7.028 0 0 1 7.278 1.672 7.04 7.04 0 0 1 0 9.957 7.05 7.05 0 0 1-9.965 0 7.044 7.044 0 0 1-1.528-7.66l-9.5-9.497V59.36a7.04 7.04 0 0 1 1.86 11.29 7.04 7.04 0 0 1-9.957 0 7.04 7.04 0 0 1 0-9.958 7.06 7.06 0 0 1 2.304-1.539V33.926a7.049 7.049 0 0 1-3.82-9.234L29.242 14.272 1.73 41.777a5.925 5.925 0 0 0 0 8.371L41.852 90.27a5.925 5.925 0 0 0 8.37 0l39.934-39.934a5.925 5.925 0 0 0 0-8.371"/></g></svg>
|
||||
|
After Width: | Height: | Size: 819 B |
22
src/frontend/src/icons/GitLoader/GitLoader.jsx
Normal file
22
src/frontend/src/icons/GitLoader/GitLoader.jsx
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
const GitLoaderIcon = (props) => (
|
||||
<svg
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
width="32"
|
||||
height="32"
|
||||
viewBox="0 0 32 32"
|
||||
fill="none"
|
||||
{...props}
|
||||
>
|
||||
<path
|
||||
d="M31.349 14.191L17.451.293a1.938 1.938 0 0 0-2.738 0L11.618 3.39l3.47 3.47a2.311 2.311 0 0 1 2.377.554 2.31 2.31 0 0 1 .549 2.392l3.36 3.359a2.31 2.31 0 0 1 2.393.55 2.311 2.311 0 0 1 0 3.27 2.312 2.312 0 0 1-3.271 0 2.309 2.309 0 0 1-.501-2.511l-3.12-3.12V20.24a2.31 2.31 0 0 1 .611 3.701 2.31 2.31 0 0 1-3.27 0 2.31 2.31 0 0 1 0-3.27 2.324 2.324 0 0 1 .759-.509V11.925a2.35 2.35 0 0 1-1.27-3.082L9.747 4.741 1.73 12.758a1.938 1.938 0 0 0 0 2.737L14.628 28.393a1.938 1.938 0 0 0 2.737 0l13.372-13.371a1.938 1.938 0 0 0 0-2.738"
|
||||
style={{
|
||||
stroke: "none",
|
||||
fillRule: "nonzero",
|
||||
fill: "#f03c2e",
|
||||
fillOpacity: 1,
|
||||
}}
|
||||
/>
|
||||
</svg>
|
||||
);
|
||||
|
||||
export default GitLoaderIcon;
|
||||
9
src/frontend/src/icons/GitLoader/index.tsx
Normal file
9
src/frontend/src/icons/GitLoader/index.tsx
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
import React, { forwardRef } from "react";
|
||||
import SvgGitLoader from "./GitLoader";
|
||||
|
||||
/**
 * Ref-forwarding wrapper around the GitLoader SVG component.
 *
 * The received ref and all props are handed straight to `SvgGitLoader`.
 */
export const GitLoaderIcon = forwardRef<
  SVGSVGElement,
  React.PropsWithChildren<{}>
>((props, ref) => <SvgGitLoader ref={ref} {...props} />);
|
||||
|
|
@ -179,6 +179,7 @@ import { EvernoteIcon } from "../icons/Evernote";
|
|||
import { FBIcon } from "../icons/FacebookMessenger";
|
||||
import { FirecrawlIcon } from "../icons/Firecrawl";
|
||||
import { GitBookIcon } from "../icons/GitBook";
|
||||
import { GitLoaderIcon } from "../icons/GitLoader";
|
||||
import { GoogleIcon } from "../icons/Google";
|
||||
import { GoogleGenerativeAIIcon } from "../icons/GoogleGenerativeAI";
|
||||
import {
|
||||
|
|
@ -588,4 +589,5 @@ export const nodeIconsLucide: iconsType = {
|
|||
Table: Table,
|
||||
AIML: AIMLIcon,
|
||||
"AI/ML": AIMLIcon,
|
||||
GitLoader: GitLoaderIcon,
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue