From f384f9fdbf9e89a6099b96d43181d7bd5a381be1 Mon Sep 17 00:00:00 2001
From: DiogenesBR
Date: Mon, 25 Sep 2023 19:58:27 +0000
Subject: [PATCH] Loaders Genericos

---
Review notes (this section is ignored when the patch is applied):
- Only added lines were revised; context and removed lines are unchanged.
- Hunk line counts and the diffstat below were recomputed for the revised
  content. The blob ids on the "index" lines predate the revision.

 .../documentloaders/DirectoryLoader.py        | 100 ++++++++
 .../components/documentloaders/FileLoader.py  | 226 +++++++++---------
 .../components/documentloaders/UrlLoader.py   |  66 +++++
 3 files changed, 281 insertions(+), 111 deletions(-)
 create mode 100644 src/backend/langflow/components/documentloaders/DirectoryLoader.py
 create mode 100644 src/backend/langflow/components/documentloaders/UrlLoader.py

diff --git a/src/backend/langflow/components/documentloaders/DirectoryLoader.py b/src/backend/langflow/components/documentloaders/DirectoryLoader.py
new file mode 100644
index 000000000..ef86164b7
--- /dev/null
+++ b/src/backend/langflow/components/documentloaders/DirectoryLoader.py
@@ -0,0 +1,100 @@
+from langflow import CustomComponent
+from langchain.schema import Document
+from langflow.components.documentloaders.FileLoader import loaders_info
+import os
+
+
+class DirectoryLoaderComponent(CustomComponent):
+    """Load every regular file of a directory with a langchain document loader."""
+
+    display_name: str = "Directory Loader"
+    description: str = "Generic File Loader"
+    beta = True
+
+    def build_config(self):
+        """Describe the directory path field and the loader dropdown."""
+        loader_options = ["Automatic"] + [
+            loader_info["name"] for loader_info in loaders_info
+        ]
+
+        return {
+            "directory_path": {
+                "display_name": "Directory Path",
+                "required": True,
+            },
+            "loader": {
+                "display_name": "Loader",
+                "is_list": True,
+                "required": True,
+                "options": loader_options,
+                "value": "Automatic",
+            },
+            "code": {"show": False},
+        }
+
+    def _default_loader_info(self, file_name):
+        """Return the loader info that is the default for file_name, or None."""
+        file_type = os.path.splitext(file_name)[1].lstrip(".").lower()
+        for info in loaders_info:
+            if file_type in info.get("defaultFor", []):
+                return info
+        return None
+
+    def _import_loader(self, loader, loader_info):
+        """Import and return the class named by the "import" entry of loader_info."""
+        module_name, class_name = loader_info["import"].rsplit(".", 1)
+        try:
+            # Import the loader dynamically
+            loader_module = __import__(module_name, fromlist=[class_name])
+            return getattr(loader_module, class_name)
+        except (ImportError, AttributeError) as e:
+            raise ValueError(
+                f"Loader {loader} could not be imported\nLoader info:\n{loader_info}"
+            ) from e
+
+    def build(self, directory_path: str, loader: str) -> Document:
+        """Load the files located directly inside directory_path.
+
+        With loader == "Automatic" every file is read by the default loader of
+        its own extension and files without a default loader are skipped;
+        otherwise the selected loader is applied to every file.
+
+        Returns a flat list with the documents of all loaded files.
+        Raises ValueError if the directory, the selected loader or a default
+        loader for at least one file cannot be found or imported.
+        """
+        # isdir also rejects a path that exists but is a regular file
+        if not os.path.isdir(directory_path):
+            raise ValueError(f"Directory not found: {directory_path}")
+
+        # Sorted so that the order of the returned documents is stable
+        files = sorted(
+            f
+            for f in os.listdir(directory_path)
+            if os.path.isfile(os.path.join(directory_path, f))
+        )
+
+        if loader == "Automatic":
+            # Choose the loader per file: one loader cannot read mixed types
+            candidates = [(f, self._default_loader_info(f)) for f in files]
+            selected = [(f, info) for f, info in candidates if info is not None]
+            if not selected:
+                raise ValueError(
+                    "No default loader found for any file in the directory"
+                )
+        else:
+            loader_info = next(
+                (info for info in loaders_info if info["name"] == loader), None
+            )
+            if loader_info is None:
+                raise ValueError(f"Loader {loader} not found in the loader info list")
+            selected = [(f, loader_info) for f in files]
+
+        results = []
+        for file, info in selected:
+            loader_class = self._import_loader(loader, info)
+            file_path = os.path.join(directory_path, file)
+            # load() returns a list of documents; extend keeps the result flat
+            results.extend(loader_class(file_path=file_path).load())
+
+        return results
diff --git a/src/backend/langflow/components/documentloaders/FileLoader.py b/src/backend/langflow/components/documentloaders/FileLoader.py
index 3dbffe784..0dba48727 100644
--- a/src/backend/langflow/components/documentloaders/FileLoader.py
+++ b/src/backend/langflow/components/documentloaders/FileLoader.py
@@ -1,126 +1,130 @@
 from langflow import CustomComponent
 from langchain.schema import Document
 
+# Registry of the supported langchain document loaders. "import" is the dotted
+# path of the loader class, "defaultFor" lists the extensions that select the
+# loader in "Automatic" mode and "allowedTypes" the extensions build_config collects.
+loaders_info = [
+    {
+        "loader": "AirbyteJSONLoader",
+        "name": "Airbyte JSON (.jsonl)",
+        "import": "langchain.document_loaders.AirbyteJSONLoader",
+        "defaultFor": ["jsonl"],
+        "allowedTypes": ["jsonl"],
+    },
+    {
+        "loader": "BSHTMLLoader",
+        "name": "BeautifulSoup4 HTML (.html, .htm)",
+        "import": "langchain.document_loaders.BSHTMLLoader",
+        "allowedTypes": ["html", "htm"],
+    },
+    {
+        "loader": "CSVLoader",
+        "name": "CSV (.csv)",
+        "import": "langchain.document_loaders.CSVLoader",
+        "defaultFor": ["csv"],
+        "allowedTypes": ["csv"],
+    },
+    {
+        "loader": "CoNLLULoader",
+        "name": "CoNLL-U (.conllu)",
+        "import": "langchain.document_loaders.CoNLLULoader",
+        "defaultFor": ["conllu"],
+        "allowedTypes": ["conllu"],
+    },
+    {
+        "loader": "EverNoteLoader",
+        "name": "EverNote (.enex)",
+        "import": "langchain.document_loaders.EverNoteLoader",
+        "defaultFor": ["enex"],
+        "allowedTypes": ["enex"],
+    },
+    {
+        "loader": "FacebookChatLoader",
+        "name": "Facebook Chat (.json)",
+        "import": "langchain.document_loaders.FacebookChatLoader",
+        "allowedTypes": ["json"],
+    },
+    {
+        "loader": "OutlookMessageLoader",
+        "name": "Outlook Message (.msg)",
+        "import": "langchain.document_loaders.OutlookMessageLoader",
+        "defaultFor": ["msg"],
+        "allowedTypes": ["msg"],
+    },
+    {
+        "loader": "PyPDFLoader",
+        "name": "PyPDF (.pdf)",
+        "import": "langchain.document_loaders.PyPDFLoader",
+        "defaultFor": ["pdf"],
+        "allowedTypes": ["pdf"],
+    },
+    {
+        "loader": "SRTLoader",
+        "name": "Subtitle (.srt)",
+        "import": "langchain.document_loaders.SRTLoader",
+        "defaultFor": ["srt"],
+        "allowedTypes": ["srt"],
+    },
+    {
+        "loader": "TextLoader",
+        "name": "Text (.txt)",
+        "import": "langchain.document_loaders.TextLoader",
+        "defaultFor": ["txt"],
+        "allowedTypes": ["txt"],
+    },
+    {
+        "loader": "UnstructuredEmailLoader",
+        "name": "Unstructured Email (.eml)",
+        "import": "langchain.document_loaders.UnstructuredEmailLoader",
+        "defaultFor": ["eml"],
+        "allowedTypes": ["eml"],
+    },
+    {
+        "loader": "UnstructuredHTMLLoader",
+        "name": "Unstructured HTML (.html, .htm)",
+        "import": "langchain.document_loaders.UnstructuredHTMLLoader",
+        "defaultFor": ["html", "htm"],
+        "allowedTypes": ["html", "htm"],
+    },
+    {
+        "loader": "UnstructuredMarkdownLoader",
+        "name": "Unstructured Markdown (.md)",
+        "import": "langchain.document_loaders.UnstructuredMarkdownLoader",
+        "defaultFor": ["md"],
+        "allowedTypes": ["md"],
+    },
+    {
+        "loader": "UnstructuredPowerPointLoader",
+        "name": "Unstructured PowerPoint (.pptx)",
+        "import": "langchain.document_loaders.UnstructuredPowerPointLoader",
+        "defaultFor": ["pptx"],
+        "allowedTypes": ["pptx"],
+    },
+    {
+        "loader": "UnstructuredWordLoader",
+        "name": "Unstructured Word (.docx)",
+        "import": "langchain.document_loaders.UnstructuredWordLoader",
+        "defaultFor": ["docx"],
+        "allowedTypes": ["docx"],
+    },
+]
+
 
 class FileLoaderComponent(CustomComponent):
-    display_name: str = "File Loader Component"
+    display_name: str = "File Loader"
     description: str = "Generic File Loader"
-
-    loaders_info = [
-        {
-            "loader": "AirbyteJSONLoader",
-            "name": "Airbyte JSON (.jsonl)",
-            "import": "langchain.document_loaders.AirbyteJSONLoader",
-            "defaultFor": ["jsonl"],
-            "allowdTypes": ["jsonl"],
-        },
-        {
-            "loader": "BSHTMLLoader",
-            "name": "BeautifulSoup4 HTML (.html, .htm)",
-            "import": "langchain.document_loaders.BSHTMLLoader",
-            "allowdTypes": ["html", "htm"],
-        },
-        {
-            "loader": "CSVLoader",
-            "name": "CSV (.csv)",
-            "import": "langchain.document_loaders.CSVLoader",
-            "defaultFor": ["csv"],
-            "allowdTypes": ["csv"],
-        },
-        {
-            "loader": "CoNLLULoader",
-            "name": "CoNLL-U (.conllu)",
-            "import": "langchain.document_loaders.CoNLLULoader",
-            "defaultFor": ["conllu"],
-            "allowdTypes": ["conllu"],
-        },
-        {
-            "loader": "EverNoteLoader",
-            "name": "EverNote (.enex)",
-            "import": "langchain.document_loaders.EverNoteLoader",
-            "defaultFor": ["enex"],
-            "allowdTypes": ["enex"],
-        },
-        {
-            "loader": "FacebookChatLoader",
-            "name": "Facebook Chat (.json)",
-            "import": "langchain.document_loaders.FacebookChatLoader",
-            "allowdTypes": ["json"],
-        },
-        {
-            "loader": "OutlookMessageLoader",
-            "name": "Outlook Message (.msg)",
-            "import": "langchain.document_loaders.OutlookMessageLoader",
-            "defaultFor": ["msg"],
-            "allowdTypes": ["msg"],
-        },
-        {
-            "loader": "PyPDFLoader",
-            "name": "PyPDF (.pdf)",
-            "import": "langchain.document_loaders.PyPDFLoader",
-            "defaultFor": ["pdf"],
-            "allowdTypes": ["pdf"],
-        },
-        {
-            "loader": "STRLoader",
-            "name": "Subtitle (.str)",
-            "import": "langchain.document_loaders.STRLoader",
-            "defaultFor": ["str"],
-            "allowdTypes": ["str"],
-        },
-        {
-            "loader": "TextLoader",
-            "name": "Text (.txt)",
-            "import": "langchain.document_loaders.TextLoader",
-            "defaultFor": ["txt"],
-            "allowdTypes": ["txt"],
-        },
-        {
-            "loader": "UnstructuredEmailLoader",
-            "name": "Unstructured Email (.eml)",
-            "import": "langchain.document_loaders.UnstructuredEmailLoader",
-            "defaultFor": ["eml"],
-            "allowdTypes": ["eml"],
-        },
-        {
-            "loader": "UnstructuredHTMLLoader",
-            "name": "Unstructured HTML (.html, .htm)",
-            "import": "langchain.document_loaders.UnstructuredHTMLLoader",
-            "defaultFor": ["html", "htm"],
-            "allowdTypes": ["html", "htm"],
-        },
-        {
-            "loader": "UnstructuredMarkdownLoader",
-            "name": "Unstructured Markdown (.md)",
-            "import": "langchain.document_loaders.UnstructuredMarkdownLoader",
-            "defaultFor": ["md"],
-            "allowdTypes": ["md"],
-        },
-        {
-            "loader": "UnstructuredPowerPointLoader",
-            "name": "Unstructured PowerPoint (.pptx)",
-            "import": "langchain.document_loaders.UnstructuredPowerPointLoader",
-            "defaultFor": ["pptx"],
-            "allowdTypes": ["pptx"],
-        },
-        {
-            "loader": "UnstructuredWordLoader",
-            "name": "Unstructured Word (.docx)",
-            "import": "langchain.document_loaders.UnstructuredWordLoader",
-            "defaultFor": ["docx"],
-            "allowdTypes": ["docx"],
-        },
-    ]
+    beta = True
 
     def build_config(self):
         loader_options = ["Automatic"] + [
-            loader_info["name"] for loader_info in self.loaders_info
+            loader_info["name"] for loader_info in loaders_info
         ]
 
         file_types = []
         suffixes = []
 
-        for loader_info in self.loaders_info:
+        for loader_info in loaders_info:
             if "allowedTypes" in loader_info:
                 file_types.extend(loader_info["allowedTypes"])
                 suffixes.extend([f".{ext}" for ext in loader_info["allowedTypes"]])
@@ -150,7 +154,7 @@ class FileLoaderComponent(CustomComponent):
 
         # Mapeie o nome do loader selecionado para suas informações
         selected_loader_info = None
-        for loader_info in self.loaders_info:
+        for loader_info in loaders_info:
             if loader_info["name"] == loader:
                 selected_loader_info = loader_info
                 break
@@ -161,7 +165,7 @@ class FileLoaderComponent(CustomComponent):
         if loader == "Automatic":
             # Determine o loader automaticamente com base na extensão do arquivo
             default_loader_info = None
-            for info in self.loaders_info:
+            for info in loaders_info:
                 if "defaultFor" in info and file_type in info["defaultFor"]:
                     default_loader_info = info
                     break
diff --git a/src/backend/langflow/components/documentloaders/UrlLoader.py b/src/backend/langflow/components/documentloaders/UrlLoader.py
new file mode 100644
index 000000000..ae75e6fab
--- /dev/null
+++ b/src/backend/langflow/components/documentloaders/UrlLoader.py
@@ -0,0 +1,66 @@
+from langflow import CustomComponent
+from langchain.document_loaders import AZLyricsLoader
+from langchain.document_loaders import CollegeConfidentialLoader
+from langchain.document_loaders import GitbookLoader
+from langchain.document_loaders import HNLoader
+from langchain.document_loaders import IFixitLoader
+from langchain.document_loaders import IMSDbLoader
+from langchain.document_loaders import WebBaseLoader
+
+
+from langchain.schema import Document
+
+
+class UrlLoaderComponent(CustomComponent):
+    """Load the documents behind a URL with a selectable langchain web loader."""
+
+    display_name: str = "Url Loader"
+    description: str = "Generic Url Loader Component"
+
+    def build_config(self):
+        """Describe the URL field and the loader dropdown."""
+        return {
+            "web_path": {
+                "display_name": "Url",
+                "required": True,
+            },
+            "loader": {
+                "display_name": "Loader",
+                "is_list": True,
+                "required": True,
+                "options": [
+                    "AZLyricsLoader",
+                    "CollegeConfidentialLoader",
+                    "GitbookLoader",
+                    "HNLoader",
+                    "IFixitLoader",
+                    "IMSDbLoader",
+                    "WebBaseLoader",
+                ],
+                "value": "WebBaseLoader",
+            },
+            "code": {"show": False},
+        }
+
+    def build(self, web_path: str, loader: str) -> Document:
+        """Load web_path with the loader class selected by name.
+
+        Returns the documents produced by the loader's load() method.
+        Raises ValueError when loader is not one of the offered options.
+        """
+        loader_classes = {
+            "AZLyricsLoader": AZLyricsLoader,
+            "CollegeConfidentialLoader": CollegeConfidentialLoader,
+            "GitbookLoader": GitbookLoader,
+            "HNLoader": HNLoader,
+            "IFixitLoader": IFixitLoader,
+            "IMSDbLoader": IMSDbLoader,
+            "WebBaseLoader": WebBaseLoader,
+        }
+
+        # Unknown names map to None so they are reported as a ValueError below
+        loader_class = loader_classes.get(loader)
+        if loader_class is None:
+            raise ValueError(f"No loader found for: {loader}")
+
+        return loader_class(web_path=web_path).load()