Loaders Genericos
This commit is contained in:
parent
2f5dba1210
commit
f384f9fdbf
3 changed files with 269 additions and 111 deletions
|
|
@ -0,0 +1,96 @@
|
|||
from langflow import CustomComponent
|
||||
from langchain.schema import Document
|
||||
from langflow.components.documentloaders.FileLoader import loaders_info
|
||||
import os
|
||||
|
||||
|
||||
class DirectoryLoaderComponent(CustomComponent):
|
||||
display_name: str = "Directory Loader"
|
||||
description: str = "Generic File Loader"
|
||||
beta = True
|
||||
|
||||
def build_config(self):
|
||||
loader_options = ["Automatic"] + [
|
||||
loader_info["name"] for loader_info in loaders_info
|
||||
]
|
||||
|
||||
file_types = []
|
||||
suffixes = []
|
||||
|
||||
for loader_info in loaders_info:
|
||||
if "allowedTypes" in loader_info:
|
||||
file_types.extend(loader_info["allowedTypes"])
|
||||
suffixes.extend([f".{ext}" for ext in loader_info["allowedTypes"]])
|
||||
|
||||
return {
|
||||
"directory_path": {
|
||||
"display_name": "Directory Path",
|
||||
"required": True,
|
||||
},
|
||||
"loader": {
|
||||
"display_name": "Loader",
|
||||
"is_list": True,
|
||||
"required": True,
|
||||
"options": loader_options,
|
||||
"value": "Automatic",
|
||||
},
|
||||
"code": {"show": False},
|
||||
}
|
||||
|
||||
def build(self, directory_path: str, loader: str) -> Document:
|
||||
# Verifique se o diretório existe
|
||||
if not os.path.exists(directory_path):
|
||||
raise ValueError(f"Directory not found: {directory_path}")
|
||||
|
||||
# Lista os arquivos no diretório
|
||||
files = [
|
||||
f
|
||||
for f in os.listdir(directory_path)
|
||||
if os.path.isfile(os.path.join(directory_path, f))
|
||||
]
|
||||
|
||||
# Determine o loader automaticamente com base nas extensões dos arquivos
|
||||
loader_info = None
|
||||
if loader == "Automatic":
|
||||
for file in files:
|
||||
file_type = file.split(".")[-1]
|
||||
for info in loaders_info:
|
||||
if "defaultFor" in info and file_type in info["defaultFor"]:
|
||||
loader_info = info
|
||||
break
|
||||
if loader_info:
|
||||
break
|
||||
|
||||
if not loader_info:
|
||||
raise ValueError(
|
||||
"No default loader found for any file in the directory"
|
||||
)
|
||||
|
||||
else:
|
||||
for info in loaders_info:
|
||||
if info["name"] == loader:
|
||||
loader_info = info
|
||||
break
|
||||
|
||||
if not loader_info:
|
||||
raise ValueError(f"Loader {loader} not found in the loader info list")
|
||||
|
||||
loader_import = loader_info["import"]
|
||||
module_name, class_name = loader_import.rsplit(".", 1)
|
||||
|
||||
try:
|
||||
# Importe o loader dinamicamente
|
||||
loader_module = __import__(module_name, fromlist=[class_name])
|
||||
loader_instance = getattr(loader_module, class_name)
|
||||
except ImportError as e:
|
||||
raise ValueError(
|
||||
f"Loader {loader} could not be imported\nLoader info:\n{loader_info}"
|
||||
) from e
|
||||
|
||||
results = []
|
||||
for file in files:
|
||||
file_path = os.path.join(directory_path, file)
|
||||
result = loader_instance(file_path=file_path).load()
|
||||
results.append(result)
|
||||
|
||||
return results
|
||||
|
|
@ -1,126 +1,127 @@
|
|||
from langflow import CustomComponent
|
||||
from langchain.schema import Document
|
||||
|
||||
loaders_info = [
|
||||
{
|
||||
"loader": "AirbyteJSONLoader",
|
||||
"name": "Airbyte JSON (.jsonl)",
|
||||
"import": "langchain.document_loaders.AirbyteJSONLoader",
|
||||
"defaultFor": ["jsonl"],
|
||||
"allowdTypes": ["jsonl"],
|
||||
},
|
||||
{
|
||||
"loader": "BSHTMLLoader",
|
||||
"name": "BeautifulSoup4 HTML (.html, .htm)",
|
||||
"import": "langchain.document_loaders.BSHTMLLoader",
|
||||
"allowdTypes": ["html", "htm"],
|
||||
},
|
||||
{
|
||||
"loader": "CSVLoader",
|
||||
"name": "CSV (.csv)",
|
||||
"import": "langchain.document_loaders.CSVLoader",
|
||||
"defaultFor": ["csv"],
|
||||
"allowdTypes": ["csv"],
|
||||
},
|
||||
{
|
||||
"loader": "CoNLLULoader",
|
||||
"name": "CoNLL-U (.conllu)",
|
||||
"import": "langchain.document_loaders.CoNLLULoader",
|
||||
"defaultFor": ["conllu"],
|
||||
"allowdTypes": ["conllu"],
|
||||
},
|
||||
{
|
||||
"loader": "EverNoteLoader",
|
||||
"name": "EverNote (.enex)",
|
||||
"import": "langchain.document_loaders.EverNoteLoader",
|
||||
"defaultFor": ["enex"],
|
||||
"allowdTypes": ["enex"],
|
||||
},
|
||||
{
|
||||
"loader": "FacebookChatLoader",
|
||||
"name": "Facebook Chat (.json)",
|
||||
"import": "langchain.document_loaders.FacebookChatLoader",
|
||||
"allowdTypes": ["json"],
|
||||
},
|
||||
{
|
||||
"loader": "OutlookMessageLoader",
|
||||
"name": "Outlook Message (.msg)",
|
||||
"import": "langchain.document_loaders.OutlookMessageLoader",
|
||||
"defaultFor": ["msg"],
|
||||
"allowdTypes": ["msg"],
|
||||
},
|
||||
{
|
||||
"loader": "PyPDFLoader",
|
||||
"name": "PyPDF (.pdf)",
|
||||
"import": "langchain.document_loaders.PyPDFLoader",
|
||||
"defaultFor": ["pdf"],
|
||||
"allowdTypes": ["pdf"],
|
||||
},
|
||||
{
|
||||
"loader": "STRLoader",
|
||||
"name": "Subtitle (.str)",
|
||||
"import": "langchain.document_loaders.STRLoader",
|
||||
"defaultFor": ["str"],
|
||||
"allowdTypes": ["str"],
|
||||
},
|
||||
{
|
||||
"loader": "TextLoader",
|
||||
"name": "Text (.txt)",
|
||||
"import": "langchain.document_loaders.TextLoader",
|
||||
"defaultFor": ["txt"],
|
||||
"allowdTypes": ["txt"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredEmailLoader",
|
||||
"name": "Unstructured Email (.eml)",
|
||||
"import": "langchain.document_loaders.UnstructuredEmailLoader",
|
||||
"defaultFor": ["eml"],
|
||||
"allowdTypes": ["eml"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredHTMLLoader",
|
||||
"name": "Unstructured HTML (.html, .htm)",
|
||||
"import": "langchain.document_loaders.UnstructuredHTMLLoader",
|
||||
"defaultFor": ["html", "htm"],
|
||||
"allowdTypes": ["html", "htm"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredMarkdownLoader",
|
||||
"name": "Unstructured Markdown (.md)",
|
||||
"import": "langchain.document_loaders.UnstructuredMarkdownLoader",
|
||||
"defaultFor": ["md"],
|
||||
"allowdTypes": ["md"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredPowerPointLoader",
|
||||
"name": "Unstructured PowerPoint (.pptx)",
|
||||
"import": "langchain.document_loaders.UnstructuredPowerPointLoader",
|
||||
"defaultFor": ["pptx"],
|
||||
"allowdTypes": ["pptx"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredWordLoader",
|
||||
"name": "Unstructured Word (.docx)",
|
||||
"import": "langchain.document_loaders.UnstructuredWordLoader",
|
||||
"defaultFor": ["docx"],
|
||||
"allowdTypes": ["docx"],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
class FileLoaderComponent(CustomComponent):
|
||||
display_name: str = "File Loader Component"
|
||||
display_name: str = "File Loader"
|
||||
description: str = "Generic File Loader"
|
||||
|
||||
loaders_info = [
|
||||
{
|
||||
"loader": "AirbyteJSONLoader",
|
||||
"name": "Airbyte JSON (.jsonl)",
|
||||
"import": "langchain.document_loaders.AirbyteJSONLoader",
|
||||
"defaultFor": ["jsonl"],
|
||||
"allowdTypes": ["jsonl"],
|
||||
},
|
||||
{
|
||||
"loader": "BSHTMLLoader",
|
||||
"name": "BeautifulSoup4 HTML (.html, .htm)",
|
||||
"import": "langchain.document_loaders.BSHTMLLoader",
|
||||
"allowdTypes": ["html", "htm"],
|
||||
},
|
||||
{
|
||||
"loader": "CSVLoader",
|
||||
"name": "CSV (.csv)",
|
||||
"import": "langchain.document_loaders.CSVLoader",
|
||||
"defaultFor": ["csv"],
|
||||
"allowdTypes": ["csv"],
|
||||
},
|
||||
{
|
||||
"loader": "CoNLLULoader",
|
||||
"name": "CoNLL-U (.conllu)",
|
||||
"import": "langchain.document_loaders.CoNLLULoader",
|
||||
"defaultFor": ["conllu"],
|
||||
"allowdTypes": ["conllu"],
|
||||
},
|
||||
{
|
||||
"loader": "EverNoteLoader",
|
||||
"name": "EverNote (.enex)",
|
||||
"import": "langchain.document_loaders.EverNoteLoader",
|
||||
"defaultFor": ["enex"],
|
||||
"allowdTypes": ["enex"],
|
||||
},
|
||||
{
|
||||
"loader": "FacebookChatLoader",
|
||||
"name": "Facebook Chat (.json)",
|
||||
"import": "langchain.document_loaders.FacebookChatLoader",
|
||||
"allowdTypes": ["json"],
|
||||
},
|
||||
{
|
||||
"loader": "OutlookMessageLoader",
|
||||
"name": "Outlook Message (.msg)",
|
||||
"import": "langchain.document_loaders.OutlookMessageLoader",
|
||||
"defaultFor": ["msg"],
|
||||
"allowdTypes": ["msg"],
|
||||
},
|
||||
{
|
||||
"loader": "PyPDFLoader",
|
||||
"name": "PyPDF (.pdf)",
|
||||
"import": "langchain.document_loaders.PyPDFLoader",
|
||||
"defaultFor": ["pdf"],
|
||||
"allowdTypes": ["pdf"],
|
||||
},
|
||||
{
|
||||
"loader": "STRLoader",
|
||||
"name": "Subtitle (.str)",
|
||||
"import": "langchain.document_loaders.STRLoader",
|
||||
"defaultFor": ["str"],
|
||||
"allowdTypes": ["str"],
|
||||
},
|
||||
{
|
||||
"loader": "TextLoader",
|
||||
"name": "Text (.txt)",
|
||||
"import": "langchain.document_loaders.TextLoader",
|
||||
"defaultFor": ["txt"],
|
||||
"allowdTypes": ["txt"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredEmailLoader",
|
||||
"name": "Unstructured Email (.eml)",
|
||||
"import": "langchain.document_loaders.UnstructuredEmailLoader",
|
||||
"defaultFor": ["eml"],
|
||||
"allowdTypes": ["eml"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredHTMLLoader",
|
||||
"name": "Unstructured HTML (.html, .htm)",
|
||||
"import": "langchain.document_loaders.UnstructuredHTMLLoader",
|
||||
"defaultFor": ["html", "htm"],
|
||||
"allowdTypes": ["html", "htm"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredMarkdownLoader",
|
||||
"name": "Unstructured Markdown (.md)",
|
||||
"import": "langchain.document_loaders.UnstructuredMarkdownLoader",
|
||||
"defaultFor": ["md"],
|
||||
"allowdTypes": ["md"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredPowerPointLoader",
|
||||
"name": "Unstructured PowerPoint (.pptx)",
|
||||
"import": "langchain.document_loaders.UnstructuredPowerPointLoader",
|
||||
"defaultFor": ["pptx"],
|
||||
"allowdTypes": ["pptx"],
|
||||
},
|
||||
{
|
||||
"loader": "UnstructuredWordLoader",
|
||||
"name": "Unstructured Word (.docx)",
|
||||
"import": "langchain.document_loaders.UnstructuredWordLoader",
|
||||
"defaultFor": ["docx"],
|
||||
"allowdTypes": ["docx"],
|
||||
},
|
||||
]
|
||||
beta = True
|
||||
|
||||
def build_config(self):
|
||||
loader_options = ["Automatic"] + [
|
||||
loader_info["name"] for loader_info in self.loaders_info
|
||||
loader_info["name"] for loader_info in loaders_info
|
||||
]
|
||||
|
||||
file_types = []
|
||||
suffixes = []
|
||||
|
||||
for loader_info in self.loaders_info:
|
||||
for loader_info in loaders_info:
|
||||
if "allowedTypes" in loader_info:
|
||||
file_types.extend(loader_info["allowedTypes"])
|
||||
suffixes.extend([f".{ext}" for ext in loader_info["allowedTypes"]])
|
||||
|
|
@ -150,7 +151,7 @@ class FileLoaderComponent(CustomComponent):
|
|||
|
||||
# Mapeie o nome do loader selecionado para suas informações
|
||||
selected_loader_info = None
|
||||
for loader_info in self.loaders_info:
|
||||
for loader_info in loaders_info:
|
||||
if loader_info["name"] == loader:
|
||||
selected_loader_info = loader_info
|
||||
break
|
||||
|
|
@ -161,7 +162,7 @@ class FileLoaderComponent(CustomComponent):
|
|||
if loader == "Automatic":
|
||||
# Determine o loader automaticamente com base na extensão do arquivo
|
||||
default_loader_info = None
|
||||
for info in self.loaders_info:
|
||||
for info in loaders_info:
|
||||
if "defaultFor" in info and file_type in info["defaultFor"]:
|
||||
default_loader_info = info
|
||||
break
|
||||
|
|
|
|||
61
src/backend/langflow/components/documentloaders/UrlLoader.py
Normal file
61
src/backend/langflow/components/documentloaders/UrlLoader.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
from langflow import CustomComponent
|
||||
from langchain.document_loaders import AZLyricsLoader
|
||||
from langchain.document_loaders import CollegeConfidentialLoader
|
||||
from langchain.document_loaders import GitbookLoader
|
||||
from langchain.document_loaders import HNLoader
|
||||
from langchain.document_loaders import IFixitLoader
|
||||
from langchain.document_loaders import IMSDbLoader
|
||||
from langchain.document_loaders import WebBaseLoader
|
||||
|
||||
|
||||
from langchain.schema import Document
|
||||
|
||||
|
||||
class UrlLoaderComponent(CustomComponent):
|
||||
display_name: str = "Url Loader"
|
||||
description: str = "Generic Url Loader Component"
|
||||
|
||||
def build_config(self):
|
||||
return {
|
||||
"web_path": {
|
||||
"display_name": "Url",
|
||||
"required": True,
|
||||
},
|
||||
"loader": {
|
||||
"display_name": "Loader",
|
||||
"is_list": True,
|
||||
"required": True,
|
||||
"options": [
|
||||
"AZLyricsLoader",
|
||||
"CollegeConfidentialLoader",
|
||||
"GitbookLoader",
|
||||
"HNLoader",
|
||||
"IFixitLoader",
|
||||
"IMSDbLoader",
|
||||
"WebBaseLoader",
|
||||
],
|
||||
"value": "WebBaseLoader",
|
||||
},
|
||||
"code": {"show": False},
|
||||
}
|
||||
|
||||
def build(self, web_path: str, loader: str) -> Document:
|
||||
if loader == "AZLyricsLoader":
|
||||
loader_instance = AZLyricsLoader(web_path=web_path)
|
||||
elif loader == "CollegeConfidentialLoader":
|
||||
loader_instance = CollegeConfidentialLoader(web_path=web_path)
|
||||
elif loader == "GitbookLoader":
|
||||
loader_instance = GitbookLoader(web_path=web_path)
|
||||
elif loader == "HNLoader":
|
||||
loader_instance = HNLoader(web_path=web_path)
|
||||
elif loader == "IFixitLoader":
|
||||
loader_instance = IFixitLoader(web_path=web_path)
|
||||
elif loader == "IMSDbLoader":
|
||||
loader_instance = IMSDbLoader(web_path=web_path)
|
||||
elif loader == "WebBaseLoader":
|
||||
loader_instance = WebBaseLoader(web_path=web_path)
|
||||
|
||||
if loader_instance is None:
|
||||
raise ValueError(f"No loader found for: {web_path}")
|
||||
|
||||
return loader_instance
|
||||
Loading…
Add table
Add a link
Reference in a new issue