Implemented add_extra_fields method for the documentloaders (#408)

This commit is contained in:
Alexandre Henrique Pereira Tavares 2023-06-09 15:24:56 -03:00 committed by GitHub
commit b02ed92b84
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 148 additions and 156 deletions

View file

@ -53,7 +53,7 @@ llms:
# - AzureOpenAI
# - AzureChatOpenAI
- ChatOpenAI
- LlamaCpp
- LlamaCpp
- CTransformers
- Cohere
- Anthropic
@ -69,7 +69,7 @@ prompts:
- ZeroShotPrompt
textsplitters:
- CharacterTextSplitter
# - RecursiveCharacterTextSplitter
- RecursiveCharacterTextSplitter
# - LatexTextSplitter
# - PythonCodeTextSplitter
toolkits:

View file

@ -1,30 +1,20 @@
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Type
from langflow.interface.base import LangChainTypeCreator
from langflow.template.frontend_node.documentloaders import DocumentLoaderFrontNode
from langflow.interface.custom_lists import documentloaders_type_to_cls_dict
from langflow.settings import settings
from langflow.utils.logger import logger
from langflow.utils.util import build_template_from_class
def build_file_path_template(
suffixes: list, fileTypes: list, name: str = "file_path"
) -> Dict:
"""Build a file path template for a document loader."""
return {
"type": "file",
"required": True,
"show": True,
"name": name,
"value": "",
"suffixes": suffixes,
"fileTypes": fileTypes,
}
class DocumentLoaderCreator(LangChainTypeCreator):
type_name: str = "documentloaders"
@property
def frontend_node_class(self) -> Type[DocumentLoaderFrontNode]:
return DocumentLoaderFrontNode
@property
def type_to_loader_dict(self) -> Dict:
return documentloaders_type_to_cls_dict
@ -32,106 +22,7 @@ class DocumentLoaderCreator(LangChainTypeCreator):
def get_signature(self, name: str) -> Optional[Dict]:
"""Get the signature of a document loader."""
try:
signature = build_template_from_class(
name, documentloaders_type_to_cls_dict
)
file_path_templates = {
"AirbyteJSONLoader": build_file_path_template(
suffixes=[".json"], fileTypes=["json"]
),
"CoNLLULoader": build_file_path_template(
suffixes=[".csv"], fileTypes=["csv"]
),
"CSVLoader": build_file_path_template(
suffixes=[".csv"], fileTypes=["csv"]
),
"UnstructuredEmailLoader": build_file_path_template(
suffixes=[".eml"], fileTypes=["eml"]
),
"EverNoteLoader": build_file_path_template(
suffixes=[".xml"], fileTypes=["xml"]
),
"FacebookChatLoader": build_file_path_template(
suffixes=[".json"], fileTypes=["json"]
),
"GutenbergLoader": build_file_path_template(
suffixes=[".txt"], fileTypes=["txt"]
),
"BSHTMLLoader": build_file_path_template(
suffixes=[".html"], fileTypes=["html"]
),
"UnstructuredHTMLLoader": build_file_path_template(
suffixes=[".html"], fileTypes=["html"]
),
"UnstructuredImageLoader": build_file_path_template(
suffixes=[".jpg", ".jpeg", ".png", ".gif", ".bmp"],
fileTypes=["jpg", "jpeg", "png", "gif", "bmp"],
),
"UnstructuredMarkdownLoader": build_file_path_template(
suffixes=[".md"], fileTypes=["md"]
),
"PyPDFLoader": build_file_path_template(
suffixes=[".pdf"], fileTypes=["pdf"]
),
"UnstructuredPowerPointLoader": build_file_path_template(
suffixes=[".pptx", ".ppt"], fileTypes=["pptx", "ppt"]
),
"SRTLoader": build_file_path_template(
suffixes=[".srt"], fileTypes=["srt"]
),
"TelegramChatLoader": build_file_path_template(
suffixes=[".json"], fileTypes=["json"]
),
"TextLoader": build_file_path_template(
suffixes=[".txt"], fileTypes=["txt"]
),
"UnstructuredWordDocumentLoader": build_file_path_template(
suffixes=[".docx", ".doc"], fileTypes=["docx", "doc"]
),
"SlackDirectoryLoader": build_file_path_template(
suffixes=[".zip"], fileTypes=["zip"]
),
}
if name in file_path_templates:
signature["template"]["file_path"] = file_path_templates[name]
elif name in {
"WebBaseLoader",
"AZLyricsLoader",
"CollegeConfidentialLoader",
"HNLoader",
"IFixitLoader",
"IMSDbLoader",
}:
signature["template"]["web_path"] = {
"type": "str",
"required": True,
"show": True,
"name": "web_path",
"value": "",
"display_name": "Web Page",
}
elif name in {"GitbookLoader"}:
signature["template"]["web_page"] = {
"type": "str",
"required": True,
"show": True,
"name": "web_page",
"value": "",
"display_name": "Web Page",
}
elif name in {"ReadTheDocsLoader", "NotionDirectoryLoader"}:
signature["template"]["path"] = {
"type": "str",
"required": True,
"show": True,
"name": "path",
"value": "",
"display_name": "Web Page",
}
return signature
return build_template_from_class(name, documentloaders_type_to_cls_dict)
except ValueError as exc:
raise ValueError(f"Documment Loader {name} not found") from exc
except AttributeError as exc:

View file

@ -1,6 +1,7 @@
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Type
from langflow.interface.base import LangChainTypeCreator
from langflow.template.frontend_node.textsplitters import TextSplittersFrontendNode
from langflow.interface.custom_lists import textsplitter_type_to_cls_dict
from langflow.settings import settings
from langflow.utils.logger import logger
@ -10,6 +11,10 @@ from langflow.utils.util import build_template_from_class
class TextSplitterCreator(LangChainTypeCreator):
type_name: str = "textsplitters"
@property
def frontend_node_class(self) -> Type[TextSplittersFrontendNode]:
return TextSplittersFrontendNode
@property
def type_to_loader_dict(self) -> Dict:
return textsplitter_type_to_cls_dict
@ -17,43 +22,7 @@ class TextSplitterCreator(LangChainTypeCreator):
def get_signature(self, name: str) -> Optional[Dict]:
"""Get the signature of a text splitter."""
try:
signature = build_template_from_class(name, textsplitter_type_to_cls_dict)
signature["template"]["documents"] = {
"type": "BaseLoader",
"required": True,
"show": True,
"name": "documents",
}
signature["template"]["separator"] = {
"type": "str",
"required": True,
"show": True,
"value": ".",
"name": "separator",
"display_name": "Separator",
}
signature["template"]["chunk_size"] = {
"type": "int",
"required": True,
"show": True,
"value": 1000,
"name": "chunk_size",
"display_name": "Chunk Size",
}
signature["template"]["chunk_overlap"] = {
"type": "int",
"required": True,
"show": True,
"value": 200,
"name": "chunk_overlap",
"display_name": "Chunk Overlap",
}
return signature
return build_template_from_class(name, textsplitter_type_to_cls_dict)
except ValueError as exc:
raise ValueError(f"Text Splitter {name} not found") from exc
except AttributeError as exc:

View file

@ -7,6 +7,8 @@ from langflow.template.frontend_node import (
prompts,
tools,
vectorstores,
documentloaders,
textsplitters,
)
__all__ = [
@ -18,4 +20,6 @@ __all__ = [
"llms",
"prompts",
"vectorstores",
"documentloaders",
"textsplitters",
]

View file

@ -0,0 +1,79 @@
from langflow.template.field.base import TemplateField
from langflow.template.frontend_node.base import FrontendNode
class DocumentLoaderFrontNode(FrontendNode):
@staticmethod
def build_template(
suffixes: list, fileTypes: list, name: str = "file_path"
) -> TemplateField:
"""Build a template field for a document loader."""
return TemplateField(
field_type="file",
required=True,
show=True,
name=name,
value="",
suffixes=suffixes,
fileTypes=fileTypes,
)
file_path_templates = {
"AirbyteJSONLoader": build_template(suffixes=[".json"], fileTypes=["json"]),
"CoNLLULoader": build_template(suffixes=[".csv"], fileTypes=["csv"]),
"CSVLoader": build_template(suffixes=[".csv"], fileTypes=["csv"]),
"UnstructuredEmailLoader": build_template(suffixes=[".eml"], fileTypes=["eml"]),
"EverNoteLoader": build_template(suffixes=[".xml"], fileTypes=["xml"]),
"FacebookChatLoader": build_template(suffixes=[".json"], fileTypes=["json"]),
"GutenbergLoader": build_template(suffixes=[".txt"], fileTypes=["txt"]),
"BSHTMLLoader": build_template(suffixes=[".html"], fileTypes=["html"]),
"UnstructuredHTMLLoader": build_template(
suffixes=[".html"], fileTypes=["html"]
),
"UnstructuredImageLoader": build_template(
suffixes=[".jpg", ".jpeg", ".png", ".gif", ".bmp"],
fileTypes=["jpg", "jpeg", "png", "gif", "bmp"],
),
"UnstructuredMarkdownLoader": build_template(
suffixes=[".md"], fileTypes=["md"]
),
"PyPDFLoader": build_template(suffixes=[".pdf"], fileTypes=["pdf"]),
"UnstructuredPowerPointLoader": build_template(
suffixes=[".pptx", ".ppt"], fileTypes=["pptx", "ppt"]
),
"SRTLoader": build_template(suffixes=[".srt"], fileTypes=["srt"]),
"TelegramChatLoader": build_template(suffixes=[".json"], fileTypes=["json"]),
"TextLoader": build_template(suffixes=[".txt"], fileTypes=["txt"]),
"UnstructuredWordDocumentLoader": build_template(
suffixes=[".docx", ".doc"], fileTypes=["docx", "doc"]
),
}
def add_extra_fields(self) -> None:
name = None
if self.template.type_name in self.file_path_templates:
self.template.add_field(self.file_path_templates[self.template.type_name])
elif self.template.type_name in {
"WebBaseLoader",
"AZLyricsLoader",
"CollegeConfidentialLoader",
"HNLoader",
"IFixitLoader",
"IMSDbLoader",
}:
name = "web_path"
elif self.template.type_name in {"GitbookLoader"}:
name = "web_page"
elif self.template.type_name in {"ReadTheDocsLoader"}:
name = "path"
if name:
self.template.add_field(
TemplateField(
field_type="str",
required=True,
show=True,
name=name,
value="",
display_name="Web Page",
)
)

View file

@ -0,0 +1,49 @@
from langflow.template.field.base import TemplateField
from langflow.template.frontend_node.base import FrontendNode
class TextSplittersFrontendNode(FrontendNode):
def add_extra_fields(self) -> None:
self.template.add_field(
TemplateField(
field_type="BaseLoader",
required=True,
show=True,
name="documents",
)
)
name = "separator"
if self.template.type_name == "CharacterTextSplitter":
name = "separator"
elif self.template.type_name == "RecursiveCharacterTextSplitter":
name = "separators"
self.template.add_field(
TemplateField(
field_type="str",
required=True,
show=True,
value=".",
name=name,
display_name="Separator",
)
)
self.template.add_field(
TemplateField(
field_type="int",
required=True,
show=True,
value=1000,
name="chunk_size",
display_name="Chunk Size",
)
)
self.template.add_field(
TemplateField(
field_type="int",
required=True,
show=True,
value=200,
name="chunk_overlap",
display_name="Chunk Overlap",
)
)