feat: add text splitter

This commit is contained in:
Ibis Prevedello 2023-04-07 16:10:44 -03:00
commit 8adf26e8de
16 changed files with 93 additions and 28 deletions

View file

@ -63,4 +63,7 @@ documentloaders:
- TextLoader
- WebBaseLoader
textsplitters:
- CharacterTextSplitter
dev: false

View file

@ -153,8 +153,6 @@ class Node:
result = result.run # type: ignore
elif hasattr(result, "get_function"):
result = result.get_function() # type: ignore
elif value.base_type == "documentloaders":
result = result.load()
self.params[key] = result
elif isinstance(value, list) and all(

View file

@ -10,6 +10,7 @@ from langflow.graph.nodes import (
LLMNode,
MemoryNode,
PromptNode,
TextSplitterNode,
ToolkitNode,
ToolNode,
VectorStoreNode,
@ -22,6 +23,7 @@ from langflow.interface.embeddings.base import embedding_creator
from langflow.interface.llms.base import llm_creator
from langflow.interface.memories.base import memory_creator
from langflow.interface.prompts.base import prompt_creator
from langflow.interface.textSplitters.base import textsplitter_creator
from langflow.interface.toolkits.base import toolkits_creator
from langflow.interface.tools.base import tool_creator
from langflow.interface.tools.constants import FILE_TOOLS
@ -126,6 +128,7 @@ class Graph:
**{t: EmbeddingNode for t in embedding_creator.to_list()},
**{t: VectorStoreNode for t in vectorstore_creator.to_list()},
**{t: DocumentLoaderNode for t in documentloader_creator.to_list()},
**{t: TextSplitterNode for t in textsplitter_creator.to_list()},
}
if node_type in FILE_TOOLS:

View file

@ -147,3 +147,8 @@ class VectorStoreNode(Node):
class MemoryNode(Node):
    """Graph node whose base type is ``memory``."""

    def __init__(self, data: Dict):
        """Initialise the node from its raw data and tag it as ``memory``.

        Args:
            data: Node payload, forwarded unchanged to ``Node.__init__``.
        """
        node_base_type = "memory"
        super().__init__(data, base_type=node_base_type)
class TextSplitterNode(Node):
    """Graph node whose base type is ``textsplitters``."""

    def __init__(self, data: Dict):
        """Initialise the node from its raw data and tag it as ``textsplitters``.

        Args:
            data: Node payload, forwarded unchanged to ``Node.__init__``.
        """
        node_base_type = "textsplitters"
        super().__init__(data, base_type=node_base_type)

View file

@ -1,6 +1,6 @@
import inspect
from typing import Any
## LLM
from langchain import (
chains,
document_loaders,
@ -8,6 +8,7 @@ from langchain import (
llms,
memory,
requests,
text_splitter,
vectorstores,
)
from langchain.agents import agent_toolkits
@ -15,16 +16,17 @@ from langchain.chat_models import ChatOpenAI
from langflow.interface.importing.utils import import_class
## LLM
## LLMs
llm_type_to_cls_dict = llms.type_to_cls_dict
llm_type_to_cls_dict["openai-chat"] = ChatOpenAI # type: ignore
## Chain
## Chains
chain_type_to_cls_dict: dict[str, Any] = {
chain_name: import_class(f"langchain.chains.{chain_name}")
for chain_name in chains.__all__
}
## Toolkits
toolkit_type_to_loader_dict: dict[str, Any] = {
toolkit_name: import_class(f"langchain.agents.agent_toolkits.{toolkit_name}")
# if toolkit_name is lower case it is a loader
@ -69,3 +71,8 @@ documentloaders_type_to_cls_dict: dict[str, Any] = {
)
for documentloader_name in document_loaders.__all__
}
## Text Splitters
# Map text splitter class names to their classes.
# ``inspect.getmembers(..., inspect.isclass)`` yields every class bound in the
# module namespace, including classes the module merely imports, so restrict
# the mapping to genuine ``TextSplitter`` subclasses (the base class included).
textsplitter_type_to_cls_dict: dict[str, Any] = {
    splitter_name: splitter_cls
    for splitter_name, splitter_cls in inspect.getmembers(
        text_splitter, inspect.isclass
    )
    if issubclass(splitter_cls, text_splitter.TextSplitter)
}

View file

@ -16,7 +16,6 @@ class DocumentLoaderCreator(LangChainTypeCreator):
# Drop some types that are reimplemented with the same name
types.pop("TextLoader")
types.pop("WebBaseLoader")
for name, documentloader in CUSTOM_DOCUMENTLOADERS.items():
types[name] = documentloader

View file

@ -3,8 +3,6 @@ from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.web_base import WebBaseLoader as LCWebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
class TextLoader(BaseLoader):
@ -18,25 +16,7 @@ class TextLoader(BaseLoader):
"""Load from file path."""
documents = [Document(page_content=self.file, metadata={"source": "loaded"})]
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
return text_splitter.split_documents(documents)
class WebBaseLoader(LCWebBaseLoader):
    """Web loader that returns the scraped page text split into chunks."""

    def load(self) -> List[Document]:
        """Scrape the page, wrap its text in one document and split it.

        Returns:
            The result of ``CharacterTextSplitter.split_documents`` applied to
            a single document holding the page text, whose ``source`` metadata
            is ``self.web_path``.
        """
        page_text = self.scrape().get_text()
        page_document = Document(
            page_content=page_text,
            metadata={"source": self.web_path},
        )
        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        return splitter.split_documents([page_document])
CUSTOM_DOCUMENTLOADERS = {
"TextLoader": TextLoader,
"WebBaseLoader": WebBaseLoader,
}

View file

@ -44,6 +44,7 @@ def import_by_type(_type: str, name: str) -> Any:
"embeddings": import_embedding,
"vectorstores": import_vectorstore,
"documentloaders": import_documentloader,
"textsplitters": import_textsplitter,
}
if _type == "llms":
key = "chat" if "chat" in name.lower() else "llm"
@ -135,3 +136,8 @@ def import_documentloader(documentloader: str) -> Any:
return CUSTOM_DOCUMENTLOADERS[documentloader]
return import_class(f"langchain.document_loaders.{documentloader}")
def import_textsplitter(textsplitter: str) -> Any:
    """Resolve a text splitter from its class name.

    Args:
        textsplitter: Name looked up inside ``langchain.text_splitter``.

    Returns:
        Whatever ``import_class`` returns for the dotted path
        ``langchain.text_splitter.<textsplitter>``.
    """
    dotted_path = f"langchain.text_splitter.{textsplitter}"
    return import_class(dotted_path)

View file

@ -1,10 +1,14 @@
from langflow.interface.agents.base import agent_creator
from langflow.interface.chains.base import chain_creator
from langflow.interface.documentLoaders.base import documentloader_creator
from langflow.interface.embeddings.base import embedding_creator
from langflow.interface.llms.base import llm_creator
from langflow.interface.memories.base import memory_creator
from langflow.interface.prompts.base import prompt_creator
from langflow.interface.textSplitters.base import textsplitter_creator
from langflow.interface.toolkits.base import toolkits_creator
from langflow.interface.tools.base import tool_creator
from langflow.interface.vectorStore.base import vectorstore_creator
from langflow.interface.wrappers.base import wrapper_creator
@ -18,6 +22,10 @@ def get_type_dict():
"memory": memory_creator.to_list(),
"toolkits": toolkits_creator.to_list(),
"wrappers": wrapper_creator.to_list(),
"documentLoaders": documentloader_creator.to_list(),
"vectorStore": vectorstore_creator.to_list(),
"embeddings": embedding_creator.to_list(),
"textSplitters": textsplitter_creator.to_list(),
}

View file

@ -62,6 +62,12 @@ def instantiate_class(node_type: str, base_type: str, params: Dict) -> Any:
return class_object(**params)
elif base_type == "vectorstores":
return class_object.from_documents(**params)
elif base_type == "documentloaders":
return class_object(**params).load()
elif base_type == "textsplitters":
documents = params.pop("documents")
text_splitter = class_object(**params)
return text_splitter.split_documents(documents)
else:
return class_object(**params)

View file

@ -0,0 +1,3 @@
from langflow.interface.textSplitters.base import TextSplitterCreator
__all__ = ["TextSplitterCreator"]

View file

@ -0,0 +1,40 @@
from typing import Dict, List, Optional
from langflow.interface.base import LangChainTypeCreator
from langflow.interface.custom_lists import textsplitter_type_to_cls_dict
from langflow.settings import settings
from langflow.utils.util import build_template_from_class
class TextSplitterCreator(LangChainTypeCreator):
    """Creator that exposes text splitters under the ``textsplitters`` type."""

    type_name: str = "textsplitters"

    @property
    def type_to_loader_dict(self) -> Dict:
        """Return the mapping of text splitter names to their classes."""
        return textsplitter_type_to_cls_dict

    def get_signature(self, name: str) -> Optional[Dict]:
        """Build the template signature for the text splitter called *name*.

        The template produced by ``build_template_from_class`` is extended
        with a required, visible ``documents`` field.

        Raises:
            ValueError: If building the template raises ``ValueError``; the
                original error is chained as the cause.
        """
        documents_field = {
            "type": "BaseLoader",
            "required": True,
            "show": True,
            "name": "documents",
        }
        try:
            template = build_template_from_class(name, textsplitter_type_to_cls_dict)
            template["template"]["documents"] = documents_field
        except ValueError as exc:
            raise ValueError(f"Text Splitter {name} not found") from exc
        return template

    def to_list(self) -> List[str]:
        """List the names of the text splitters that are enabled.

        A splitter is listed when its class name appears in
        ``settings.textsplitters`` or when ``settings.dev`` is truthy.
        """
        enabled: List[str] = []
        for splitter_cls in self.type_to_loader_dict.values():
            class_name = splitter_cls.__name__
            if class_name in settings.textsplitters or settings.dev:
                enabled.append(class_name)
        return enabled
textsplitter_creator = TextSplitterCreator()

View file

@ -5,6 +5,7 @@ from langflow.interface.embeddings.base import embedding_creator
from langflow.interface.llms.base import llm_creator
from langflow.interface.memories.base import memory_creator
from langflow.interface.prompts.base import prompt_creator
from langflow.interface.textSplitters.base import textsplitter_creator
from langflow.interface.toolkits.base import toolkits_creator
from langflow.interface.tools.base import tool_creator
from langflow.interface.vectorStore.base import vectorstore_creator
@ -40,6 +41,7 @@ def build_langchain_types_dict(): # sourcery skip: dict-assign-update-to-union
embedding_creator,
vectorstore_creator,
documentloader_creator,
textsplitter_creator,
]
all_types = {}

View file

@ -20,11 +20,11 @@ class VectorstoreCreator(LangChainTypeCreator):
signature["template"] = {
"documents": {
"type": "BaseLoader",
"type": "TextSplitter",
"required": True,
"show": True,
"name": "documents",
"display_name": "Document Loader",
"display_name": "Text Splitter",
},
"embedding": {
"type": "Embeddings",

View file

@ -17,6 +17,7 @@ class Settings(BaseSettings):
documentloaders: List[str] = []
wrappers: List[str] = []
toolkits: List[str] = []
textsplitters: List[str] = []
dev: bool = False
class Config:
@ -40,6 +41,7 @@ class Settings(BaseSettings):
self.memories = new_settings.memories or []
self.wrappers = new_settings.wrappers or []
self.toolkits = new_settings.toolkits or []
self.textsplitters = new_settings.textsplitters or []
self.dev = new_settings.dev or False

View file

@ -79,6 +79,7 @@ export const nodeColors: {[char: string]: string} = {
embeddings:"#FF9135",
documentloaders:"#FF9135",
vectorstores: "#FF9135",
textsplitters: "#FF9135",
toolkits:"#DB2C2C",
wrappers:"#E6277A",
unknown:"#9CA3AF"
@ -98,6 +99,7 @@ export const nodeNames:{[char: string]: string} = {
vectorstores: "Vector Stores",
toolkits:"Toolkits",
wrappers:"Wrappers",
textsplitters: "Text Splitters",
unknown:"Unknown"
};
@ -114,6 +116,7 @@ export const nodeIcons:{[char: string]: React.ForwardRefExoticComponent<React.SV
documentloaders:PaperClipIcon,
vectorstores: PaperClipIcon,
toolkits:WrenchScrewdriverIcon,
textsplitters:PaperClipIcon,
wrappers:GiftIcon,
unknown:QuestionMarkCircleIcon
};