New Text Splitters (#817)

This commit is contained in:
Gabriel Luiz Freitas Almeida 2023-08-25 01:56:26 +00:00 committed by GitHub
commit f19e745b77
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 207 additions and 17 deletions

View file

@ -1,11 +1,13 @@
import Admonition from '@theme/Admonition';
import Admonition from "@theme/Admonition";
# Text Splitters
<Admonition type="caution" icon="🚧" title="ZONE UNDER CONSTRUCTION">
<p>
We appreciate your understanding as we polish our documentation it may contain some rough edges. Share your feedback or report issues to help us improve! 🛠️📝
</p>
<p>
We appreciate your understanding as we polish our documentation it may
contain some rough edges. Share your feedback or report issues to help us
improve! 🛠️📝
</p>
</Admonition>
A text splitter is a tool that divides a document or text into smaller chunks or segments. It is used to break down large texts into more manageable pieces for analysis or processing.
@ -22,13 +24,13 @@ The `CharacterTextSplitter` is used to split a long text into smaller chunks bas
- **chunk_overlap:** Determines the number of characters that overlap between consecutive chunks when splitting text. It specifies how much of the previous chunk should be included in the next chunk.
For example, if the `chunk_overlap` is set to 20 and the `chunk_size` is set to 100, the splitter will create chunks of 100 characters each, but the last 20 characters of each chunk will overlap with the first 20 characters of the next chunk. This allows for a smoother transition between chunks and ensures that no information is lost defaults to `200`.
For example, if the `chunk_overlap` is set to 20 and the `chunk_size` is set to 100, the splitter will create chunks of 100 characters each, but the last 20 characters of each chunk will overlap with the first 20 characters of the next chunk. This allows for a smoother transition between chunks and ensures that no information is lost defaults to `200`.
- **chunk_size:** Determines the maximum number of characters in each chunk when splitting a text. It specifies the size or length of each chunk.
For example, if the chunk_size is set to 100, the splitter will create chunks of 100 characters each. If the text is longer than 100 characters, it will be divided into multiple chunks of equal size, except for the last chunk, which may be smaller if there are remaining characters defaults to `1000`.
For example, if the chunk_size is set to 100, the splitter will create chunks of 100 characters each. If the text is longer than 100 characters, it will be divided into multiple chunks of equal size, except for the last chunk, which may be smaller if there are remaining characters defaults to `1000`.
- **separator:** Specifies the character that will be used to split the text into chunks defaults to `.`
- **separator:** Specifies the character that will be used to split the text into chunks defaults to `.`
---
@ -44,6 +46,18 @@ The `RecursiveCharacterTextSplitter` splits the text by trying to keep paragra
- **chunk_size:** Determines the maximum number of characters in each chunk when splitting a text. It specifies the size or length of each chunk.
- **separator_type:** The parameter allows the user to split the code with multiple language support. It supports various languages such as Text, Ruby, Python, Solidity, Java, and more. Defaults to `Text`.
- **separators:** The `separators` in RecursiveCharacterTextSplitter are the characters used to split the text into chunks. The text splitter tries to create chunks based on splitting on the first character in the list of `separators`. If any chunks are too large, it moves on to the next character in the list and continues splitting. Defaults to ["\n\n", "\n", " ", ""].
- **separators:** The `separators` in RecursiveCharacterTextSplitter are the characters used to split the text into chunks. The text splitter tries to create chunks based on splitting on the first character in the list of `separators`. If any chunks are too large, it moves on to the next character in the list and continues splitting. Defaults to `.`
### LanguageRecursiveTextSplitter
The `LanguageRecursiveTextSplitter` is a text splitter that splits the text into smaller chunks based on the (programming) language of the text.
**Params**
- **Documents:** Input documents to split.
- **chunk_overlap:** Determines the number of characters that overlap between consecutive chunks when splitting text. It specifies how much of the previous chunk should be included in the next chunk.
- **chunk_size:** Determines the maximum number of characters in each chunk when splitting a text. It specifies the size or length of each chunk.
- **separator_type:** The parameter allows the user to split the code with multiple language support. It supports various languages such as Ruby, Python, Solidity, Java, and more. Defaults to `Python`.

4
poetry.lock generated
View file

@ -4195,7 +4195,7 @@ files = [
name = "pillow"
version = "10.0.0"
description = "Python Imaging Library (Fork)"
optional = true
optional = false
python-versions = ">=3.8"
files = [
{file = "Pillow-10.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891"},
@ -7467,4 +7467,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.11"
content-hash = "1effd07e35ba89cc3971f027218032e24e7816d93bccb7fd6470cc56acc04418"
content-hash = "36e1f79f4e6d2e55b652d10e43ccde639714ffff2965fa52b466bd854259ebf6"

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "langflow"
version = "0.4.15"
version = "0.4.16"
description = "A Python package with a built-in web application"
authors = ["Logspace <contact@logspace.ai>"]
maintainers = [
@ -79,6 +79,7 @@ psycopg-binary = "^3.1.9"
fastavro = "^1.8.0"
langchain-experimental = "^0.0.8"
metaphor-python = "^0.1.11"
pillow = "^10.0.0"
[tool.poetry.group.dev.dependencies]
black = "^23.1.0"

View file

@ -0,0 +1,82 @@
from typing import Optional
from langflow import CustomComponent
from langchain.text_splitter import Language
from langchain.schema import Document
from langflow.utils.util import build_loader_repr_from_documents
class LanguageRecursiveTextSplitterComponent(CustomComponent):
display_name: str = "Language Recursive Text Splitter"
description: str = "Split text into chunks of a specified length based on language."
documentation: str = "https://docs.langflow.org/components/text-splitters#languagerecursivetextsplitter"
def build_config(self):
options = [x.value for x in Language]
return {
"documents": {
"display_name": "Documents",
"info": "The documents to split.",
},
"separator_type": {
"display_name": "Separator Type",
"info": "The type of separator to use.",
"field_type": "str",
"options": options,
"value": "Python",
},
"separators": {
"display_name": "Separators",
"info": "The characters to split on.",
"is_list": True,
},
"chunk_size": {
"display_name": "Chunk Size",
"info": "The maximum length of each chunk.",
"field_type": "int",
"value": 1000,
},
"chunk_overlap": {
"display_name": "Chunk Overlap",
"info": "The amount of overlap between chunks.",
"field_type": "int",
"value": 200,
},
"code": {"show": False},
}
def build(
self,
documents: list[Document],
chunk_size: Optional[int] = 1000,
chunk_overlap: Optional[int] = 200,
separator_type: Optional[str] = "Python",
) -> list[Document]:
"""
Split text into chunks of a specified length.
Args:
separators (list[str]): The characters to split on.
chunk_size (int): The maximum length of each chunk.
chunk_overlap (int): The amount of overlap between chunks.
length_function (function): The function to use to calculate the length of the text.
Returns:
list[str]: The chunks of text.
"""
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Make sure chunk_size and chunk_overlap are ints
if isinstance(chunk_size, str):
chunk_size = int(chunk_size)
if isinstance(chunk_overlap, str):
chunk_overlap = int(chunk_overlap)
splitter = RecursiveCharacterTextSplitter.from_language(
language=Language(separator_type),
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
docs = splitter.split_documents(documents)
self.repr_value = build_loader_repr_from_documents(docs)
return docs

View file

@ -0,0 +1,79 @@
from typing import Optional
from langflow import CustomComponent
from langchain.schema import Document
class RecursiveCharacterTextSplitterComponent(CustomComponent):
display_name: str = "Recursive Character Text Splitter"
description: str = "Split text into chunks of a specified length."
documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter"
def build_config(self):
return {
"documents": {
"display_name": "Documents",
"info": "The documents to split.",
},
"separators": {
"display_name": "Separators",
"info": 'The characters to split on.\nIf left empty defaults to ["\\n\\n", "\\n", " ", ""].',
"is_list": True,
},
"chunk_size": {
"display_name": "Chunk Size",
"info": "The maximum length of each chunk.",
"field_type": "int",
"value": 1000,
},
"chunk_overlap": {
"display_name": "Chunk Overlap",
"info": "The amount of overlap between chunks.",
"field_type": "int",
"value": 200,
},
"code": {"show": False},
}
def build(
self,
documents: list[Document],
separators: Optional[list[str]] = None,
chunk_size: Optional[int] = 1000,
chunk_overlap: Optional[int] = 200,
) -> list[Document]:
"""
Split text into chunks of a specified length.
Args:
separators (list[str]): The characters to split on.
chunk_size (int): The maximum length of each chunk.
chunk_overlap (int): The amount of overlap between chunks.
length_function (function): The function to use to calculate the length of the text.
Returns:
list[str]: The chunks of text.
"""
from langchain.text_splitter import RecursiveCharacterTextSplitter
if separators == "":
separators = None
elif separators:
# check if the separators list has escaped characters
# if there are escaped characters, unescape them
separators = [x.encode().decode("unicode-escape") for x in separators]
# Make sure chunk_size and chunk_overlap are ints
if isinstance(chunk_size, str):
chunk_size = int(chunk_size)
if isinstance(chunk_overlap, str):
chunk_overlap = int(chunk_overlap)
splitter = RecursiveCharacterTextSplitter(
separators=separators,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
docs = splitter.split_documents(documents)
# self.repr_value = build_loader_repr_from_documents(docs)
self.repr_value = separators
return docs

View file

@ -169,8 +169,6 @@ prompts:
textsplitters:
CharacterTextSplitter:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/character_text_splitter"
RecursiveCharacterTextSplitter:
documentation: "https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter"
toolkits:
OpenAPIToolkit:
documentation: ""

View file

@ -5,6 +5,7 @@ from langflow.api.utils import merge_nested_dicts_with_renaming
from langflow.interface.agents.base import agent_creator
from langflow.interface.chains.base import chain_creator
from langflow.interface.custom.constants import CUSTOM_COMPONENT_SUPPORTED_TYPES
from langflow.interface.custom.utils import extract_inner_type
from langflow.interface.document_loaders.base import documentloader_creator
from langflow.interface.embeddings.base import embedding_creator
from langflow.interface.importing.utils import get_function_custom
@ -84,6 +85,8 @@ def build_langchain_types_dict(): # sourcery skip: dict-assign-update-to-union
def process_type(field_type: str):
if field_type.startswith("list") or field_type.startswith("List"):
return extract_inner_type(field_type)
return "prompt" if field_type == "Prompt" else field_type
@ -100,6 +103,7 @@ def add_new_custom_field(
# if it is, update the value
display_name = field_config.pop("display_name", field_name)
field_type = field_config.pop("field_type", field_type)
field_contains_list = "list" in field_type.lower()
field_type = process_type(field_type)
field_value = field_config.pop("value", field_value)
field_advanced = field_config.pop("advanced", False)
@ -110,7 +114,9 @@ def add_new_custom_field(
# If options is a list, then it's a dropdown
# If options is None, then it's a list of strings
is_list = isinstance(field_config.get("options"), list)
field_config["is_list"] = is_list or field_config.get("is_list", False)
field_config["is_list"] = (
is_list or field_config.get("is_list", False) or field_contains_list
)
if "name" in field_config:
warnings.warn(
@ -172,7 +178,7 @@ def extract_type_from_optional(field_type):
Returns:
str: The extracted type, or an empty string if no type was found.
"""
match = re.search(r"\[(.*?)\]", field_type)
match = re.search(r"\[(.*?)\]$", field_type)
return match[1] if match else None

View file

@ -2,7 +2,7 @@ import re
import inspect
import importlib
from functools import wraps
from typing import Optional, Dict, Any, Union
from typing import List, Optional, Dict, Any, Union
from docstring_parser import parse # type: ignore
@ -10,6 +10,7 @@ from langflow.template.frontend_node.constants import FORCE_SHOW_FIELDS
from langflow.utils import constants
from langflow.utils.logger import logger
from multiprocess import cpu_count # type: ignore
from langchain.schema import Document
def build_template_from_function(
@ -462,3 +463,12 @@ def get_number_of_workers(workers=None):
workers = (cpu_count() * 2) + 1
logger.debug(f"Number of workers: {workers}")
return workers
def build_loader_repr_from_documents(documents: List[Document]) -> str:
if documents:
avg_length = sum(len(doc.page_content) for doc in documents) / len(documents)
return f"""{len(documents)} documents
\nAvg. Document Length (characters): {int(avg_length)}
Documents: {documents[:3]}..."""
return "0 documents"