From 0649b61fca0c2a47f3efc84e88797f7e990a3093 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Thu, 24 Aug 2023 22:24:07 -0300 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=A6=20chore(LanguageRecursiveTextSplit?= =?UTF-8?q?ter.py):=20add=20LanguageRecursiveTextSplitter=20component=20to?= =?UTF-8?q?=20split=20text=20into=20chunks=20based=20on=20language=20?= =?UTF-8?q?=F0=9F=93=9D=20docs(LanguageRecursiveTextSplitter.py):=20add=20?= =?UTF-8?q?documentation=20link=20for=20LanguageRecursiveTextSplitter=20co?= =?UTF-8?q?mponent=20=F0=9F=94=A7=20refactor(LanguageRecursiveTextSplitter?= =?UTF-8?q?.py):=20refactor=20build=5Fconfig=20method=20to=20use=20options?= =?UTF-8?q?=20from=20Language=20enum=20for=20separator=5Ftype=20?= =?UTF-8?q?=F0=9F=94=A7=20refactor(LanguageRecursiveTextSplitter.py):=20re?= =?UTF-8?q?factor=20build=20method=20to=20split=20text=20into=20chunks=20b?= =?UTF-8?q?ased=20on=20specified=20length=20and=20overlap=20=F0=9F=94=A7?= =?UTF-8?q?=20refactor(LanguageRecursiveTextSplitter.py):=20refactor=20bui?= =?UTF-8?q?ld=20method=20to=20handle=20chunk=5Fsize=20and=20chunk=5Foverla?= =?UTF-8?q?p=20as=20integers=20=F0=9F=94=A7=20refactor(LanguageRecursiveTe?= =?UTF-8?q?xtSplitter.py):=20refactor=20build=20method=20to=20use=20Recurs?= =?UTF-8?q?iveCharacterTextSplitter=20from=20langchain.text=5Fsplitter=20?= =?UTF-8?q?=F0=9F=94=A7=20refactor(LanguageRecursiveTextSplitter.py):=20re?= =?UTF-8?q?factor=20build=20method=20to=20split=20documents=20using=20Recu?= =?UTF-8?q?rsiveCharacterTextSplitter=20and=20return=20the=20chunks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../LanguageRecursiveTextSplitter.py | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py diff --git a/src/backend/langflow/components/textsplitters/LanguageRecursiveTextSplitter.py 
from typing import Optional
from langflow import CustomComponent
from langchain.text_splitter import Language
from langchain.schema import Document
from langflow.utils.util import build_loader_repr_from_documents


class LanguageRecursiveTextSplitterComponent(CustomComponent):
    """Component that splits documents into chunks using language-specific separators.

    The separators are chosen by ``RecursiveCharacterTextSplitter.from_language``
    according to the selected ``Language`` member.
    """

    display_name: str = "Language Recursive Text Splitter"
    description: str = "Split text into chunks of a specified length based on language."
    documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter"

    def build_config(self):
        """Return the field configuration for this component.

        The ``separator_type`` options are the values of the ``Language`` enum,
        and its default is one of those values so the pre-selected entry is
        always a valid option.
        """
        options = [x.value for x in Language]
        return {
            "documents": {
                "display_name": "Documents",
                "info": "The documents to split.",
            },
            "separator_type": {
                "display_name": "Separator Type",
                "info": "The type of separator to use.",
                "field_type": "str",
                "options": options,
                # Must be one of ``options``; the previous literal "Python"
                # did not match the enum value and could not be resolved.
                "value": Language.PYTHON.value,
            },
            # NOTE(review): ``build`` does not accept a ``separators`` argument,
            # so this field is currently not consumed there -- confirm whether
            # it should be removed or wired in.
            "separators": {
                "display_name": "Separators",
                "info": "The characters to split on.",
                "is_list": True,
            },
            "chunk_size": {
                "display_name": "Chunk Size",
                "info": "The maximum length of each chunk.",
                "field_type": "int",
                "value": 1000,
            },
            "chunk_overlap": {
                "display_name": "Chunk Overlap",
                "info": "The amount of overlap between chunks.",
                "field_type": "int",
                "value": 200,
            },
            "code": {"show": False},
        }

    def build(
        self,
        documents: Document,
        chunk_size: Optional[int] = 1000,
        chunk_overlap: Optional[int] = 200,
        separator_type: Optional[str] = "Python",
    ) -> Document:
        """
        Split documents into chunks using separators for the given language.

        Args:
            documents: The document(s) to split. A single ``Document`` is
                wrapped in a list before splitting.
            chunk_size (int): The maximum length of each chunk.
            chunk_overlap (int): The amount of overlap between chunks.
            separator_type (str): A ``Language`` value selecting the separator
                set. Matched exactly first, then case-insensitively, so the
                legacy default ``"Python"`` keeps working.

        Returns:
            The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.

        Raises:
            ValueError: If ``chunk_size`` or ``chunk_overlap`` cannot be
                converted to ``int``, or if ``separator_type`` does not match
                any ``Language`` value.
        """
        # NOTE(review): the ``Document`` annotations are kept as in the original
        # signature; the value returned is whatever ``split_documents`` returns.
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        # Make sure chunk_size and chunk_overlap are ints
        try:
            chunk_size = int(chunk_size)
            chunk_overlap = int(chunk_overlap)
        except (TypeError, ValueError, OverflowError) as e:
            raise ValueError(
                "chunk_size and chunk_overlap must be integers."
                f" Received chunk_size={chunk_size} and chunk_overlap={chunk_overlap}."
            ) from e

        # Resolve the language: exact match first (also accepts a Language
        # member), then a case-insensitive retry for inputs such as "Python".
        try:
            language = Language(separator_type)
        except ValueError:
            language = None
        if language is None:
            try:
                language = Language(str(separator_type).strip().lower())
            except ValueError as e:
                valid = ", ".join(x.value for x in Language)
                raise ValueError(
                    f"Unsupported separator_type {separator_type!r}. Expected one of: {valid}."
                ) from e

        # split_documents iterates over its argument, so a lone Document must
        # be wrapped to be treated as one document.
        if isinstance(documents, Document):
            documents = [documents]

        splitter = RecursiveCharacterTextSplitter.from_language(
            language=language,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        docs = splitter.split_documents(documents)
        self.repr_value = build_loader_repr_from_documents(docs)
        return docs