from typing import Optional

from langflow import CustomComponent
from langchain.schema import Document


class RecursiveCharacterTextSplitterComponent(CustomComponent):
    """Langflow component wrapping LangChain's ``RecursiveCharacterTextSplitter``."""

    display_name: str = "Recursive Character Text Splitter"
    description: str = "Split text into chunks of a specified length."
    documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter"

    def build_config(self):
        """Return the field configuration (labels, help text, defaults) for this component."""
        return {
            "documents": {
                "display_name": "Documents",
                "info": "The documents to split.",
            },
            "separators": {
                "display_name": "Separators",
                "info": 'The characters to split on.\nIf left empty defaults to ["\\n\\n", "\\n", " ", ""].',
                "is_list": True,
            },
            "chunk_size": {
                "display_name": "Chunk Size",
                "info": "The maximum length of each chunk.",
                "field_type": "int",
                "value": 1000,
            },
            "chunk_overlap": {
                "display_name": "Chunk Overlap",
                "info": "The amount of overlap between chunks.",
                "field_type": "int",
                "value": 200,
            },
            "code": {"show": False},
        }

    def build(
        self,
        documents: Document,
        separators: Optional[str] = None,
        chunk_size: Optional[int] = 1000,
        chunk_overlap: Optional[int] = 200,
    ) -> Document:
        """
        Split documents into chunks of a specified length.

        Args:
            documents: The documents to split; handed unchanged to
                ``RecursiveCharacterTextSplitter.split_documents``.
            separators: The separators to split on. Escape sequences written
                literally (e.g. a backslash followed by ``n``) are unescaped.
                A bare string is treated as a single separator. An empty
                string or empty list lets the splitter use its own defaults.
            chunk_size: The maximum length of each chunk. Must be convertible
                to ``int``.
            chunk_overlap: The amount of overlap between chunks. Must be
                convertible to ``int``.

        Returns:
            Whatever ``split_documents`` returns for ``documents``.
            NOTE(review): per the LangChain API this is a list of ``Document``
            even though the annotation is singular; the annotations are left
            as-is because Langflow may derive field types from them -- confirm
            before changing.

        Raises:
            ValueError: If ``chunk_size`` or ``chunk_overlap`` cannot be
                converted to an integer.
            UnicodeDecodeError: If a separator contains a malformed escape
                sequence (for example a trailing lone backslash).

        Side effects:
            Sets ``self.repr_value`` to the separators that were used.
        """
        from langchain.text_splitter import RecursiveCharacterTextSplitter

        if isinstance(separators, str):
            # A bare string is one separator, not a sequence of characters;
            # iterating it would split it into single-character separators.
            separators = [separators] if separators else None

        if separators:
            # Users type escapes literally ("\\n"); turn them into the real
            # characters. Encoding as latin-1 with backslashreplace (rather
            # than UTF-8) keeps non-ASCII separators intact through the
            # unicode-escape round trip.
            separators = [
                sep.encode("latin-1", "backslashreplace").decode("unicode-escape")
                for sep in separators
            ]

        # Make sure chunk_size and chunk_overlap are ints. The tuple
        # assignment leaves both names untouched if either conversion fails,
        # so the error message reports the values actually received.
        try:
            chunk_size, chunk_overlap = int(chunk_size), int(chunk_overlap)
        except (TypeError, ValueError, OverflowError) as e:
            raise ValueError(
                "chunk_size and chunk_overlap must be integers."
                f" Received chunk_size={chunk_size} and chunk_overlap={chunk_overlap}."
            ) from e

        splitter = RecursiveCharacterTextSplitter(
            separators=separators,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        docs = splitter.split_documents(documents)
        self.repr_value = separators
        return docs