Add SplitTextComponent to split text into chunks

2024-03-30 18:06:32 -03:00 · 2024-03-30 18:06:32 -03:00 · 1672339992
commit 1672339992
parent f139ecb35d
1 changed files with 87 additions and 0 deletions
--- a/src/backend/base/langflow/components/helpers/SplitText.py
+++ b/src/backend/base/langflow/components/helpers/SplitText.py
@ -0,0 +1,87 @@
+from typing import Optional
+
+from langchain.text_splitter import (
+    RecursiveCharacterTextSplitter,
+    CharacterTextSplitter,
+)
+from langchain_core.documents import Document
+
+from langflow.interface.custom.custom_component import CustomComponent
+from langflow.schema import Record
+from langflow.field_typing import Text
+from langflow.utils.util import build_loader_repr_from_records, unescape_string
+
+
+class SplitTextComponent(CustomComponent):
+    """Split input texts into chunks with a LangChain character splitter."""
+
+    display_name: str = "Split Text"
+    description: str = "Split text into chunks of a specified length."
+
+    def build_config(self):
+        """Return the field configuration for the arguments of ``build``."""
+        return {
+            "texts": {
+                "display_name": "Texts",
+                "info": "Texts to split.",
+                "input_types": ["Text"],
+            },
+            "separators": {
+                "display_name": "Separators",
+                "info": 'The characters to split on.\nIf left empty defaults to [" "].',
+                "is_list": True,
+            },
+            "chunk_size": {
+                "display_name": "Chunk Size",
+                "info": "The maximum length of each chunk.",
+                "field_type": "int",
+                "value": 1000,
+            },
+            "chunk_overlap": {
+                "display_name": "Chunk Overlap",
+                "info": "The amount of overlap between chunks.",
+                "field_type": "int",
+                "value": 200,
+            },
+            "recursive": {
+                "display_name": "Recursive",
+            },
+            "code": {"show": False},
+        }
+
+    def build(
+        self,
+        texts: list[Text],
+        separators: Optional[list[str]] = [" "],
+        chunk_size: Optional[int] = 1000,
+        chunk_overlap: Optional[int] = 200,
+        recursive: bool = False,
+    ) -> list[Record]:
+        """Split ``texts`` into chunks and return them as records.
+
+        Args:
+            texts: Texts to split; each one becomes a ``Document`` before splitting.
+            separators: Strings to split on; ``None`` or empty falls back to ``[" "]``.
+            chunk_size: Maximum chunk length; ``None`` falls back to 1000.
+            chunk_overlap: Overlap between chunks; ``None`` falls back to 200.
+            recursive: Split recursively on every separator; otherwise only the first is used.
+
+        Returns:
+            The records ``self.to_records`` builds from the split documents.
+        """
+        # The list default is only rebound here, never mutated, so sharing it is safe.
+        separators = [unescape_string(x) for x in separators or [" "]]
+        # Sizes may arrive as strings (or be None), so normalise them to ints.
+        chunk_size = 1000 if chunk_size is None else int(chunk_size)
+        chunk_overlap = 200 if chunk_overlap is None else int(chunk_overlap)
+
+        size_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
+        if recursive:
+            splitter = RecursiveCharacterTextSplitter(separators=separators, **size_kwargs)
+        else:
+            splitter = CharacterTextSplitter(separator=separators[0], **size_kwargs)
+
+        documents = [Document(page_content=text) for text in texts]
+        records = self.to_records(splitter.split_documents(documents))
+        self.status = records
+        return records