Add SplitTextComponent to split text into chunks
This commit is contained in:
parent
f139ecb35d
commit
1672339992
1 changed files with 87 additions and 0 deletions
87
src/backend/base/langflow/components/helpers/SplitText.py
Normal file
87
src/backend/base/langflow/components/helpers/SplitText.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
from typing import Optional
|
||||
|
||||
from langchain.text_splitter import (
|
||||
RecursiveCharacterTextSplitter,
|
||||
CharacterTextSplitter,
|
||||
)
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langflow.interface.custom.custom_component import CustomComponent
|
||||
from langflow.schema import Record
|
||||
from langflow.field_typing import Text
|
||||
from langflow.utils.util import build_loader_repr_from_records, unescape_string
|
||||
|
||||
|
||||
class SplitTextComponent(CustomComponent):
    """Custom component that splits input texts into smaller chunks.

    The texts are wrapped in LangChain ``Document`` objects and split with
    either ``RecursiveCharacterTextSplitter`` or ``CharacterTextSplitter``,
    depending on the ``recursive`` flag.
    """

    display_name: str = "Split Text"
    description: str = "Split text into chunks of a specified length."

    def build_config(self):
        """Return the per-field configuration dictionary for this component.

        Keys correspond to the parameters of :meth:`build`; the ``code``
        entry is configured with ``show`` set to ``False``.
        """
        return {
            "texts": {
                "display_name": "Texts",
                "info": "Texts to split.",
                "input_types": ["Text"],
            },
            "separators": {
                "display_name": "Separators",
                "info": 'The characters to split on.\nIf left empty defaults to [" "].',
                "is_list": True,
            },
            "chunk_size": {
                "display_name": "Chunk Size",
                "info": "The maximum length of each chunk.",
                "field_type": "int",
                "value": 1000,
            },
            "chunk_overlap": {
                "display_name": "Chunk Overlap",
                "info": "The amount of overlap between chunks.",
                "field_type": "int",
                "value": 200,
            },
            "recursive": {
                "display_name": "Recursive",
            },
            "code": {"show": False},
        }

    def build(
        self,
        texts: list[Text],
        # NOTE: the list default is kept for signature compatibility; it is
        # never mutated in place (the name is rebound to a new list below).
        separators: Optional[list[str]] = [" "],
        chunk_size: Optional[int] = 1000,
        chunk_overlap: Optional[int] = 200,
        recursive: bool = False,
    ) -> list[Record]:
        """Split ``texts`` into chunks and return them as records.

        Args:
            texts: Texts to split. A single bare string is treated as one
                text rather than being iterated character by character;
                ``None`` is treated as no input.
            separators: Separators to split on. ``None`` or an empty list
                falls back to ``[" "]``, as the field's help text states.
                Only the first separator is used when ``recursive`` is
                false.
            chunk_size: Maximum chunk length; string values are converted
                with ``int``. ``None`` falls back to 1000.
            chunk_overlap: Overlap between chunks; string values are
                converted with ``int``. ``None`` falls back to 200.
            recursive: When true use ``RecursiveCharacterTextSplitter`` with
                all separators, otherwise ``CharacterTextSplitter`` with the
                first separator only.

        Returns:
            The result of ``self.to_records`` applied to the split documents
            (also stored in ``self.status``).

        Raises:
            ValueError: If ``chunk_size`` or ``chunk_overlap`` is a string
                that ``int`` cannot parse.
        """
        if texts is None:
            texts = []
        elif isinstance(texts, str):
            # Iterating a bare string would yield one document per character.
            texts = [texts]

        # Honour the documented fallback instead of failing on None (not
        # iterable) or on an empty list (``separators[0]`` below).
        if not separators:
            separators = [" "]
        # presumably turns escaped sequences such as "\\n" into the real
        # characters; verify against langflow.utils.util.unescape_string
        separators = [unescape_string(sep) for sep in separators]

        # Make sure chunk_size and chunk_overlap are ints; values may arrive
        # as strings, and None falls back to the signature defaults.
        if chunk_size is None:
            chunk_size = 1000
        elif isinstance(chunk_size, str):
            chunk_size = int(chunk_size)
        if chunk_overlap is None:
            chunk_overlap = 200
        elif isinstance(chunk_overlap, str):
            chunk_overlap = int(chunk_overlap)

        if recursive:
            splitter = RecursiveCharacterTextSplitter(
                separators=separators,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            # CharacterTextSplitter takes a single separator.
            splitter = CharacterTextSplitter(
                separator=separators[0],
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )

        documents = [Document(page_content=text) for text in texts]

        records = self.to_records(splitter.split_documents(documents))
        self.status = records
        return records
|
||||
Loading…
Add table
Add a link
Reference in a new issue