refactor: Rename SplitContentComponent to SplitTextComponent and update related variables and descriptions

This commit is contained in:
Rodrigo 2024-06-17 14:25:32 -03:00
commit 6b5d4798ac

View file

@@ -7,18 +7,18 @@ from langflow.template import Output
from langflow.utils.util import unescape_string
class SplitContentComponent(Component):
display_name: str = "Split Content"
description: str = "Split textual content into chunks based on specified criteria."
class SplitTextComponent(Component):
display_name: str = "Split Text"
description: str = "Split text into chunks based on specified criteria."
icon = "split"
inputs = [
HandleInput(name="data", display_name="Data", info="Data with text to split.", input_types=["Data"]),
TextInput(
name="content_key",
display_name="Content Key",
name="text_key",
display_name="Text Key",
info="The key to access the text content in the Data object.",
value="content",
value="text",
),
TextInput(
name="separator",
@@ -27,25 +27,18 @@ class SplitContentComponent(Component):
value="\n",
advanced=True,
),
IntInput(
name="chunk_size",
display_name="Chunk Size",
info="The target length (in number of characters) of each chunk.",
value=0,
advanced=True,
),
IntInput(
name="min_chunk_size",
display_name="Minimum Chunk Size",
info="The minimum size of chunks. Smaller chunks will be merged.",
value=0,
value=10,
advanced=True,
),
IntInput(
name="max_chunk_size",
display_name="Maximum Chunk Size",
info="The maximum size of chunks. Larger chunks will be split.",
value=0,
value=200,
advanced=True,
),
]
@@ -56,33 +49,39 @@ class SplitContentComponent(Component):
def split_text(self) -> List[Data]:
data = self.data if isinstance(self.data, list) else [self.data]
content_key = self.content_key
text_key = self.text_key
separator = unescape_string(self.separator)
chunk_size = self.chunk_size
min_chunk_size = self.min_chunk_size
max_chunk_size = self.max_chunk_size
results = []
if not separator:
raise ValueError("Separator cannot be empty.")
if max_chunk_size < 10:
raise ValueError("Maximum chunk size cannot be less than 10 characters.")
if min_chunk_size < 10:
raise ValueError("Minimum chunk size cannot be less than 10 characters.")
if max_chunk_size < min_chunk_size:
raise ValueError("Maximum chunk size cannot be less than minimum chunk size.")
buffer = ""
for row in data:
content = row.data.get(content_key, "")
if chunk_size > 0:
chunks = [content[i : i + chunk_size] for i in range(0, len(content), chunk_size)]
else:
chunks = content.split(separator)
text = row.data.get(text_key, "")
chunks = text.split(separator)
for chunk in chunks:
buffer += chunk
while len(buffer) >= max_chunk_size:
results.append(Data(data={"parent": content, "chunk": buffer[:max_chunk_size]}))
results.append(Data(data={"parent": text, "chunk": buffer[:max_chunk_size]}))
buffer = buffer[max_chunk_size:]
if len(buffer) >= min_chunk_size:
results.append(Data(data={"parent": content, "chunk": buffer}))
results.append(Data(data={"parent": text, "chunk": buffer}))
buffer = ""
# Handle any remaining content that may not meet the min_chunk_size requirement
# Handle any remaining text that may not meet the min_chunk_size requirement
if buffer:
results.append(Data(data={"parent": content, "chunk": buffer}))
results.append(Data(data={"parent": text, "chunk": buffer}))
self.status = results
return results