refactor: Update URLComponent and ParseDataComponent

This commit updates the URLComponent and ParseDataComponent classes in the URL.py and ParseData.py files, respectively.

In URLComponent, the 'value' attribute in the 'urls' input has been removed, improving code clarity.

In ParseDataComponent, the 'value' attribute in the 'sep' input has been changed from '---' to '\n', enhancing the functionality of the component.

These changes simplify the components' default input configuration and keep default values consistent across the codebase.
This commit is contained in:
Rodrigo 2024-06-19 01:13:26 -03:00
commit 40fd956c9c
7 changed files with 60 additions and 69 deletions

View file

@@ -18,7 +18,6 @@ class URLComponent(Component):
name="urls",
display_name="URLs",
info="Enter one or more URLs, separated by commas.",
value="",
is_list=True,
),
]

View file

@@ -1,45 +1,42 @@
from typing import List
from langchain_text_splitters import CharacterTextSplitter
from langflow.custom import Component
from langflow.inputs import HandleInput, IntInput, TextInput
from langflow.inputs import IntInput, TextInput, HandleInput
from langflow.schema import Data
from langflow.template import Output
from langflow.utils.util import unescape_string
class SplitTextComponent(Component):
display_name: str = "Split Text"
description: str = "Split text into chunks based on specified criteria."
icon = "scissors-line-dashed"
inputs = [
HandleInput(name="data", display_name="Data", info="Data with text to split.", input_types=["Data"]),
TextInput(
name="text_key",
display_name="Text Key",
info="The key to access the text content in the Data object.",
value="text",
HandleInput(
name="data_inputs",
display_name="Data Inputs",
info="The data to split.",
input_types=["Data"],
is_list=True,
),
IntInput(
name="chunk_overlap",
display_name="Chunk Overlap",
info="Number of characters to overlap between chunks.",
value=200,
),
IntInput(
name="chunk_size",
display_name="Chunk Size",
info="The maximum number of characters in each chunk.",
value=1000,
),
TextInput(
name="separator",
display_name="Separator",
info='The character to split on. Defaults to "\n".',
info="The character to split on. Defaults to newline.",
value="\n",
advanced=True,
),
IntInput(
name="min_chunk_size",
display_name="Minimum Chunk Size",
info="The minimum size of chunks. Smaller chunks will be merged.",
value=10,
advanced=True,
),
IntInput(
name="max_chunk_size",
display_name="Maximum Chunk Size",
info="The maximum size of chunks. Larger chunks will be split.",
value=200,
advanced=True,
),
]
@@ -47,41 +44,26 @@ class SplitTextComponent(Component):
Output(display_name="Chunks", name="chunks", method="split_text"),
]
def _docs_to_data(self, docs):
data = []
for doc in docs:
data.append(Data(text=doc.page_content, data=doc.metadata))
return data
def split_text(self) -> List[Data]:
data = self.data if isinstance(self.data, list) else [self.data]
text_key = self.text_key
separator = unescape_string(self.separator)
min_chunk_size = self.min_chunk_size
max_chunk_size = self.max_chunk_size
results = []
if not separator:
raise ValueError("Separator cannot be empty.")
if max_chunk_size < 10:
raise ValueError("Maximum chunk size cannot be less than 10 characters.")
if min_chunk_size < 10:
raise ValueError("Minimum chunk size cannot be less than 10 characters.")
if max_chunk_size < min_chunk_size:
raise ValueError("Maximum chunk size cannot be less than minimum chunk size.")
documents = []
for _input in self.data_inputs:
if isinstance(_input, Data):
documents.append(_input.to_lc_document())
buffer = ""
for row in data:
parent = row.data.get(text_key, "")
chunks = parent.split(separator)
for chunk in chunks:
buffer += chunk
while len(buffer) >= max_chunk_size:
results.append(Data(data={"parent": parent, "text": buffer[:max_chunk_size]}))
buffer = buffer[max_chunk_size:]
if len(buffer) >= min_chunk_size:
results.append(Data(data={"parent": parent, "text": buffer}))
buffer = ""
# Handle any remaining text that may not meet the min_chunk_size requirement
if buffer:
results.append(Data(parent=parent, text=buffer))
self.status = results
return results
splitter = CharacterTextSplitter(
chunk_overlap=self.chunk_overlap,
chunk_size=self.chunk_size,
separator=separator,
)
docs = splitter.split_documents(documents)
data = self._docs_to_data(docs)
self.status = data
return data

View file

@@ -4,12 +4,13 @@ from langflow.memory import get_messages
from langflow.schema import Data
from langflow.schema.message import Message
from langflow.template import Output
from langflow.helpers.data import data_to_text
class MemoryComponent(Component):
display_name = "Memory"
display_name = "Chat Memory"
description = "Retrieves stored chat messages."
icon = "history"
icon = "message-square-more"
inputs = [
DropdownInput(
@@ -47,11 +48,18 @@ class MemoryComponent(Component):
info="Order of the messages.",
advanced=True,
),
MultilineInput(
name="template",
display_name="Template",
info="The template to use for formatting the data. It can contain the keys {text}, {sender} or any other key in the message data.",
value="{sender_name}: {text}",
advanced=True,
),
]
outputs = [
Output(display_name="Message Data", name="messages", method="retrieve_messages"),
Output(display_name="Parsed", name="messages_text", method="retrieve_messages_as_text"),
Output(display_name="Chat History", name="messages", method="retrieve_messages"),
Output(display_name="Messages (Text)", name="messages_text", method="retrieve_messages_as_text"),
]
def retrieve_messages(self) -> Data:
@@ -75,7 +83,6 @@ class MemoryComponent(Component):
return messages
def retrieve_messages_as_text(self) -> Message:
messages = self.retrieve_messages()
messages_text = "\n".join(["{sender_name}: {text}".format(**message.data) for message in messages])
messages_text = data_to_text(self.template, self.retrieve_messages())
self.status = messages_text
return Message(text=messages_text)

View file

@@ -22,7 +22,7 @@ class ParseDataComponent(Component):
name="sep",
display_name="Separator",
advanced=True,
value='---'
value='\n'
)
]

View file

@@ -19,6 +19,7 @@ class AnthropicModelComponent(LCModelComponent):
name="max_tokens",
display_name="Max Tokens",
advanced=True,
value=4096,
info="The maximum number of tokens to generate. Set to 0 for unlimited tokens.",
),
DropdownInput(

View file

@@ -37,7 +37,6 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent):
TextInput(
name="search_query",
display_name="Search Query",
is_list=True,
),
DataInput(
name="ingest_data",
@@ -146,11 +145,12 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent):
self.status = ""
return
_stored_documents_without_id = []
if self.allow_duplicates:
stored_data = []
else:
stored_data = chroma_collection_to_data(vector_store.get(self.limit))
_stored_documents_without_id = []
for value in deepcopy(stored_data):
del value.id
_stored_documents_without_id.append(value)
@@ -192,4 +192,5 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent):
search_results = self.search_with_vector_store(
self.search_query, self.search_type, vector_store, k=self.number_of_results
)
self.status = search_results
return search_results

View file

@@ -123,7 +123,8 @@ class TextInput(StrInput):
input_types (list[str]): A list of input types that this component supports. In this case, it supports the "Message" input type.
"""
input_types: list[str] = ["Message"]
# ! adding str since it's checked on line 143
input_types: list[str] = ["Message", "str"]
@staticmethod
def _validate_value(v: Any, _info):