From 40fd956c9cd41828fd1b36b7cc48598e6e291d81 Mon Sep 17 00:00:00 2001 From: Rodrigo Date: Wed, 19 Jun 2024 01:13:26 -0300 Subject: [PATCH] refactor: Update URLComponent and ParseDataComponent This commit updates the URLComponent and ParseDataComponent classes in the URL.py and ParseData.py files respectively. In URLComponent, the 'value' attribute in the 'urls' input has been removed, improving code clarity. In ParseDataComponent, the 'value' attribute in the 'sep' input has been changed from '---' to '\n', enhancing the functionality of the component. These changes optimize the code logic and ensure consistency in the codebase. --- .../base/langflow/components/data/URL.py | 1 - .../components/experimental/SplitText.py | 98 ++++++++----------- .../langflow/components/helpers/Memory.py | 19 ++-- .../langflow/components/helpers/ParseData.py | 2 +- .../components/models/AnthropicModel.py | 1 + .../components/vectorstores/Chroma.py | 5 +- src/backend/base/langflow/inputs/inputs.py | 3 +- 7 files changed, 60 insertions(+), 69 deletions(-) diff --git a/src/backend/base/langflow/components/data/URL.py b/src/backend/base/langflow/components/data/URL.py index e20b526d7..f8fc12bbf 100644 --- a/src/backend/base/langflow/components/data/URL.py +++ b/src/backend/base/langflow/components/data/URL.py @@ -18,7 +18,6 @@ class URLComponent(Component): name="urls", display_name="URLs", info="Enter one or more URLs, separated by commas.", - value="", is_list=True, ), ] diff --git a/src/backend/base/langflow/components/experimental/SplitText.py b/src/backend/base/langflow/components/experimental/SplitText.py index 0789d1422..5e0d7f178 100644 --- a/src/backend/base/langflow/components/experimental/SplitText.py +++ b/src/backend/base/langflow/components/experimental/SplitText.py @@ -1,45 +1,42 @@ from typing import List +from langchain_text_splitters import CharacterTextSplitter from langflow.custom import Component -from langflow.inputs import HandleInput, IntInput, TextInput +from langflow.inputs import IntInput, TextInput, HandleInput from langflow.schema import Data from langflow.template import Output from langflow.utils.util import unescape_string - class SplitTextComponent(Component): display_name: str = "Split Text" description: str = "Split text into chunks based on specified criteria." icon = "scissors-line-dashed" inputs = [ - HandleInput(name="data", display_name="Data", info="Data with text to split.", input_types=["Data"]), - TextInput( - name="text_key", - display_name="Text Key", - info="The key to access the text content in the Data object.", - value="text", + HandleInput( + name="data_inputs", + display_name="Data Inputs", + info="The data to split.", + input_types=["Data"], + is_list=True, + ), + IntInput( + name="chunk_overlap", + display_name="Chunk Overlap", + info="Number of characters to overlap between chunks.", + value=200, + ), + IntInput( + name="chunk_size", + display_name="Chunk Size", + info="The maximum number of characters in each chunk.", + value=1000, ), TextInput( name="separator", display_name="Separator", - info='The character to split on. Defaults to "\n".', + info="The character to split on. Defaults to newline.", value="\n", - advanced=True, - ), - IntInput( - name="min_chunk_size", - display_name="Minimum Chunk Size", - info="The minimum size of chunks. Smaller chunks will be merged.", - value=10, - advanced=True, - ), - IntInput( - name="max_chunk_size", - display_name="Maximum Chunk Size", - info="The maximum size of chunks. Larger chunks will be split.", - value=200, - advanced=True, ), ] @@ -47,41 +44,26 @@ class SplitTextComponent(Component): Output(display_name="Chunks", name="chunks", method="split_text"), ] + def _docs_to_data(self, docs): + data = [] + for doc in docs: + data.append(Data(text=doc.page_content, data=doc.metadata)) + return data + def split_text(self) -> List[Data]: - data = self.data if isinstance(self.data, list) else [self.data] - text_key = self.text_key separator = unescape_string(self.separator) - min_chunk_size = self.min_chunk_size - max_chunk_size = self.max_chunk_size - results = [] - if not separator: - raise ValueError("Separator cannot be empty.") - if max_chunk_size < 10: - raise ValueError("Maximum chunk size cannot be less than 10 characters.") - if min_chunk_size < 10: - raise ValueError("Minimum chunk size cannot be less than 10 characters.") - if max_chunk_size < min_chunk_size: - raise ValueError("Maximum chunk size cannot be less than minimum chunk size.") + documents = [] + for _input in self.data_inputs: + if isinstance(_input, Data): + documents.append(_input.to_lc_document()) - buffer = "" - - for row in data: - parent = row.data.get(text_key, "") - chunks = parent.split(separator) - - for chunk in chunks: - buffer += chunk - while len(buffer) >= max_chunk_size: - results.append(Data(data={"parent": parent, "text": buffer[:max_chunk_size]})) - buffer = buffer[max_chunk_size:] - if len(buffer) >= min_chunk_size: - results.append(Data(data={"parent": parent, "text": buffer})) - buffer = "" - - # Handle any remaining text that may not meet the min_chunk_size requirement - if buffer: - results.append(Data(parent=parent, text=buffer)) - - self.status = results - return results + splitter = CharacterTextSplitter( + chunk_overlap=self.chunk_overlap, + chunk_size=self.chunk_size, + separator=separator, + ) + docs = splitter.split_documents(documents) + data = self._docs_to_data(docs) + self.status = data + return data diff --git a/src/backend/base/langflow/components/helpers/Memory.py b/src/backend/base/langflow/components/helpers/Memory.py index 8c2231d79..21b2770f0 100644 --- a/src/backend/base/langflow/components/helpers/Memory.py +++ b/src/backend/base/langflow/components/helpers/Memory.py @@ -4,12 +4,13 @@ from langflow.memory import get_messages from langflow.schema import Data from langflow.schema.message import Message from langflow.template import Output +from langflow.helpers.data import data_to_text class MemoryComponent(Component): - display_name = "Memory" + display_name = "Chat Memory" description = "Retrieves stored chat messages." - icon = "history" + icon = "message-square-more" inputs = [ DropdownInput( @@ -47,11 +48,18 @@ class MemoryComponent(Component): info="Order of the messages.", advanced=True, ), + MultilineInput( + name="template", + display_name="Template", + info="The template to use for formatting the data. It can contain the keys {text}, {sender} or any other key in the message data.", + value="{sender_name}: {text}", + advanced=True, + ), ] outputs = [ - Output(display_name="Message Data", name="messages", method="retrieve_messages"), - Output(display_name="Parsed", name="messages_text", method="retrieve_messages_as_text"), + Output(display_name="Chat History", name="messages", method="retrieve_messages"), + Output(display_name="Messages (Text)", name="messages_text", method="retrieve_messages_as_text"), ] def retrieve_messages(self) -> Data: @@ -75,7 +83,6 @@ class MemoryComponent(Component): return messages def retrieve_messages_as_text(self) -> Message: - messages = self.retrieve_messages() - messages_text = "\n".join(["{sender_name}: {text}".format(**message.data) for message in messages]) + messages_text = data_to_text(self.template, self.retrieve_messages()) self.status = messages_text return Message(text=messages_text) diff --git a/src/backend/base/langflow/components/helpers/ParseData.py b/src/backend/base/langflow/components/helpers/ParseData.py index 375830a02..c1a3c7101 100644 --- a/src/backend/base/langflow/components/helpers/ParseData.py +++ b/src/backend/base/langflow/components/helpers/ParseData.py @@ -22,7 +22,7 @@ class ParseDataComponent(Component): name="sep", display_name="Separator", advanced=True, - value='---' + value='\n' ) ] diff --git a/src/backend/base/langflow/components/models/AnthropicModel.py b/src/backend/base/langflow/components/models/AnthropicModel.py index 24aec2384..716afc4d9 100644 --- a/src/backend/base/langflow/components/models/AnthropicModel.py +++ b/src/backend/base/langflow/components/models/AnthropicModel.py @@ -19,6 +19,7 @@ class AnthropicModelComponent(LCModelComponent): name="max_tokens", display_name="Max Tokens", advanced=True, + value=4096, info="The maximum number of tokens to generate. Set to 0 for unlimited tokens.", ), DropdownInput( diff --git a/src/backend/base/langflow/components/vectorstores/Chroma.py b/src/backend/base/langflow/components/vectorstores/Chroma.py index 8a7e34d98..ac3629e13 100644 --- a/src/backend/base/langflow/components/vectorstores/Chroma.py +++ b/src/backend/base/langflow/components/vectorstores/Chroma.py @@ -37,7 +37,6 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent): TextInput( name="search_query", display_name="Search Query", - is_list=True, ), DataInput( name="ingest_data", @@ -146,11 +145,12 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent): self.status = "" return + + _stored_documents_without_id = [] if self.allow_duplicates: stored_data = [] else: stored_data = chroma_collection_to_data(vector_store.get(self.limit)) - _stored_documents_without_id = [] for value in deepcopy(stored_data): del value.id _stored_documents_without_id.append(value) @@ -192,4 +192,5 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent): search_results = self.search_with_vector_store( self.search_query, self.search_type, vector_store, k=self.number_of_results ) + self.status = search_results return search_results diff --git a/src/backend/base/langflow/inputs/inputs.py b/src/backend/base/langflow/inputs/inputs.py index bc3a65339..10163740c 100644 --- a/src/backend/base/langflow/inputs/inputs.py +++ b/src/backend/base/langflow/inputs/inputs.py @@ -123,7 +123,8 @@ class TextInput(StrInput): input_types (list[str]): A list of input types that this component supports. In this case, it supports the "Message" input type. """ - input_types: list[str] = ["Message"] + # ! adding str since it's checked on line 143 + input_types: list[str] = ["Message", "str"] @staticmethod def _validate_value(v: Any, _info):