From f080049526cc4b7c68559dc893e6ff29da903467 Mon Sep 17 00:00:00 2001 From: Cristhian Zanforlin Lousa Date: Wed, 8 Jan 2025 10:01:39 -0300 Subject: [PATCH] fix: add tests and adjustments to Chroma component (#5571) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Refactor: Update EditNodeComponent to hide table options and block hide * 🐛 (model.py): fix issue with search_documents method not returning empty list when search_query is empty ♻️ (model.py): refactor search_documents method to handle search_query logic more efficiently 📝 (chroma.py): add typing_extensions override import for build_vector_store method to improve code readability * ✨ (tests): add unit tests for ChromaVectorStoreComponent - Introduced new test suite for ChromaVectorStoreComponent, covering various functionalities including database creation, collection management, similarity and MMR searches, and duplicate handling. - Implemented tests for creating collections with and without data, ensuring proper functionality and data integrity. - Verified search capabilities with different query types and result limits, enhancing overall test coverage for the component. 
* fix: remove unnecessary whitespace in model.py and add missing import in chroma.py * fix: mypy error module has no attribute "timeout" * ♻️ (async_helpers.py): Remove unnecessary type hint ignore comment from timeout_context function * 📝 (async_helpers.py): add a comment with issue reference PGH003 to document the reason for ignoring type checking in timeout_context function * ♻️ (async_helpers.py): Remove unnecessary type hint comment to improve code readability and maintainability * ♻️ (async_helpers.py): Add type ignore comment to suppress miscellaneous type error for timeout_context function * ♻️ (async_helpers.py): refactor timeout_context function to remove unnecessary type ignore comments and improve code readability * [autofix.ci] apply automated fixes * 📝 (async_helpers.py): add a blank line for better code readability and consistency * fix: mypy error: incompatible redefinition --------- Co-authored-by: anovazzi1 Co-authored-by: Gabriel Luiz Freitas Almeida Co-authored-by: italojohnny Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- .../base/langflow/base/vectorstores/model.py | 10 +- .../base/langflow/components/tools/mcp_sse.py | 4 +- .../components/vectorstores/chroma.py | 2 + .../base/langflow/utils/async_helpers.py | 17 + .../unit/components/vectorstores/__init__.py | 0 .../test_chroma_vector_store_component.py | 320 ++++++++++++++++++ .../components/editNodeComponent/index.tsx | 1 + 7 files changed, 347 insertions(+), 7 deletions(-) create mode 100644 src/backend/tests/unit/components/vectorstores/__init__.py create mode 100644 src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py diff --git a/src/backend/base/langflow/base/vectorstores/model.py b/src/backend/base/langflow/base/vectorstores/model.py index ee157c037..0c58c96fb 100644 --- a/src/backend/base/langflow/base/vectorstores/model.py +++ b/src/backend/base/langflow/base/vectorstores/model.py @@ -122,17 +122,17 @@ class 
LCVectorStoreComponent(Component): def search_documents(self) -> list[Data]: """Search for documents in the vector store.""" - search_query: str = self.search_query - if not search_query: - self.status = "" - return [] - if self._cached_vector_store is not None: vector_store = self._cached_vector_store else: vector_store = self.build_vector_store() self._cached_vector_store = vector_store + search_query: str = self.search_query + if not search_query: + self.status = "" + return [] + self.log(f"Search input: {search_query}") self.log(f"Search type: {self.search_type}") self.log(f"Number of results: {self.number_of_results}") diff --git a/src/backend/base/langflow/components/tools/mcp_sse.py b/src/backend/base/langflow/components/tools/mcp_sse.py index 876c05c7d..8c813a574 100644 --- a/src/backend/base/langflow/components/tools/mcp_sse.py +++ b/src/backend/base/langflow/components/tools/mcp_sse.py @@ -1,5 +1,4 @@ # from langflow.field_typing import Data -import asyncio from contextlib import AsyncExitStack import httpx @@ -11,6 +10,7 @@ from langflow.components.tools.mcp_stdio import create_input_schema_from_json_sc from langflow.custom import Component from langflow.field_typing import Tool from langflow.io import MessageTextInput, Output +from langflow.utils.async_helpers import timeout_context # Define constant for status code HTTP_TEMPORARY_REDIRECT = 307 @@ -39,7 +39,7 @@ class MCPSseClient: headers = {} url = await self.pre_check_redirect(url) - async with asyncio.timeout(timeout_seconds): + async with timeout_context(timeout_seconds): sse_transport = await self.exit_stack.enter_async_context( sse_client(url, headers, timeout_seconds, sse_read_timeout_seconds) ) diff --git a/src/backend/base/langflow/components/vectorstores/chroma.py b/src/backend/base/langflow/components/vectorstores/chroma.py index c009443b0..970b53530 100644 --- a/src/backend/base/langflow/components/vectorstores/chroma.py +++ b/src/backend/base/langflow/components/vectorstores/chroma.py @@ 
-2,6 +2,7 @@ from copy import deepcopy
 
 from chromadb.config import Settings
 from langchain_chroma import Chroma
+from typing_extensions import override
 
 from langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store
 from langflow.base.vectorstores.utils import chroma_collection_to_data
@@ -82,6 +83,7 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent):
         ),
     ]
 
+    @override
     @check_cached_vector_store
     def build_vector_store(self) -> Chroma:
         """Builds the Chroma object."""
diff --git a/src/backend/base/langflow/utils/async_helpers.py b/src/backend/base/langflow/utils/async_helpers.py
index 1b0ba25b4..68b68e147 100644
--- a/src/backend/base/langflow/utils/async_helpers.py
+++ b/src/backend/base/langflow/utils/async_helpers.py
@@ -1,4 +1,49 @@
 import asyncio
+from contextlib import asynccontextmanager
+
+if hasattr(asyncio, "timeout"):
+
+    @asynccontextmanager
+    async def timeout_context(timeout_seconds):
+        """Bound the enclosed ``async with`` body to ``timeout_seconds`` (Python 3.11+).
+
+        Yields the ``asyncio.Timeout`` object; raises ``TimeoutError`` when the deadline passes.
+        """
+        # asyncio.timeout() only implements the *async* context-manager protocol,
+        # so it must be entered with ``async with`` (a plain ``with`` raises TypeError).
+        async with asyncio.timeout(timeout_seconds) as ctx:
+            yield ctx
+else:
+
+    @asynccontextmanager
+    async def timeout_context(timeout_seconds):
+        """Fallback for Python < 3.11: cancel the current task once the deadline passes.
+
+        Raises ``TimeoutError`` if the enclosed body runs longer than ``timeout_seconds``;
+        ``None`` disables the deadline, mirroring ``asyncio.timeout(None)``.
+        """
+        task = asyncio.current_task()
+        timed_out = False
+
+        def _expire():
+            nonlocal timed_out
+            timed_out = True
+            task.cancel()
+
+        handle = None
+        if timeout_seconds is not None:
+            handle = asyncio.get_running_loop().call_later(timeout_seconds, _expire)
+        try:
+            yield None
+        except asyncio.CancelledError as e:
+            # Only translate the cancellation we triggered ourselves; propagate external ones.
+            if not timed_out:
+                raise
+            msg = f"Operation timed out after {timeout_seconds} seconds"
+            raise TimeoutError(msg) from e
+        finally:
+            if handle is not None:
+                handle.cancel()
 
 
 def run_until_complete(coro):
diff --git a/src/backend/tests/unit/components/vectorstores/__init__.py b/src/backend/tests/unit/components/vectorstores/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py b/src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py
new file mode 100644
index 000000000..5775233d8
--- /dev/null
+++ b/src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py
@@ -0,0 +1,320 @@
+import os
+from pathlib import Path
+from typing import Any
+
+import pytest
+from langflow.components.vectorstores.chroma import 
ChromaVectorStoreComponent +from langflow.schema.data import Data + +from tests.base import ComponentTestBaseWithoutClient, VersionComponentMapping + + +@pytest.mark.api_key_required +class TestChromaVectorStoreComponent(ComponentTestBaseWithoutClient): + @pytest.fixture + def component_class(self) -> type[Any]: + """Return the component class to test.""" + return ChromaVectorStoreComponent + + @pytest.fixture + def default_kwargs(self, tmp_path: Path) -> dict[str, Any]: + """Return the default kwargs for the component.""" + from langflow.components.embeddings.openai import OpenAIEmbeddingsComponent + + if os.getenv("OPENAI_API_KEY") is None: + pytest.skip("OPENAI_API_KEY is not set") + + api_key = os.getenv("OPENAI_API_KEY") + + return { + "embedding": OpenAIEmbeddingsComponent(openai_api_key=api_key).build_embeddings(), + "collection_name": "test_collection", + "persist_directory": tmp_path, + } + + @pytest.fixture + def file_names_mapping(self) -> list[VersionComponentMapping]: + """Return the file names mapping for different versions.""" + return [ + {"version": "1.0.19", "module": "vectorstores", "file_name": "Chroma"}, + {"version": "1.1.0", "module": "vectorstores", "file_name": "chroma"}, + {"version": "1.1.1", "module": "vectorstores", "file_name": "chroma"}, + ] + + def test_create_db(self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any]) -> None: + """Test the create_collection method.""" + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + persist_directory = default_kwargs["persist_directory"] + assert persist_directory.exists() + assert persist_directory.is_dir() + # Assert it isn't empty + assert len(list(persist_directory.iterdir())) > 0 + # Assert there's a chroma.sqlite3 file + assert (persist_directory / "chroma.sqlite3").exists() + assert (persist_directory / "chroma.sqlite3").is_file() + + def test_create_collection_with_data( + self, 
component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the create_collection method with data.""" + # set ingest_data in default_kwargs to a list of Data objects + test_texts = ["test data 1", "test data 2", "something completely different"] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_texts] + + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Verify collection exists and has the correct data + collection = vector_store._collection + assert collection.name == default_kwargs["collection_name"] + assert collection.count() == len(test_texts) + + def test_similarity_search( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the similarity search functionality through the component.""" + # Create test data with distinct topics + test_data = [ + "The quick brown fox jumps over the lazy dog", + "Python is a popular programming language", + "Machine learning models process data", + "The lazy dog sleeps all day long", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + default_kwargs["search_type"] = "Similarity" + default_kwargs["number_of_results"] = 2 + + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test similarity search through the component + component.set(search_query="dog sleeping") + results = component.search_documents() + + assert len(results) == 2 + # The most relevant results should be about dogs + assert any("dog" in result.text.lower() for result in results) + + # Test with different number of results + component.set(number_of_results=3) + results = component.search_documents() + assert len(results) == 3 + + def test_mmr_search( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test 
the MMR search functionality through the component.""" + # Create test data with some similar documents + test_data = [ + "The quick brown fox jumps", + "The quick brown fox leaps", + "The quick brown fox hops", + "Something completely different about cats", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + default_kwargs["search_type"] = "MMR" + default_kwargs["number_of_results"] = 3 + + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test MMR search through the component + component.set(search_query="quick fox") + results = component.search_documents() + + assert len(results) == 3 + # Results should be diverse but relevant + assert any("fox" in result.text.lower() for result in results) + + # Test with different settings + component.set(number_of_results=2) + diverse_results = component.search_documents() + assert len(diverse_results) == 2 + + def test_search_with_different_types( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test search with different search types.""" + test_data = [ + "The quick brown fox jumps over the lazy dog", + "Python is a popular programming language", + "Machine learning models process data", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + default_kwargs["number_of_results"] = 2 + + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test similarity search + component.set(search_type="Similarity", search_query="programming languages") + similarity_results = component.search_documents() + assert len(similarity_results) == 2 + assert any("python" in result.text.lower() for result in similarity_results) + + # Test MMR search + component.set(search_type="MMR", search_query="programming languages") + mmr_results = component.search_documents() + assert len(mmr_results) == 2 + + # Test 
with empty query + component.set(search_query="") + empty_results = component.search_documents() + assert len(empty_results) == 0 + + def test_search_with_score( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the search with score functionality through the component.""" + test_data = [ + "The quick brown fox jumps over the lazy dog", + "Python is a popular programming language", + "Machine learning models process data", + ] + default_kwargs["ingest_data"] = [Data(text=text) for text in test_data] + default_kwargs["number_of_results"] = 2 + + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + component.build_vector_store() + + # Test search with score through the component + component.set( + search_type="similarity_score_threshold", search_query="programming languages", number_of_results=2 + ) + results = component.search_documents() + + assert len(results) == 2 + # Results should be sorted by relevance + assert any("python" in result.text.lower() for result in results) + assert any("programming" in result.text.lower() for result in results) + + # Test with different number of results + component.set(number_of_results=3) + results = component.search_documents() + assert len(results) == 3 + + def test_duplicate_handling( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test handling of duplicate documents.""" + # Create test data with duplicates + test_data = [ + Data(text_key="text", data={"text": "This is a test document"}), + Data(text_key="text", data={"text": "This is a test document"}), # Duplicate with exact same data + Data(text_key="text", data={"text": "This is another document"}), + ] + default_kwargs["ingest_data"] = test_data + default_kwargs["allow_duplicates"] = False + default_kwargs["limit"] = 100 # Set a high enough limit to get all documents + + component: ChromaVectorStoreComponent = 
component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get all documents + results = vector_store.get(limit=100) + + documents = results["documents"] + + # The documents are returned in a list structure + assert len(documents) == 3 # All documents are added, even duplicates + + # Count unique texts + unique_texts = set(documents) + assert len(unique_texts) == 2 # Should have 2 unique texts + + # Test with allow_duplicates=True + test_data = [ + Data(text_key="text", data={"text": "This is a test document"}), + Data(text_key="text", data={"text": "This is a test document"}), # Duplicate + ] + default_kwargs["ingest_data"] = test_data + default_kwargs["allow_duplicates"] = True + default_kwargs["collection_name"] = "test_collection_2" # Use a different collection name + + component = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get all documents + results = vector_store.get(limit=100) + documents = results["documents"] + + # With allow_duplicates=True, we should have both documents + assert len(documents) == 2 + assert all("test document" in doc for doc in documents) + + # Verify that we have the expected number of documents + assert vector_store._collection.count() == 2 + + def test_chroma_collection_to_data( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the chroma_collection_to_data function.""" + from langflow.base.vectorstores.utils import chroma_collection_to_data + + # Create a collection with documents and metadata + test_data = [ + Data(data={"text": "Document 1", "metadata_field": "value1"}), + Data(data={"text": "Document 2", "metadata_field": "value2"}), + ] + default_kwargs["ingest_data"] = test_data + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get the collection data + collection_dict = vector_store.get() + 
data_objects = chroma_collection_to_data(collection_dict) + + # Verify the conversion + assert len(data_objects) == 2 + for data_obj in data_objects: + assert isinstance(data_obj, Data) + assert "id" in data_obj.data + assert "text" in data_obj.data + assert data_obj.data["text"] in ["Document 1", "Document 2"] + assert "metadata_field" in data_obj.data + assert data_obj.data["metadata_field"] in ["value1", "value2"] + + def test_chroma_collection_to_data_without_metadata( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the chroma_collection_to_data function with documents that have no metadata.""" + from langflow.base.vectorstores.utils import chroma_collection_to_data + + # Create a collection with documents but no metadata + test_data = [ + Data(data={"text": "Simple document 1"}), + Data(data={"text": "Simple document 2"}), + ] + default_kwargs["ingest_data"] = test_data + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get the collection data + collection_dict = vector_store.get() + data_objects = chroma_collection_to_data(collection_dict) + + # Verify the conversion + assert len(data_objects) == 2 + for data_obj in data_objects: + assert isinstance(data_obj, Data) + assert "id" in data_obj.data + assert "text" in data_obj.data + assert data_obj.data["text"] in ["Simple document 1", "Simple document 2"] + + def test_chroma_collection_to_data_empty_collection( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test the chroma_collection_to_data function with an empty collection.""" + from langflow.base.vectorstores.utils import chroma_collection_to_data + + # Create an empty collection + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Get the collection data + 
collection_dict = vector_store.get() + data_objects = chroma_collection_to_data(collection_dict) + + # Verify the conversion + assert len(data_objects) == 0 diff --git a/src/frontend/src/modals/editNodeModal/components/editNodeComponent/index.tsx b/src/frontend/src/modals/editNodeModal/components/editNodeComponent/index.tsx index 5e7f23596..53bf2f4d8 100644 --- a/src/frontend/src/modals/editNodeModal/components/editNodeComponent/index.tsx +++ b/src/frontend/src/modals/editNodeModal/components/editNodeComponent/index.tsx @@ -34,6 +34,7 @@ export function EditNodeComponent({
{nodeClass && (