From caccdb79e26220b36fc963f6028c9e5a2292aabb Mon Sep 17 00:00:00 2001
From: Cristhian Zanforlin Lousa
Date: Fri, 1 Aug 2025 11:24:07 -0300
Subject: [PATCH] fix: Add metadata filtering for Chroma components (#9137)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 🐛 (chroma.py, local_db.py): Handle ImportError when importing filter_complex_metadata function to avoid crashing the application
  💡 (chroma.py, local_db.py): Add try-except block to gracefully handle ImportError and log a warning message if the function cannot be imported

* ♻️ (local_db.py): remove unnecessary try-except block and simplify document adding process in LocalDBComponent

* 🔧 (chroma.py): Remove unused import 'filter_complex_metadata' and add support for filtering complex metadata to prevent ChromaDB errors
  🔧 (test_chroma_vector_store_component.py): Add test cases for filtering complex metadata, preserving simple metadata types, handling single file upload scenario, fallback behavior when import fails, and handling empty and None metadata values.
* [autofix.ci] apply automated fixes

* 🐛 (test_chroma_vector_store_component.py): fix failing test due to None value being filtered out by ChromaDB metadata handling

* 🔧 (test_chroma_vector_store_component.py): refactor test method names and data to improve clarity and consistency in metadata filtering and preservation logic

* 🐛 (test_chroma_vector_store_component.py): fix missing variable assignment for error message in mocked import error to improve error handling and debugging in tests

* [autofix.ci] apply automated fixes

* 🔧 (test_chroma_vector_store_component.py): delete the file test_chroma_vector_store_component.py as it is no longer needed in the project

* [autofix.ci] apply automated fixes

* ✨ (test_chroma_vector_store_component.py): add unit tests for ChromaVectorStoreComponent including create_db, create_collection_with_data, similarity_search, mmr_search, search_with_different_types, search_with_score, duplicate_handling, chroma_collection_to_data, metadata_filtering_with_complex_data, metadata_filtering_fallback

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Edwin Jose
---
 .../components/vectorstores/chroma.py         | 10 ++-
 .../test_chroma_vector_store_component.py     | 81 +++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/src/backend/base/langflow/components/vectorstores/chroma.py b/src/backend/base/langflow/components/vectorstores/chroma.py index 31cc1fb85..b035d5f58 100644 --- a/src/backend/base/langflow/components/vectorstores/chroma.py +++ b/src/backend/base/langflow/components/vectorstores/chroma.py @@ -154,6 +154,14 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent): if documents and self.embedding is not None: self.log(f"Adding {len(documents)} documents to the Vector Store.") - vector_store.add_documents(documents) + # Filter complex metadata to prevent ChromaDB errors + try: + from langchain_community.vectorstores.utils import 
filter_complex_metadata + + filtered_documents = filter_complex_metadata(documents) + vector_store.add_documents(filtered_documents) + except ImportError: + self.log("Warning: Could not import filter_complex_metadata. Adding documents without filtering.") + vector_store.add_documents(documents) else: self.log("No documents to add to the Vector Store.") diff --git a/src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py b/src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py index 4fee4b194..ab45d6f2f 100644 --- a/src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py +++ b/src/backend/tests/unit/components/vectorstores/test_chroma_vector_store_component.py @@ -318,3 +318,84 @@ class TestChromaVectorStoreComponent(ComponentTestBaseWithoutClient): # Verify the conversion assert len(data_objects) == 0 + + def test_metadata_filtering_with_complex_data( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any] + ) -> None: + """Test that complex metadata is properly filtered and simple types are preserved.""" + from langflow.base.vectorstores.utils import chroma_collection_to_data + + # Create test data that covers the original error scenario and validation + test_data = [ + Data( + data={ + "text": "Document with mixed metadata", + "files": [], # This empty list was causing the original ChromaDB error + "tags": ["tag1", "tag2"], # Lists should be filtered out + "nested": {"key": "value"}, # Nested objects should be filtered out + "simple_string": "preserved", + "simple_int": 42, + "simple_bool": True, + "empty_string": "", # Edge case: empty but valid + "zero_value": 0, # Edge case: falsy but valid + } + ) + ] + + default_kwargs["ingest_data"] = test_data + default_kwargs["collection_name"] = "test_metadata_filtering" + + # This should not raise an error despite the complex metadata + component: ChromaVectorStoreComponent = 
component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Verify document was added successfully + collection_dict = vector_store.get() + assert len(collection_dict["documents"]) == 1 + assert "Document with mixed metadata" in collection_dict["documents"][0] + + # Verify metadata filtering: simple types preserved, complex types filtered out + data_objects = chroma_collection_to_data(collection_dict) + data_obj = data_objects[0] + + # Simple types should be preserved + assert data_obj.data["simple_string"] == "preserved" + assert data_obj.data["simple_int"] == 42 + assert data_obj.data["simple_bool"] is True + assert data_obj.data["empty_string"] == "" + assert data_obj.data["zero_value"] == 0 + + # Complex types should be filtered out + assert "files" not in data_obj.data + assert "tags" not in data_obj.data + assert "nested" not in data_obj.data + + def test_metadata_filtering_fallback( + self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any], monkeypatch + ) -> None: + """Test the fallback behavior when filter_complex_metadata import fails.""" + import builtins + + original_import = builtins.__import__ + + def mock_import(name, *args, **kwargs): + if name == "langchain_community.vectorstores.utils": + error_msg = "Mocked import error" + raise ImportError(error_msg) + return original_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", mock_import) + + # Use simple test data to avoid ChromaDB errors when filtering is unavailable + test_data = [Data(data={"text": "Simple document", "simple_field": "simple_value"})] + default_kwargs["ingest_data"] = test_data + default_kwargs["collection_name"] = "test_fallback" + + # Should work with fallback (no filtering) + component: ChromaVectorStoreComponent = component_class().set(**default_kwargs) + vector_store = component.build_vector_store() + + # Verify document was added + collection_dict = vector_store.get() + assert 
len(collection_dict["documents"]) == 1 + assert "Simple document" in collection_dict["documents"][0]