fix: Add metadata filtering for Chroma components (#9137)

* 🐛 (chroma.py, local_db.py): Handle ImportError when importing filter_complex_metadata function to avoid crashing the application
💡 (chroma.py, local_db.py): Add try-except block to gracefully handle ImportError and log a warning message if the function cannot be imported

* ♻️ (local_db.py): remove unnecessary try-except block and simplify document adding process in LocalDBComponent

* 🔧 (chroma.py): Remove unused import 'filter_complex_metadata' and add support for filtering complex metadata to prevent ChromaDB errors
🔧 (test_chroma_vector_store_component.py): Add test cases for filtering complex metadata, preserving simple metadata types, handling single file upload scenario, fallback behavior when import fails, and handling empty and None metadata values.

* [autofix.ci] apply automated fixes

* 🐛 (test_chroma_vector_store_component.py): fix failing test due to None value being filtered out by ChromaDB metadata handling

* 🔧 (test_chroma_vector_store_component.py): refactor test method names and data to improve clarity and consistency in metadata filtering and preservation logic

* 🐛 (test_chroma_vector_store_component.py): fix missing variable assignment for error message in mocked import error to improve error handling and debugging in tests

* [autofix.ci] apply automated fixes

* 🔧 (test_chroma_vector_store_component.py): delete the file test_chroma_vector_store_component.py as it is no longer needed in the project

* [autofix.ci] apply automated fixes

* (test_chroma_vector_store_component.py): add unit tests for ChromaVectorStoreComponent covering create_db, create_collection_with_data, similarity_search, mmr_search, search_with_different_types, search_with_score, duplicate_handling, chroma_collection_to_data, metadata_filtering_with_complex_data, and metadata_filtering_fallback

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
This commit is contained in:
Cristhian Zanforlin Lousa 2025-08-01 11:24:07 -03:00 committed by GitHub
commit caccdb79e2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 90 additions and 1 deletions

View file

@ -154,6 +154,14 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent):
if documents and self.embedding is not None:
self.log(f"Adding {len(documents)} documents to the Vector Store.")
vector_store.add_documents(documents)
# Filter complex metadata to prevent ChromaDB errors
try:
from langchain_community.vectorstores.utils import filter_complex_metadata
filtered_documents = filter_complex_metadata(documents)
vector_store.add_documents(filtered_documents)
except ImportError:
self.log("Warning: Could not import filter_complex_metadata. Adding documents without filtering.")
vector_store.add_documents(documents)
else:
self.log("No documents to add to the Vector Store.")

View file

@ -318,3 +318,84 @@ class TestChromaVectorStoreComponent(ComponentTestBaseWithoutClient):
# Verify the conversion
assert len(data_objects) == 0
def test_metadata_filtering_with_complex_data(
    self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any]
) -> None:
    """Ingest one record with mixed metadata and check what ends up stored.

    The record carries scalar fields (str, int, bool, including the falsy
    values ``""`` and ``0``) alongside list and dict fields. Building the
    vector store must succeed, the scalar fields must round-trip unchanged,
    and the list/dict fields must be absent from the stored data.
    """
    from langflow.base.vectorstores.utils import chroma_collection_to_data

    # Payload mirroring the original failure scenario plus validation fields.
    mixed_record = {
        "text": "Document with mixed metadata",
        "files": [],  # the empty list that triggered the original ChromaDB error
        "tags": ["tag1", "tag2"],  # list -> expected to be dropped
        "nested": {"key": "value"},  # dict -> expected to be dropped
        "simple_string": "preserved",
        "simple_int": 42,
        "simple_bool": True,
        "empty_string": "",  # falsy but still a valid scalar
        "zero_value": 0,  # falsy but still a valid scalar
    }
    default_kwargs.update(
        ingest_data=[Data(data=mixed_record)],
        collection_name="test_metadata_filtering",
    )

    # Building the store must not raise despite the non-scalar metadata.
    component: ChromaVectorStoreComponent = component_class().set(**default_kwargs)
    store = component.build_vector_store()

    # Exactly one document should have been ingested.
    snapshot = store.get()
    stored_texts = snapshot["documents"]
    assert len(stored_texts) == 1
    assert "Document with mixed metadata" in stored_texts[0]

    # Convert the raw collection back to Data and inspect the first entry.
    restored = chroma_collection_to_data(snapshot)[0].data

    # Scalar values survive untouched.
    expected_scalars = (
        ("simple_string", "preserved"),
        ("simple_int", 42),
        ("empty_string", ""),
        ("zero_value", 0),
    )
    for field, expected in expected_scalars:
        assert restored[field] == expected
    # Identity check: the stored value must be the bool True, not merely truthy.
    assert restored["simple_bool"] is True

    # List and dict values must have been removed.
    for dropped_field in ("files", "tags", "nested"):
        assert dropped_field not in restored
def test_metadata_filtering_fallback(
    self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any], monkeypatch
) -> None:
    """Check that ingestion still works when the metadata-filter import fails.

    ``builtins.__import__`` is patched so that importing
    ``langchain_community.vectorstores.utils`` raises ``ImportError``; every
    other import is delegated to the real implementation. A document with
    only simple metadata must still be added to the store.
    """
    import builtins

    blocked_module = "langchain_community.vectorstores.utils"
    real_import = builtins.__import__

    def failing_import(name, *args, **kwargs):
        # Pass everything through except the one module being simulated as missing.
        if name != blocked_module:
            return real_import(name, *args, **kwargs)
        error_msg = "Mocked import error"
        raise ImportError(error_msg)

    monkeypatch.setattr(builtins, "__import__", failing_import)

    # Only scalar metadata here, so ChromaDB accepts it without any filtering.
    plain_record = Data(data={"text": "Simple document", "simple_field": "simple_value"})
    default_kwargs.update(
        ingest_data=[plain_record],
        collection_name="test_fallback",
    )

    # Build with the import blocked; this is expected to take the unfiltered path.
    component: ChromaVectorStoreComponent = component_class().set(**default_kwargs)
    store = component.build_vector_store()

    # The single document must be present in the collection.
    snapshot = store.get()
    stored_texts = snapshot["documents"]
    assert len(stored_texts) == 1
    assert "Simple document" in stored_texts[0]