fix: Add metadata filtering for Chroma components (#9137)
* 🐛 (chroma.py, local_db.py): Handle ImportError when importing filter_complex_metadata function to avoid crashing the application 💡 (chroma.py, local_db.py): Add try-except block to gracefully handle ImportError and log a warning message if the function cannot be imported
* ♻️ (local_db.py): remove unnecessary try-except block and simplify document adding process in LocalDBComponent
* 🔧 (chroma.py): Remove unused import 'filter_complex_metadata' and add support for filtering complex metadata to prevent ChromaDB errors 🔧 (test_chroma_vector_store_component.py): Add test cases for filtering complex metadata, preserving simple metadata types, handling single file upload scenario, fallback behavior when import fails, and handling empty and None metadata values.
* [autofix.ci] apply automated fixes
* 🐛 (test_chroma_vector_store_component.py): fix failing test due to None value being filtered out by ChromaDB metadata handling
* 🔧 (test_chroma_vector_store_component.py): refactor test method names and data to improve clarity and consistency in metadata filtering and preservation logic
* 🐛 (test_chroma_vector_store_component.py): fix missing variable assignment for error message in mocked import error to improve error handling and debugging in tests
* [autofix.ci] apply automated fixes
* 🔧 (test_chroma_vector_store_component.py): delete the file test_chroma_vector_store_component.py as it is no longer needed in the project
* [autofix.ci] apply automated fixes
* ✨ (test_chroma_vector_store_component.py): add unit tests for ChromaVectorStoreComponent including create_db, create_collection_with_data, similarity_search, mmr_search, search_with_different_types, search_with_score, duplicate_handling, chroma_collection_to_data, metadata_filtering_with_complex_data, metadata_filtering_fallback
---------
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Edwin Jose <edwin.jose@datastax.com>
This commit is contained in:
parent
73225f4d81
commit
caccdb79e2
2 changed files with 90 additions and 1 deletions
|
|
@@ -154,6 +154,14 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent):
|
|||
|
||||
if documents and self.embedding is not None:
|
||||
self.log(f"Adding {len(documents)} documents to the Vector Store.")
|
||||
vector_store.add_documents(documents)
|
||||
# Filter complex metadata to prevent ChromaDB errors
|
||||
try:
|
||||
from langchain_community.vectorstores.utils import filter_complex_metadata
|
||||
|
||||
filtered_documents = filter_complex_metadata(documents)
|
||||
vector_store.add_documents(filtered_documents)
|
||||
except ImportError:
|
||||
self.log("Warning: Could not import filter_complex_metadata. Adding documents without filtering.")
|
||||
vector_store.add_documents(documents)
|
||||
else:
|
||||
self.log("No documents to add to the Vector Store.")
|
||||
|
|
|
|||
|
|
@@ -318,3 +318,84 @@ class TestChromaVectorStoreComponent(ComponentTestBaseWithoutClient):
|
|||
|
||||
# Verify the conversion
|
||||
assert len(data_objects) == 0
|
||||
|
||||
def test_metadata_filtering_with_complex_data(
    self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any]
) -> None:
    """Ingest one record mixing scalar and container metadata and check what is stored.

    Scalar values (including the falsy ones) are expected to come back unchanged,
    while list and dict values are expected to be absent from the stored metadata.
    """
    from langflow.base.vectorstores.utils import chroma_collection_to_data

    # Container-valued fields that should be filtered out before storage. The
    # empty "files" list is the value that caused the original ChromaDB error.
    container_fields = {
        "files": [],
        "tags": ["tag1", "tag2"],
        "nested": {"key": "value"},
    }
    # Scalar fields that should survive; "" and 0 are falsy-but-valid edge cases.
    scalar_fields = {
        "simple_string": "preserved",
        "simple_int": 42,
        "simple_bool": True,
        "empty_string": "",
        "zero_value": 0,
    }
    record = Data(data={"text": "Document with mixed metadata", **container_fields, **scalar_fields})

    default_kwargs["ingest_data"] = [record]
    default_kwargs["collection_name"] = "test_metadata_filtering"

    # Building the store must not raise even though the record has complex metadata.
    component: ChromaVectorStoreComponent = component_class().set(**default_kwargs)
    stored = component.build_vector_store().get()

    # Exactly one document made it into the collection, with the expected text.
    stored_texts = stored["documents"]
    assert len(stored_texts) == 1
    assert "Document with mixed metadata" in stored_texts[0]

    round_tripped = chroma_collection_to_data(stored)[0].data

    # Scalars are compared by equality, except the boolean which must be the
    # actual ``True`` singleton rather than merely a truthy/equal value.
    for key in ("simple_string", "simple_int", "empty_string", "zero_value"):
        assert round_tripped[key] == scalar_fields[key]
    assert round_tripped["simple_bool"] is True

    # None of the container-valued keys may be present after filtering.
    for key in container_fields:
        assert key not in round_tripped
|
||||
|
||||
def test_metadata_filtering_fallback(
    self, component_class: type[ChromaVectorStoreComponent], default_kwargs: dict[str, Any], monkeypatch
) -> None:
    """Test the fallback behavior when filter_complex_metadata import fails.

    ``builtins.__import__`` is patched so that importing
    ``langchain_community.vectorstores.utils`` raises ``ImportError``; every other
    import is delegated to the real implementation. The document must still be
    stored, and the blocked import must actually have been attempted; otherwise
    the test would pass without ever reaching the fallback branch.
    """
    import builtins

    original_import = builtins.__import__
    blocked_module = "langchain_community.vectorstores.utils"
    # Records each time the blocked module is requested, so the test can prove
    # the ImportError path was really taken.
    blocked_attempts: list[str] = []

    def mock_import(name, *args, **kwargs):
        if name == blocked_module:
            blocked_attempts.append(name)
            error_msg = "Mocked import error"
            raise ImportError(error_msg)
        return original_import(name, *args, **kwargs)

    monkeypatch.setattr(builtins, "__import__", mock_import)

    # Use simple test data to avoid ChromaDB errors when filtering is unavailable
    test_data = [Data(data={"text": "Simple document", "simple_field": "simple_value"})]
    default_kwargs["ingest_data"] = test_data
    default_kwargs["collection_name"] = "test_fallback"

    # Should work with fallback (no filtering)
    component: ChromaVectorStoreComponent = component_class().set(**default_kwargs)
    vector_store = component.build_vector_store()

    # Guard against a vacuous pass: the mocked ImportError must have fired.
    assert blocked_attempts, "filter_complex_metadata import was never attempted; fallback path not exercised"

    # Verify document was added
    collection_dict = vector_store.get()
    assert len(collection_dict["documents"]) == 1
    assert "Simple document" in collection_dict["documents"][0]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue