From f9a7c9bcef9df06e16e97499a3e581c6148a5573 Mon Sep 17 00:00:00 2001 From: Gustavo Costa Date: Thu, 3 Apr 2025 16:03:17 -0300 Subject: [PATCH] Fix: add insert mode MongoDB (#7394) * add dropdown insert mode, create method __insert_mode * fix unit_test mongodb * add info to index_name * to overwrite, delete_many from collection * create verify_search_index * fix SIMILARITY_OPTIONS * fix documentation components-vector-stores.md --- .../Components/components-vector-stores.md | 26 +++-- .../components/vectorstores/mongodb_atlas.py | 104 ++++++++++++++++-- .../vectorstores/test_mongodb_atlas.py | 64 +++++------ 3 files changed, 136 insertions(+), 58 deletions(-) diff --git a/docs/docs/Components/components-vector-stores.md b/docs/docs/Components/components-vector-stores.md index fba582188..2d9a7c8dc 100644 --- a/docs/docs/Components/components-vector-stores.md +++ b/docs/docs/Components/components-vector-stores.md @@ -430,16 +430,22 @@ For more information, see the [MongoDB Atlas documentation](https://www.mongodb. 
### Inputs -| Name | Type | Description | -| ------------------------ | ------------ | ----------------------------------------- | -| mongodb_atlas_cluster_uri | SecretString | MongoDB Atlas Cluster URI | -| db_name | String | Database name | -| collection_name | String | Collection name | -| index_name | String | Index name | -| search_query | String | Query for similarity search | -| ingest_data | Data | Data to be ingested into the vector store | -| embedding | Embeddings | Embedding function to use | -| number_of_results | Integer | Number of results to return in search | +| Name | Type | Description | +| ------------------------- | ------------ | ----------------------------------------- | +| mongodb_atlas_cluster_uri | SecretString | The connection URI for your MongoDB Atlas cluster (required) | +| enable_mtls | Boolean | Enable mutual TLS authentication (default: false) | +| mongodb_atlas_client_cert | SecretString | Client certificate combined with private key for mTLS authentication (required if mTLS is enabled) | +| db_name | String | The name of the database to use (required) | +| collection_name | String | The name of the collection to use (required) | +| index_name | String | The name of the Atlas Search index, it should be a Vector Search (required) | +| insert_mode | String | How to insert new documents into the collection (options: "append", "overwrite", default: "append") | +| embedding | Embeddings | The embedding model to use | +| number_of_results | Integer | Number of results to return in similarity search (default: 4) | +| index_field | String | The field to index (default: "embedding") | +| filter_field | String | The field to filter the index | +| number_dimensions | Integer | Embedding context length (default: 1536) | +| similarity | String | The method used to measure similarity between vectors (options: "cosine", "euclidean", "dotProduct", default: "cosine") | +| quantization | String | Quantization reduces memory costs by converting 
32-bit floats to smaller data types (options: "scalar", "binary") | ### Outputs diff --git a/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py b/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py index 532590a44..d71a8ddb8 100644 --- a/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py +++ b/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py @@ -1,11 +1,14 @@ import tempfile +import time import certifi from langchain_community.vectorstores import MongoDBAtlasVectorSearch +from pymongo.collection import Collection +from pymongo.operations import SearchIndexModel from langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store from langflow.helpers.data import docs_to_data -from langflow.io import BoolInput, HandleInput, IntInput, SecretStrInput, StrInput +from langflow.io import BoolInput, DropdownInput, HandleInput, IntInput, SecretStrInput, StrInput from langflow.schema import Data @@ -14,7 +17,9 @@ class MongoVectorStoreComponent(LCVectorStoreComponent): description = "MongoDB Atlas Vector Store with search capabilities" name = "MongoDBAtlasVector" icon = "MongoDB" - + INSERT_MODES = ["append", "overwrite"] + SIMILARITY_OPTIONS = ["cosine", "euclidean", "dotProduct"] + QUANTIZATION_OPTIONS = ["scalar", "binary"] inputs = [ SecretStrInput(name="mongodb_atlas_cluster_uri", display_name="MongoDB Atlas Cluster URI", required=True), BoolInput(name="enable_mtls", display_name="Enable mTLS", value=False, advanced=True, required=True), @@ -28,8 +33,21 @@ class MongoVectorStoreComponent(LCVectorStoreComponent): ), StrInput(name="db_name", display_name="Database Name", required=True), StrInput(name="collection_name", display_name="Collection Name", required=True), - StrInput(name="index_name", display_name="Index Name", required=True), + StrInput( + name="index_name", + display_name="Index Name", + required=True, + info="The name of Atlas Search index, it should be a Vector 
Search.", + ), *LCVectorStoreComponent.inputs, + DropdownInput( + name="insert_mode", + display_name="Insert Mode", + options=INSERT_MODES, + value=INSERT_MODES[0], + info="How to insert new documents into the collection.", + advanced=True, + ), HandleInput(name="embedding", display_name="Embedding", input_types=["Embeddings"]), IntInput( name="number_of_results", @@ -38,6 +56,41 @@ class MongoVectorStoreComponent(LCVectorStoreComponent): value=4, advanced=True, ), + StrInput( + name="index_field", + display_name="Index Field", + advanced=True, + required=True, + info="The field to index.", + value="embedding", + ), + StrInput( + name="filter_field", display_name="Filter Field", advanced=True, info="The field to filter the index." + ), + IntInput( + name="number_dimensions", + display_name="Number of Dimensions", + info="Embedding Context Length.", + value=1536, + advanced=True, + required=True, + ), + DropdownInput( + name="similarity", + display_name="Similarity", + options=SIMILARITY_OPTIONS, + value=SIMILARITY_OPTIONS[0], + info="The method used to measure the similarity between vectors.", + advanced=True, + ), + DropdownInput( + name="quantization", + display_name="Quantization", + options=QUANTIZATION_OPTIONS, + value=None, + info="Quantization reduces memory costs converting 32-bit floats to smaller data types", + advanced=True, + ), ] @check_cached_vector_store @@ -96,21 +149,20 @@ class MongoVectorStoreComponent(LCVectorStoreComponent): documents.append(_input) if documents: - collection.drop() # Drop collection to override the vector store + self.__insert_mode(collection) + return MongoDBAtlasVectorSearch.from_documents( documents=documents, embedding=self.embedding, collection=collection, index_name=self.index_name ) - return MongoDBAtlasVectorSearch( - embedding=self.embedding, - collection=collection, - index_name=self.index_name, - ) + return MongoDBAtlasVectorSearch(embedding=self.embedding, collection=collection, index_name=self.index_name) def 
search_documents(self) -> list[Data]: from bson.objectid import ObjectId vector_store = self.build_vector_store() + self.verify_search_index(vector_store._collection) + if self.search_query and isinstance(self.search_query, str): docs = vector_store.similarity_search( query=self.search_query, @@ -125,3 +177,37 @@ class MongoVectorStoreComponent(LCVectorStoreComponent): self.status = data return data return [] + + def __insert_mode(self, collection: Collection) -> None: + if self.insert_mode == "overwrite": + collection.delete_many({}) # Delete all documents while preserving collection structure + + def verify_search_index(self, collection: Collection) -> None: + """Verify if the search index exists, if not, create it. + + Args: + collection (Collection): The collection to verify the search index on. + """ + indexes = collection.list_search_indexes() + + index_names_types = {idx["name"]: idx["type"] for idx in indexes} + index_names = list(index_names_types.keys()) + index_type = index_names_types.get(self.index_name) + if self.index_name not in index_names and index_type != "vectorSearch": + collection.create_search_index(self.__create_index_definition()) + + time.sleep(20) # Give some time for index to be ready + + def __create_index_definition(self) -> SearchIndexModel: + fields = [ + { + "type": "vector", + "path": self.index_field, + "numDimensions": self.number_dimensions, + "similarity": self.similarity, + "quantization": self.quantization, + } + ] + if self.filter_field: + fields.append({"type": "filter", "path": self.filter_field}) + return SearchIndexModel(definition={"fields": fields}, name=self.index_name, type="vectorSearch") diff --git a/src/backend/tests/unit/components/vectorstores/test_mongodb_atlas.py b/src/backend/tests/unit/components/vectorstores/test_mongodb_atlas.py index a665f0993..1a7f32108 100644 --- a/src/backend/tests/unit/components/vectorstores/test_mongodb_atlas.py +++ 
b/src/backend/tests/unit/components/vectorstores/test_mongodb_atlas.py @@ -1,12 +1,12 @@ import os +import time from typing import Any import pytest from langchain_community.embeddings.fake import DeterministicFakeEmbedding -from langchain_community.vectorstores import MongoDBAtlasVectorSearch from langflow.components.vectorstores.mongodb_atlas import MongoVectorStoreComponent from langflow.schema.data import Data -from pymongo.operations import SearchIndexModel +from pymongo.collection import Collection from tests.base import ComponentTestBaseWithoutClient, VersionComponentMapping @@ -30,6 +30,13 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): "index_name": "test_index", "enable_mtls": False, "embedding": DeterministicFakeEmbedding(size=8), + "index_field": "embedding", + "filter_field": "text", + "number_dimensions": 8, + "similarity": "cosine", + "quantization": "scalar", + "insert_mode": "append", + "ingest_data": [Data(data={"text": "test data 1"}), Data(data={"text": "test data 2"})], } @pytest.fixture @@ -41,42 +48,18 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): {"version": "1.1.1", "module": "vectorstores", "file_name": "mongodb_atlas"}, ] - def __create_search_index(self, vector_store: MongoDBAtlasVectorSearch, default_kwargs: dict[str, Any]) -> None: + def __create_search_index( + self, component_class: type[MongoVectorStoreComponent], collection: Collection, default_kwargs: dict[str, Any] + ) -> None: """Create a vector search index if it doesn't exist.""" - try: - index_definition = SearchIndexModel( - definition={ - "fields": [ - { - "type": "vector", - "path": "embedding", - "numDimensions": 8, - "similarity": "cosine", - "quantization": "scalar", - }, - {"type": "filter", "path": "text"}, - ] - }, - name=default_kwargs["index_name"], - type="vectorSearch", - ) + component_class().set(**default_kwargs).verify_search_index(collection) - vector_store._collection.create_search_index(index_definition) - - 
# Wait for index to be ready - import time - - time.sleep(40) # Give some time for index to be ready - - # Verify index was created - indexes = vector_store._collection.list_search_indexes() - index_names = [idx["name"] for idx in indexes] - assert default_kwargs["index_name"] in index_names - - except Exception as e: - # Index might already exist, which is fine - if "AlreadyExists" not in str(e): - raise + # Verify index was created + indexes = collection.list_search_indexes() + index_names = {idx["name"]: idx["type"] for idx in indexes} + index_type = index_names.get(default_kwargs["index_name"]) + assert default_kwargs["index_name"] in index_names + assert index_type == "vectorSearch" def test_create_db(self, component_class: type[MongoVectorStoreComponent], default_kwargs: dict[str, Any]) -> None: """Test creating a MongoDB Atlas vector store.""" @@ -93,6 +76,7 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): """Test creating a collection with data.""" test_texts = ["test data 1", "test data 2", "something completely different"] default_kwargs["ingest_data"] = [Data(data={"text": text}) for text in test_texts] + default_kwargs["insert_mode"] = "overwrite" component: MongoVectorStoreComponent = component_class().set(**default_kwargs) vector_store = component.build_vector_store() @@ -115,6 +99,7 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): ] default_kwargs["ingest_data"] = [Data(data={"text": text, "metadata": {}}) for text in test_data] default_kwargs["number_of_results"] = 2 + default_kwargs["insert_mode"] = "overwrite" # Create and initialize the component component: MongoVectorStoreComponent = component_class().set(**default_kwargs) @@ -131,7 +116,7 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): assert isinstance(doc["embedding"], list) assert len(doc["embedding"]) == 8 # Should match our embedding size - self.__create_search_index(vector_store, default_kwargs) + 
self.__create_search_index(component_class, vector_store._collection, default_kwargs) # Verify index was created indexes = vector_store._collection.list_search_indexes() @@ -141,6 +126,7 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): # Test similarity search through the component component.set(search_query="dog") results = component.search_documents() + time.sleep(5) # wait the results come from API assert len(results) == 2, "Expected 2 results for 'lazy dog' query" # The most relevant results should be about dogs @@ -168,8 +154,8 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): self, component_class: type[MongoVectorStoreComponent], default_kwargs: dict[str, Any] ) -> None: """Test search with empty query.""" + default_kwargs["insert_mode"] = "overwrite" component: MongoVectorStoreComponent = component_class().set(**default_kwargs) - component.build_vector_store() # Test with empty search query component.set(search_query="") @@ -191,7 +177,7 @@ class TestMongoVectorStoreComponent(ComponentTestBaseWithoutClient): component: MongoVectorStoreComponent = component_class().set(**default_kwargs) vector_store = component.build_vector_store() - self.__create_search_index(vector_store, default_kwargs) + self.__create_search_index(component_class, vector_store._collection, default_kwargs) # Test search and verify metadata is preserved component.set(search_query="Document", number_of_results=2)