From 46966d164daf89e45d9697d23614f972d8dfc033 Mon Sep 17 00:00:00 2001 From: Jordan Frazier <122494242+jordanrfrazier@users.noreply.github.com> Date: Wed, 10 Jul 2024 02:32:34 -0700 Subject: [PATCH] tests: fix up the astra integ tests and add vectorize tests (#2616) * fix up the astra integ tests and add vectorize tests * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- .../base/langflow/base/vectorstores/model.py | 2 +- .../components/embeddings/AstraVectorize.py | 6 +- .../components/embeddings/__init__.py | 2 + .../integration/astra/test_astra_component.py | 157 ++++++++++++++++-- 4 files changed, 151 insertions(+), 16 deletions(-) diff --git a/src/backend/base/langflow/base/vectorstores/model.py b/src/backend/base/langflow/base/vectorstores/model.py index a00b56e99..ef48a1d35 100644 --- a/src/backend/base/langflow/base/vectorstores/model.py +++ b/src/backend/base/langflow/base/vectorstores/model.py @@ -88,7 +88,7 @@ class LCVectorStoreComponent(Component): def search_documents(self) -> List[Data]: """ - Search for documents in the Chroma vector store. + Search for documents in the vector store. """ search_query: str = self.search_query if not search_query: diff --git a/src/backend/base/langflow/components/embeddings/AstraVectorize.py b/src/backend/base/langflow/components/embeddings/AstraVectorize.py index 4c8914665..b68159cc9 100644 --- a/src/backend/base/langflow/components/embeddings/AstraVectorize.py +++ b/src/backend/base/langflow/components/embeddings/AstraVectorize.py @@ -4,7 +4,7 @@ from langflow.inputs.inputs import DictInput, SecretStrInput, MessageTextInput, from langflow.template.field.base import Output -class AstraVectorize(Component): +class AstraVectorizeComponent(Component): display_name: str = "Astra Vectorize" description: str = "Configuration options for Astra Vectorize server-side embeddings." documentation: str = "https://docs.datastax.com/en/astra-db-serverless/databases/embedding-generation.html" @@ -92,7 +92,7 @@ class AstraVectorize(Component): def build_options(self) -> dict[str, Any]: provider_value = self.VECTORIZE_PROVIDERS_MAPPING[self.provider][0] - authentication = {**self.authentication} + authentication = {**(self.authentication or {})} api_key_name = self.api_key_name if api_key_name: authentication["providerKey"] = api_key_name @@ -102,7 +102,7 @@ class AstraVectorize(Component): "provider": provider_value, "modelName": self.model_name, "authentication": authentication, - "parameters": self.model_parameters, + "parameters": self.model_parameters or {}, }, "collection_embedding_api_key": self.provider_api_key, } diff --git a/src/backend/base/langflow/components/embeddings/__init__.py b/src/backend/base/langflow/components/embeddings/__init__.py index a55a13ffe..79f8173d1 100644 --- a/src/backend/base/langflow/components/embeddings/__init__.py +++ b/src/backend/base/langflow/components/embeddings/__init__.py @@ -1,4 +1,5 @@ from .AmazonBedrockEmbeddings import AmazonBedrockEmbeddingsComponent +from .AstraVectorize import AstraVectorizeComponent from .AzureOpenAIEmbeddings import AzureOpenAIEmbeddingsComponent from .CohereEmbeddings import CohereEmbeddingsComponent from .HuggingFaceEmbeddings import HuggingFaceEmbeddingsComponent @@ -9,6 +10,7 @@ from .VertexAIEmbeddings import VertexAIEmbeddingsComponent __all__ = [ "AmazonBedrockEmbeddingsComponent", + "AstraVectorizeComponent", "AzureOpenAIEmbeddingsComponent", "CohereEmbeddingsComponent", "HuggingFaceEmbeddingsComponent", diff --git a/tests/integration/astra/test_astra_component.py b/tests/integration/astra/test_astra_component.py index e8982c797..8dd7c6965 100644 --- a/tests/integration/astra/test_astra_component.py +++ b/tests/integration/astra/test_astra_component.py @@ -1,5 +1,7 @@ import os +from langchain_astradb import AstraDBVectorStore, CollectionVectorServiceOptions +from langflow.components.embeddings.AstraVectorize import AstraVectorizeComponent import pytest from integration.utils import MockEmbeddings, check_env_vars from langchain_core.documents import Document @@ -11,7 +13,10 @@ from langflow.schema.data import Data COLLECTION = "test_basic" SEARCH_COLLECTION = "test_search" -MEMORY_COLLECTION = "test_memory" +# MEMORY_COLLECTION = "test_memory" +VECTORIZE_COLLECTION = "test_vectorize" +VECTORIZE_COLLECTION_OPENAI = "test_vectorize_openai" +VECTORIZE_COLLECTION_OPENAI_WITH_AUTH = "test_vectorize_openai_auth" @pytest.fixture() @@ -77,24 +82,152 @@ def test_astra_embeds_and_search(astra_fixture): api_endpoint=api_endpoint, collection_name=SEARCH_COLLECTION, embedding=embedding, - inputs=records, - add_to_vector_store=True, - ) - component.build_vector_store() - - component.build( - token=application_token, - api_endpoint=api_endpoint, - collection_name=SEARCH_COLLECTION, - embedding=embedding, - input_value="test1", + ingest_data=records, + search_input="test1", number_of_results=1, ) + component.build_vector_store() records = component.search_documents() assert len(records) == 1 +@pytest.mark.skipif( + not check_env_vars("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT"), + reason="missing astra env vars", +) +def test_astra_vectorize(): + store = None + try: + options = {"provider": "nvidia", "modelName": "NV-Embed-QA", "parameters": {}, "authentication": {}} + store = AstraDBVectorStore( + collection_name=VECTORIZE_COLLECTION, + api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"), + token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"), + collection_vector_service_options=CollectionVectorServiceOptions.from_dict(options), + ) + + application_token = os.getenv("ASTRA_DB_APPLICATION_TOKEN") + api_endpoint = os.getenv("ASTRA_DB_API_ENDPOINT") + + documents = [Document(page_content="test1"), Document(page_content="test2")] + records = [Data.from_document(d) for d in documents] + + vectorize = AstraVectorizeComponent() + vectorize.build(provider="NVIDIA", model_name="NV-Embed-QA") + vectorize_options = vectorize.build_options() + + component = AstraVectorStoreComponent() + component.build( + token=application_token, + api_endpoint=api_endpoint, + collection_name=VECTORIZE_COLLECTION, + ingest_data=records, + embedding=vectorize_options, + search_input="test", + number_of_results=2, + ) + component.build_vector_store() + records = component.search_documents() + + assert len(records) == 2 + finally: + if store is not None: + store.delete_collection() + + +@pytest.mark.skipif( + not check_env_vars("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT", "OPENAI_API_KEY"), + reason="missing env vars", +) +def test_astra_vectorize_with_provider_api_key(): + """tests vectorize using an openai api key""" + store = None + try: + application_token = os.getenv("ASTRA_DB_APPLICATION_TOKEN") + api_endpoint = os.getenv("ASTRA_DB_API_ENDPOINT") + options = {"provider": "openai", "modelName": "text-embedding-3-small", "parameters": {}, "authentication": {}} + store = AstraDBVectorStore( + collection_name=VECTORIZE_COLLECTION_OPENAI, + api_endpoint=api_endpoint, + token=application_token, + collection_vector_service_options=CollectionVectorServiceOptions.from_dict(options), + collection_embedding_api_key=os.getenv("OPENAI_API_KEY"), + ) + documents = [Document(page_content="test1"), Document(page_content="test2")] + records = [Data.from_document(d) for d in documents] + + vectorize = AstraVectorizeComponent() + vectorize.build( + provider="OpenAI", model_name="text-embedding-3-small", provider_api_key=os.getenv("OPENAI_API_KEY") + ) + vectorize_options = vectorize.build_options() + + component = AstraVectorStoreComponent() + component.build( + token=application_token, + api_endpoint=api_endpoint, + collection_name=VECTORIZE_COLLECTION_OPENAI, + ingest_data=records, + embedding=vectorize_options, + search_input="test", + ) + component.build_vector_store() + records = component.search_documents() + assert len(records) == 2 + finally: + if store is not None: + store.delete_collection() + + +@pytest.mark.skipif( + not check_env_vars("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT", "OPENAI_API_KEY"), + reason="missing env vars", +) +def test_astra_vectorize_passes_authentication(): + """tests vectorize using the authentication parameter""" + store = None + try: + application_token = os.getenv("ASTRA_DB_APPLICATION_TOKEN") + api_endpoint = os.getenv("ASTRA_DB_API_ENDPOINT") + options = { + "provider": "openai", + "modelName": "text-embedding-3-small", + "parameters": {}, + "authentication": {"providerKey": "providerKey"}, + } + store = AstraDBVectorStore( + collection_name=VECTORIZE_COLLECTION_OPENAI_WITH_AUTH, + api_endpoint=api_endpoint, + token=application_token, + collection_vector_service_options=CollectionVectorServiceOptions.from_dict(options), + ) + documents = [Document(page_content="test1"), Document(page_content="test2")] + records = [Data.from_document(d) for d in documents] + + vectorize = AstraVectorizeComponent() + vectorize.build( + provider="OpenAI", model_name="text-embedding-3-small", authentication={"providerKey": "providerKey"} + ) + vectorize_options = vectorize.build_options() + + component = AstraVectorStoreComponent() + component.build( + token=application_token, + api_endpoint=api_endpoint, + collection_name=VECTORIZE_COLLECTION_OPENAI_WITH_AUTH, + ingest_data=records, + embedding=vectorize_options, + search_input="test", + ) + component.build_vector_store() + records = component.search_documents() + assert len(records) == 2 + finally: + if store is not None: + store.delete_collection() + + # @pytest.mark.skipif( # not check_env_vars("ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_API_ENDPOINT"), # reason="missing astra env vars",