From 86502232a1aee23cc6fe0a136fe5a3cf45697a49 Mon Sep 17 00:00:00 2001 From: Rodrigo Nader Date: Mon, 17 Mar 2025 16:05:12 -0300 Subject: [PATCH] feat: add dataframe support for vector stores (#6990) * feat: add dataframe support for vector stores * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * [autofix.ci] apply automated fixes * fix: remove import * fix: ruff error * fix: mypy errors * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Edwin Jose Co-authored-by: italojohnny Co-authored-by: cristhianzl --- .../base/langflow/base/vectorstores/model.py | 26 ++++++++++++++++--- .../components/vectorstores/astradb.py | 2 ++ .../components/vectorstores/astradb_graph.py | 2 ++ .../components/vectorstores/cassandra.py | 5 +++- .../vectorstores/cassandra_graph.py | 3 +++ .../components/vectorstores/chroma.py | 10 ++++--- .../components/vectorstores/clickhouse.py | 3 +++ .../components/vectorstores/couchbase.py | 2 ++ .../components/vectorstores/elasticsearch.py | 2 ++ .../langflow/components/vectorstores/faiss.py | 3 +++ .../langflow/components/vectorstores/hcd.py | 3 +++ .../components/vectorstores/milvus.py | 3 +++ .../components/vectorstores/mongodb_atlas.py | 3 +++ .../components/vectorstores/opensearch.py | 3 +++ .../components/vectorstores/pgvector.py | 3 +++ .../components/vectorstores/pinecone.py | 4 +++ .../components/vectorstores/qdrant.py | 5 +++- .../langflow/components/vectorstores/redis.py | 4 ++- .../components/vectorstores/supabase.py | 3 +++ .../components/vectorstores/upstash.py | 3 +++ .../components/vectorstores/vectara.py | 10 ++++--- .../components/vectorstores/weaviate.py | 3 +++ .../starter_projects/Diet Analysis.json | 2 +- .../starter_projects/Financial Agent.json | 6 ++--- .../Graph Vector Store RAG.json | 19 ++++++++------ .../starter_projects/Vector Store RAG.json | 19 ++++++++------ src/backend/base/langflow/schema/dataframe.py | 10 +++++++ 27 files changed, 129 insertions(+), 32 deletions(-) diff --git a/src/backend/base/langflow/base/vectorstores/model.py b/src/backend/base/langflow/base/vectorstores/model.py index c203aad01..7505eaa93 100644 --- a/src/backend/base/langflow/base/vectorstores/model.py +++ b/src/backend/base/langflow/base/vectorstores/model.py @@ -1,12 +1,12 @@ from abc import abstractmethod from functools import wraps -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from langflow.custom import Component from langflow.field_typing import Text, VectorStore from langflow.helpers.data import docs_to_data from langflow.inputs.inputs import BoolInput -from langflow.io import DataInput, MultilineInput, Output +from langflow.io import HandleInput, MultilineInput, Output from langflow.schema import Data, DataFrame if TYPE_CHECKING: @@ -56,9 +56,11 @@ class LCVectorStoreComponent(Component): trace_type = "retriever" inputs = [ - DataInput( + HandleInput( name="ingest_data", display_name="Ingest Data", + input_types=["Data", "DataFrame"], + is_list=True, ), MultilineInput( name="search_query", @@ -99,6 +101,24 @@ class LCVectorStoreComponent(Component): msg = f"Method '{method_name}' must be defined." raise ValueError(msg) + def _prepare_ingest_data(self) -> list[Any]: + """Prepares ingest_data by converting DataFrame to Data if needed.""" + ingest_data: list | Data | DataFrame = self.ingest_data + if not ingest_data: + return [ingest_data] + + if not isinstance(ingest_data, list): + ingest_data = [ingest_data] + + result = [] + + for _input in ingest_data: + if isinstance(_input, DataFrame): + result.extend(_input.to_data_list()) + else: + result.append(_input) + return result + def search_with_vector_store( self, input_value: Text, diff --git a/src/backend/base/langflow/components/vectorstores/astradb.py b/src/backend/base/langflow/components/vectorstores/astradb.py index d160046d7..2c68da3ea 100644 --- a/src/backend/base/langflow/components/vectorstores/astradb.py +++ b/src/backend/base/langflow/components/vectorstores/astradb.py @@ -995,6 +995,8 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent): return vector_store def _add_documents_to_vector_store(self, vector_store) -> None: + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/astradb_graph.py b/src/backend/base/langflow/components/vectorstores/astradb_graph.py index 7f279a833..0d027e652 100644 --- a/src/backend/base/langflow/components/vectorstores/astradb_graph.py +++ b/src/backend/base/langflow/components/vectorstores/astradb_graph.py @@ -226,6 +226,8 @@ class AstraDBGraphVectorStoreComponent(LCVectorStoreComponent): return vector_store def _add_documents_to_vector_store(self, vector_store) -> None: + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/cassandra.py b/src/backend/base/langflow/components/vectorstores/cassandra.py index 1f13abdda..6414432d5 100644 --- a/src/backend/base/langflow/components/vectorstores/cassandra.py +++ b/src/backend/base/langflow/components/vectorstores/cassandra.py @@ -158,8 +158,11 @@ class CassandraVectorStoreComponent(LCVectorStoreComponent): password=self.token, cluster_kwargs=self.cluster_kwargs, ) - documents = [] + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): documents.append(_input.to_lc_document()) diff --git a/src/backend/base/langflow/components/vectorstores/cassandra_graph.py b/src/backend/base/langflow/components/vectorstores/cassandra_graph.py index 1216a90f0..418ae3261 100644 --- a/src/backend/base/langflow/components/vectorstores/cassandra_graph.py +++ b/src/backend/base/langflow/components/vectorstores/cassandra_graph.py @@ -144,6 +144,9 @@ class CassandraGraphVectorStoreComponent(LCVectorStoreComponent): password=self.token, cluster_kwargs=self.cluster_kwargs, ) + + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: diff --git a/src/backend/base/langflow/components/vectorstores/chroma.py b/src/backend/base/langflow/components/vectorstores/chroma.py index 970b53530..e4f5711a0 100644 --- a/src/backend/base/langflow/components/vectorstores/chroma.py +++ b/src/backend/base/langflow/components/vectorstores/chroma.py @@ -7,7 +7,7 @@ from typing_extensions import override from langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store from langflow.base.vectorstores.utils import chroma_collection_to_data from langflow.io import BoolInput, DropdownInput, HandleInput, IntInput, StrInput -from langflow.schema import Data +from langflow.schema import Data, DataFrame class ChromaVectorStoreComponent(LCVectorStoreComponent): @@ -122,10 +122,14 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent): def _add_documents_to_vector_store(self, vector_store: "Chroma") -> None: """Adds documents to the Vector Store.""" - if not self.ingest_data: + ingest_data: list | Data | DataFrame = self.ingest_data + if not ingest_data: self.status = "" return + # Convert DataFrame to Data if needed using parent's method + ingest_data = self._prepare_ingest_data() + stored_documents_without_id = [] if self.allow_duplicates: stored_data = [] @@ -136,7 +140,7 @@ class ChromaVectorStoreComponent(LCVectorStoreComponent): stored_documents_without_id.append(value) documents = [] - for _input in self.ingest_data or []: + for _input in ingest_data or []: if isinstance(_input, Data): if _input not in stored_documents_without_id: documents.append(_input.to_lc_document()) diff --git a/src/backend/base/langflow/components/vectorstores/clickhouse.py b/src/backend/base/langflow/components/vectorstores/clickhouse.py index f528d1955..42c43e700 100644 --- a/src/backend/base/langflow/components/vectorstores/clickhouse.py +++ b/src/backend/base/langflow/components/vectorstores/clickhouse.py @@ -83,6 +83,9 @@ class ClickhouseVectorStoreComponent(LCVectorStoreComponent): msg = f"Failed to connect to Clickhouse: {e}" raise ValueError(msg) from e + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/couchbase.py b/src/backend/base/langflow/components/vectorstores/couchbase.py index fce34743f..c831dcd70 100644 --- a/src/backend/base/langflow/components/vectorstores/couchbase.py +++ b/src/backend/base/langflow/components/vectorstores/couchbase.py @@ -55,6 +55,8 @@ class CouchbaseVectorStoreComponent(LCVectorStoreComponent): msg = f"Failed to connect to Couchbase: {e}" raise ValueError(msg) from e + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/elasticsearch.py b/src/backend/base/langflow/components/vectorstores/elasticsearch.py index b4627c117..85e13fd23 100644 --- a/src/backend/base/langflow/components/vectorstores/elasticsearch.py +++ b/src/backend/base/langflow/components/vectorstores/elasticsearch.py @@ -136,6 +136,8 @@ class ElasticsearchVectorStoreComponent(LCVectorStoreComponent): def _prepare_documents(self) -> list[Document]: """Prepares documents from the input data to add to the vector store.""" + self.ingest_data = self._prepare_ingest_data() + documents = [] for data in self.ingest_data: if isinstance(data, Data): diff --git a/src/backend/base/langflow/components/vectorstores/faiss.py b/src/backend/base/langflow/components/vectorstores/faiss.py index fd1b9c360..6999413e3 100644 --- a/src/backend/base/langflow/components/vectorstores/faiss.py +++ b/src/backend/base/langflow/components/vectorstores/faiss.py @@ -69,6 +69,9 @@ class FaissVectorStoreComponent(LCVectorStoreComponent): path = self.get_persist_directory() path.mkdir(parents=True, exist_ok=True) + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/hcd.py b/src/backend/base/langflow/components/vectorstores/hcd.py index 8b12b9424..e43411a72 100644 --- a/src/backend/base/langflow/components/vectorstores/hcd.py +++ b/src/backend/base/langflow/components/vectorstores/hcd.py @@ -242,6 +242,9 @@ class HCDVectorStoreComponent(LCVectorStoreComponent): return vector_store def _add_documents_to_vector_store(self, vector_store) -> None: + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/milvus.py b/src/backend/base/langflow/components/vectorstores/milvus.py index 4300ddf9a..054fbe1db 100644 --- a/src/backend/base/langflow/components/vectorstores/milvus.py +++ b/src/backend/base/langflow/components/vectorstores/milvus.py @@ -85,6 +85,9 @@ class MilvusVectorStoreComponent(LCVectorStoreComponent): timeout=self.timeout, ) + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py b/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py index f0a9d236d..2798e1423 100644 --- a/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py +++ b/src/backend/base/langflow/components/vectorstores/mongodb_atlas.py @@ -85,6 +85,9 @@ class MongoVectorStoreComponent(LCVectorStoreComponent): msg = f"Failed to connect to MongoDB Atlas: {e}" raise ValueError(msg) from e + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/opensearch.py b/src/backend/base/langflow/components/vectorstores/opensearch.py index 96bf2cc5b..c3962a20d 100644 --- a/src/backend/base/langflow/components/vectorstores/opensearch.py +++ b/src/backend/base/langflow/components/vectorstores/opensearch.py @@ -132,6 +132,9 @@ class OpenSearchVectorStoreComponent(LCVectorStoreComponent): def _add_documents_to_vector_store(self, vector_store: "OpenSearchVectorSearch") -> None: """Adds documents to the Vector Store.""" + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/pgvector.py b/src/backend/base/langflow/components/vectorstores/pgvector.py index 7ae08470c..fef611393 100644 --- a/src/backend/base/langflow/components/vectorstores/pgvector.py +++ b/src/backend/base/langflow/components/vectorstores/pgvector.py @@ -29,6 +29,9 @@ class PGVectorStoreComponent(LCVectorStoreComponent): @check_cached_vector_store def build_vector_store(self) -> PGVector: + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/pinecone.py b/src/backend/base/langflow/components/vectorstores/pinecone.py index f71a2aed2..43488a112 100644 --- a/src/backend/base/langflow/components/vectorstores/pinecone.py +++ b/src/backend/base/langflow/components/vectorstores/pinecone.py @@ -73,9 +73,13 @@ class PineconeVectorStoreComponent(LCVectorStoreComponent): error_msg = "Error building Pinecone vector store" raise ValueError(error_msg) from e else: + self.ingest_data = self._prepare_ingest_data() + # Process documents if any documents = [] if self.ingest_data: + # Convert DataFrame to Data if needed using parent's method + for doc in self.ingest_data: if isinstance(doc, Data): documents.append(doc.to_lc_document()) diff --git a/src/backend/base/langflow/components/vectorstores/qdrant.py b/src/backend/base/langflow/components/vectorstores/qdrant.py index 7a37b243c..1bd50c836 100644 --- a/src/backend/base/langflow/components/vectorstores/qdrant.py +++ b/src/backend/base/langflow/components/vectorstores/qdrant.py @@ -69,8 +69,11 @@ class QdrantVectorStoreComponent(LCVectorStoreComponent): } server_kwargs = {k: v for k, v in server_kwargs.items() if v is not None} - documents = [] + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): documents.append(_input.to_lc_document()) diff --git a/src/backend/base/langflow/components/vectorstores/redis.py b/src/backend/base/langflow/components/vectorstores/redis.py index f628e75b8..dc78c8b00 100644 --- a/src/backend/base/langflow/components/vectorstores/redis.py +++ b/src/backend/base/langflow/components/vectorstores/redis.py @@ -41,8 +41,10 @@ class RedisVectorStoreComponent(LCVectorStoreComponent): @check_cached_vector_store def build_vector_store(self) -> Redis: - documents = [] + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): documents.append(_input.to_lc_document()) diff --git a/src/backend/base/langflow/components/vectorstores/supabase.py b/src/backend/base/langflow/components/vectorstores/supabase.py index 2dc8d8c8a..1decae938 100644 --- a/src/backend/base/langflow/components/vectorstores/supabase.py +++ b/src/backend/base/langflow/components/vectorstores/supabase.py @@ -33,6 +33,9 @@ class SupabaseVectorStoreComponent(LCVectorStoreComponent): def build_vector_store(self) -> SupabaseVectorStore: supabase: Client = create_client(self.supabase_url, supabase_key=self.supabase_service_key) + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/upstash.py b/src/backend/base/langflow/components/vectorstores/upstash.py index b1f94f1f7..d1a80de19 100644 --- a/src/backend/base/langflow/components/vectorstores/upstash.py +++ b/src/backend/base/langflow/components/vectorstores/upstash.py @@ -68,6 +68,9 @@ class UpstashVectorStoreComponent(LCVectorStoreComponent): def build_vector_store(self) -> UpstashVectorStore: use_upstash_embedding = self.embedding is None + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/components/vectorstores/vectara.py b/src/backend/base/langflow/components/vectorstores/vectara.py index 82f146aef..b058ba2f0 100644 --- a/src/backend/base/langflow/components/vectorstores/vectara.py +++ b/src/backend/base/langflow/components/vectorstores/vectara.py @@ -5,7 +5,7 @@ from langchain_community.vectorstores import Vectara from langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store from langflow.helpers.data import docs_to_data from langflow.io import HandleInput, IntInput, SecretStrInput, StrInput -from langflow.schema import Data +from langflow.schema import Data, DataFrame if TYPE_CHECKING: from langchain_community.vectorstores import Vectara @@ -58,12 +58,16 @@ class VectaraVectorStoreComponent(LCVectorStoreComponent): def _add_documents_to_vector_store(self, vector_store: "Vectara") -> None: """Adds documents to the Vector Store.""" - if not self.ingest_data: + ingest_data: list | Data | DataFrame = self.ingest_data + if not ingest_data: self.status = "No documents to add to Vectara" return + # Convert DataFrame to Data if needed using parent's method + ingest_data = self._prepare_ingest_data() + documents = [] - for _input in self.ingest_data or []: + for _input in ingest_data or []: if isinstance(_input, Data): documents.append(_input.to_lc_document()) else: diff --git a/src/backend/base/langflow/components/vectorstores/weaviate.py b/src/backend/base/langflow/components/vectorstores/weaviate.py index feaabdbcb..e6418ce7b 100644 --- a/src/backend/base/langflow/components/vectorstores/weaviate.py +++ b/src/backend/base/langflow/components/vectorstores/weaviate.py @@ -47,6 +47,9 @@ class WeaviateVectorStoreComponent(LCVectorStoreComponent): msg = f"Weaviate requires the index name to be capitalized. Use: {self.index_name.capitalize()}" raise ValueError(msg) + # Convert DataFrame to Data if needed using parent's method + self.ingest_data = self._prepare_ingest_data() + documents = [] for _input in self.ingest_data or []: if isinstance(_input, Data): diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Diet Analysis.json b/src/backend/base/langflow/initial_setup/starter_projects/Diet Analysis.json index 5801095fb..dcfe5cb64 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Diet Analysis.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Diet Analysis.json @@ -809,7 +809,7 @@ }, "stream": { "_input_type": "BoolInput", - "advanced": false, + "advanced": true, "display_name": "Stream", "dynamic": false, "info": "Stream the response from the model. Streaming works only in Chat.", diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Financial Agent.json b/src/backend/base/langflow/initial_setup/starter_projects/Financial Agent.json index 66c93bbcd..4b157d8e1 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Financial Agent.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Financial Agent.json @@ -2325,7 +2325,7 @@ "show": true, "title_case": false, "type": "code", - "value": "from langchain_sambanova import ChatSambaNovaCloud\nfrom pydantic.v1 import SecretStr\n\nfrom langflow.base.models.model import LCModelComponent\nfrom langflow.base.models.sambanova_constants import SAMBANOVA_MODEL_NAMES\nfrom langflow.field_typing import LanguageModel\nfrom langflow.field_typing.range_spec import RangeSpec\nfrom langflow.io import DropdownInput, IntInput, SecretStrInput, SliderInput, StrInput\n\n\nclass SambaNovaComponent(LCModelComponent):\n display_name = \"SambaNova\"\n description = \"Generate text using Sambanova LLMs.\"\n documentation = \"https://cloud.sambanova.ai/\"\n icon = \"SambaNova\"\n name = \"SambaNovaModel\"\n\n inputs = [\n *LCModelComponent._base_inputs,\n StrInput(\n name=\"base_url\",\n display_name=\"SambaNova Cloud Base Url\",\n advanced=True,\n info=\"The base URL of the Sambanova Cloud API. \"\n \"Defaults to https://api.sambanova.ai/v1/chat/completions. \"\n \"You can change this to use other urls like Sambastudio\",\n ),\n DropdownInput(\n name=\"model_name\",\n display_name=\"Model Name\",\n advanced=False,\n options=SAMBANOVA_MODEL_NAMES,\n value=SAMBANOVA_MODEL_NAMES[0],\n ),\n SecretStrInput(\n name=\"api_key\",\n display_name=\"Sambanova API Key\",\n info=\"The Sambanova API Key to use for the Sambanova model.\",\n advanced=False,\n value=\"SAMBANOVA_API_KEY\",\n required=True,\n ),\n IntInput(\n name=\"max_tokens\",\n display_name=\"Max Tokens\",\n advanced=True,\n value=2048,\n info=\"The maximum number of tokens to generate.\",\n ),\n SliderInput(\n name=\"top_p\",\n display_name=\"top_p\",\n advanced=True,\n value=1.0,\n range_spec=RangeSpec(min=0, max=1, step=0.01),\n info=\"Model top_p\",\n ),\n SliderInput(\n name=\"temperature\", display_name=\"Temperature\", value=0.1, range_spec=RangeSpec(min=0, max=2, step=0.01)\n ),\n ]\n\n def build_model(self) -> LanguageModel: # type: ignore[type-var]\n sambanova_url = self.base_url\n sambanova_api_key = self.api_key\n model_name = self.model_name\n max_tokens = self.max_tokens\n top_p = self.top_p\n temperature = self.temperature\n\n api_key = SecretStr(sambanova_api_key).get_secret_value() if sambanova_api_key else None\n\n return ChatSambaNovaCloud(\n model=model_name,\n max_tokens=max_tokens or 1024,\n temperature=temperature or 0.07,\n top_p=top_p,\n sambanova_url=sambanova_url,\n sambanova_api_key=api_key,\n )\n" + "value": "from langchain_sambanova import ChatSambaNovaCloud\nfrom pydantic.v1 import SecretStr\n\nfrom langflow.base.models.model import LCModelComponent\nfrom langflow.base.models.sambanova_constants import SAMBANOVA_MODEL_NAMES\nfrom langflow.field_typing import LanguageModel\nfrom langflow.field_typing.range_spec import RangeSpec\nfrom langflow.io import DropdownInput, IntInput, SecretStrInput, SliderInput, StrInput\n\n\nclass SambaNovaComponent(LCModelComponent):\n display_name = \"SambaNova\"\n description = \"Generate text using Sambanova LLMs.\"\n documentation = \"https://cloud.sambanova.ai/\"\n icon = \"SambaNova\"\n name = \"SambaNovaModel\"\n\n inputs = [\n *LCModelComponent._base_inputs,\n StrInput(\n name=\"base_url\",\n display_name=\"SambaNova Cloud Base Url\",\n advanced=True,\n info=\"The base URL of the Sambanova Cloud API. \"\n \"Defaults to https://api.sambanova.ai/v1/chat/completions. \"\n \"You can change this to use other urls like Sambastudio\",\n ),\n DropdownInput(\n name=\"model_name\",\n display_name=\"Model Name\",\n advanced=False,\n options=SAMBANOVA_MODEL_NAMES,\n value=SAMBANOVA_MODEL_NAMES[0],\n ),\n SecretStrInput(\n name=\"api_key\",\n display_name=\"Sambanova API Key\",\n info=\"The Sambanova API Key to use for the Sambanova model.\",\n advanced=False,\n value=\"SAMBANOVA_API_KEY\",\n required=True,\n ),\n IntInput(\n name=\"max_tokens\",\n display_name=\"Max Tokens\",\n advanced=True,\n value=2048,\n info=\"The maximum number of tokens to generate.\",\n ),\n SliderInput(\n name=\"top_p\",\n display_name=\"top_p\",\n advanced=True,\n value=1.0,\n range_spec=RangeSpec(min=0, max=1, step=0.01),\n info=\"Model top_p\",\n ),\n SliderInput(\n name=\"temperature\",\n display_name=\"Temperature\",\n value=0.1,\n range_spec=RangeSpec(min=0, max=2, step=0.01),\n advanced=True,\n ),\n ]\n\n def build_model(self) -> LanguageModel: # type: ignore[type-var]\n sambanova_url = self.base_url\n sambanova_api_key = self.api_key\n model_name = self.model_name\n max_tokens = self.max_tokens\n top_p = self.top_p\n temperature = self.temperature\n\n api_key = SecretStr(sambanova_api_key).get_secret_value() if sambanova_api_key else None\n\n return ChatSambaNovaCloud(\n model=model_name,\n max_tokens=max_tokens or 1024,\n temperature=temperature or 0.07,\n top_p=top_p,\n sambanova_url=sambanova_url,\n sambanova_api_key=api_key,\n )\n" }, "input_value": { "_input_type": "MessageInput", @@ -2405,7 +2405,7 @@ }, "stream": { "_input_type": "BoolInput", - "advanced": false, + "advanced": true, "display_name": "Stream", "dynamic": false, "info": "Stream the response from the model. Streaming works only in Chat.", @@ -2447,7 +2447,7 @@ }, "temperature": { "_input_type": "SliderInput", - "advanced": false, + "advanced": true, "display_name": "Temperature", "dynamic": false, "info": "", diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Graph Vector Store RAG.json b/src/backend/base/langflow/initial_setup/starter_projects/Graph Vector Store RAG.json index 1afcd9e5d..acb278f5e 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Graph Vector Store RAG.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Graph Vector Store RAG.json @@ -246,7 +246,8 @@ "fieldName": "ingest_data", "id": "AstraDBGraph-OBH6o", "inputTypes": [ - "Data" + "Data", + "DataFrame" ], "type": "other" } @@ -256,7 +257,7 @@ "source": "HtmlLinkExtractor-I8rzh", "sourceHandle": "{œdataTypeœ: œHtmlLinkExtractorœ, œidœ: œHtmlLinkExtractor-I8rzhœ, œnameœ: œdataœ, œoutput_typesœ: [œDataœ]}", "target": "AstraDBGraph-OBH6o", - "targetHandle": "{œfieldNameœ: œingest_dataœ, œidœ: œAstraDBGraph-OBH6oœ, œinputTypesœ: [œDataœ], œtypeœ: œotherœ}" + "targetHandle": "{œfieldNameœ: œingest_dataœ, œidœ: œAstraDBGraph-OBH6oœ, œinputTypesœ: [œDataœ, œDataFrameœ], œtypeœ: œotherœ}" }, { "animated": false, @@ -1004,7 +1005,7 @@ "show": true, "title_case": false, "type": "code", - "value": "import os\n\nimport orjson\nfrom astrapy.admin import parse_api_endpoint\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import (\n BoolInput,\n DictInput,\n DropdownInput,\n FloatInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\n\n\nclass AstraDBGraphVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB Graph\"\n description: str = \"Implementation of Graph Vector Store using Astra DB\"\n name = \"AstraDBGraph\"\n icon: str = \"AstraDB\"\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n advanced=os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\",\n ),\n SecretStrInput(\n name=\"api_endpoint\",\n display_name=\"Database\" if os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\" else \"API Endpoint\",\n info=\"API endpoint URL for the Astra DB service.\",\n value=\"ASTRA_DB_API_ENDPOINT\",\n required=True,\n ),\n StrInput(\n name=\"collection_name\",\n display_name=\"Collection Name\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n ),\n StrInput(\n name=\"metadata_incoming_links_key\",\n display_name=\"Metadata incoming links key\",\n info=\"Metadata key used for incoming links.\",\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs,\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Allows an embedding model configuration.\",\n ),\n DropdownInput(\n name=\"metric\",\n display_name=\"Metric\",\n info=\"Optional distance metric for vector comparisons in the vector store.\",\n options=[\"cosine\", \"dot_product\", \"euclidean\"],\n value=\"cosine\",\n advanced=True,\n ),\n IntInput(\n name=\"batch_size\",\n display_name=\"Batch Size\",\n info=\"Optional number of data to process in a single batch.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_batch_concurrency\",\n display_name=\"Bulk Insert Batch Concurrency\",\n info=\"Optional concurrency level for bulk insert operations.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_overwrite_concurrency\",\n display_name=\"Bulk Insert Overwrite Concurrency\",\n info=\"Optional concurrency level for bulk insert operations that overwrite existing data.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_delete_concurrency\",\n display_name=\"Bulk Delete Concurrency\",\n info=\"Optional concurrency level for bulk delete operations.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"setup_mode\",\n display_name=\"Setup Mode\",\n info=\"Configuration mode for setting up the vector store, with options like 'Sync', or 'Off'.\",\n options=[\"Sync\", \"Off\"],\n advanced=True,\n value=\"Sync\",\n ),\n BoolInput(\n name=\"pre_delete_collection\",\n display_name=\"Pre Delete Collection\",\n info=\"Boolean flag to determine whether to delete the collection before creating a new one.\",\n advanced=True,\n value=False,\n ),\n StrInput(\n name=\"metadata_indexing_include\",\n display_name=\"Metadata Indexing Include\",\n info=\"Optional list of metadata fields to include in the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"metadata_indexing_exclude\",\n display_name=\"Metadata Indexing Exclude\",\n info=\"Optional list of metadata fields to exclude from the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"collection_indexing_policy\",\n display_name=\"Collection Indexing Policy\",\n info='Optional JSON string for the \"indexing\" field of the collection. '\n \"See https://docs.datastax.com/en/astra-db-serverless/api-reference/collections.html#the-indexing-option\",\n advanced=True,\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Results\",\n info=\"Number of results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\n \"Similarity\",\n \"Similarity with score threshold\",\n \"MMR (Max Marginal Relevance)\",\n \"Graph Traversal\",\n \"MMR (Max Marginal Relevance) Graph Traversal\",\n ],\n value=\"MMR (Max Marginal Relevance) Graph Traversal\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n DictInput(\n name=\"search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n is_list=True,\n ),\n ]\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBGraphVectorStore\n from langchain_astradb.utils.astradb import SetupMode\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n try:\n if not self.setup_mode:\n self.setup_mode = self._inputs[\"setup_mode\"].options[0]\n\n setup_mode_value = SetupMode[self.setup_mode.upper()]\n except KeyError as e:\n msg = f\"Invalid setup mode: {self.setup_mode}\"\n raise ValueError(msg) from e\n\n try:\n self.log(f\"Initializing Graph Vector Store {self.collection_name}\")\n\n vector_store = AstraDBGraphVectorStore(\n embedding=self.embedding_model,\n collection_name=self.collection_name,\n metadata_incoming_links_key=self.metadata_incoming_links_key or \"incoming_links\",\n token=self.token,\n api_endpoint=self.api_endpoint,\n namespace=self.keyspace or None,\n environment=parse_api_endpoint(self.api_endpoint).environment if self.api_endpoint else None,\n metric=self.metric or None,\n batch_size=self.batch_size or None,\n bulk_insert_batch_concurrency=self.bulk_insert_batch_concurrency or None,\n bulk_insert_overwrite_concurrency=self.bulk_insert_overwrite_concurrency or None,\n bulk_delete_concurrency=self.bulk_delete_concurrency or None,\n setup_mode=setup_mode_value,\n pre_delete_collection=self.pre_delete_collection,\n metadata_indexing_include=[s for s in self.metadata_indexing_include if s] or None,\n metadata_indexing_exclude=[s for s in self.metadata_indexing_exclude if s] or None,\n collection_indexing_policy=orjson.loads(self.collection_indexing_policy.encode(\"utf-8\"))\n if self.collection_indexing_policy\n else None,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Vector Store initialized: {vector_store.astra_env.collection_name}\")\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n match self.search_type:\n case \"Similarity\":\n return \"similarity\"\n case \"Similarity with score threshold\":\n return \"similarity_score_threshold\"\n case \"MMR (Max Marginal Relevance)\":\n return \"mmr\"\n case \"Graph Traversal\":\n return \"traversal\"\n case \"MMR (Max Marginal Relevance) Graph Traversal\":\n return \"mmr_traversal\"\n case _:\n return \"similarity\"\n\n def _build_search_args(self):\n args = {\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n\n if self.search_filter:\n clean_filter = {k: v for k, v in self.search_filter.items() if k and v}\n if len(clean_filter) > 0:\n args[\"filter\"] = clean_filter\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n if not vector_store:\n vector_store = self.build_vector_store()\n\n self.log(\"Searching for documents in AstraDBGraphVectorStore.\")\n self.log(f\"Search query: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n if self.search_query and isinstance(self.search_query, str) and self.search_query.strip():\n try:\n search_type = self._map_search_type()\n search_args = self._build_search_args()\n\n docs = vector_store.search(query=self.search_query, search_type=search_type, **search_args)\n\n # Drop links from the metadata. At this point the links don't add any value for building the\n # context and haven't been restored to json which causes the conversion to fail.\n self.log(\"Removing links from metadata.\")\n for doc in docs:\n if \"links\" in doc.metadata:\n doc.metadata.pop(\"links\")\n\n except Exception as e:\n msg = f\"Error performing search in AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n\n self.log(f\"Converted documents to data: {len(data)}\")\n\n self.status = data\n return data\n self.log(\"No search input provided. Skipping search.\")\n return []\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" + "value": "import os\n\nimport orjson\nfrom astrapy.admin import parse_api_endpoint\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import (\n BoolInput,\n DictInput,\n DropdownInput,\n FloatInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\n\n\nclass AstraDBGraphVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB Graph\"\n description: str = \"Implementation of Graph Vector Store using Astra DB\"\n name = \"AstraDBGraph\"\n icon: str = \"AstraDB\"\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n advanced=os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\",\n ),\n SecretStrInput(\n name=\"api_endpoint\",\n display_name=\"Database\" if os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\" else \"API Endpoint\",\n info=\"API endpoint URL for the Astra DB service.\",\n value=\"ASTRA_DB_API_ENDPOINT\",\n required=True,\n ),\n StrInput(\n name=\"collection_name\",\n display_name=\"Collection Name\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n ),\n StrInput(\n name=\"metadata_incoming_links_key\",\n display_name=\"Metadata incoming links key\",\n info=\"Metadata key used for incoming links.\",\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs,\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Allows an embedding model configuration.\",\n ),\n DropdownInput(\n name=\"metric\",\n display_name=\"Metric\",\n info=\"Optional distance metric for vector comparisons in the vector store.\",\n options=[\"cosine\", \"dot_product\", \"euclidean\"],\n value=\"cosine\",\n advanced=True,\n ),\n IntInput(\n name=\"batch_size\",\n display_name=\"Batch Size\",\n info=\"Optional number of data to process in a single batch.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_batch_concurrency\",\n display_name=\"Bulk Insert Batch Concurrency\",\n info=\"Optional concurrency level for bulk insert operations.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_overwrite_concurrency\",\n display_name=\"Bulk Insert Overwrite Concurrency\",\n info=\"Optional concurrency level for bulk insert operations that overwrite existing data.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_delete_concurrency\",\n display_name=\"Bulk Delete Concurrency\",\n info=\"Optional concurrency level for bulk delete operations.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"setup_mode\",\n display_name=\"Setup Mode\",\n info=\"Configuration mode for setting up the vector store, with options like 'Sync', or 'Off'.\",\n options=[\"Sync\", \"Off\"],\n advanced=True,\n value=\"Sync\",\n ),\n BoolInput(\n name=\"pre_delete_collection\",\n display_name=\"Pre Delete Collection\",\n info=\"Boolean flag to determine whether to delete the collection before creating a new one.\",\n advanced=True,\n value=False,\n ),\n StrInput(\n name=\"metadata_indexing_include\",\n display_name=\"Metadata Indexing Include\",\n info=\"Optional list of metadata fields to include in the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"metadata_indexing_exclude\",\n display_name=\"Metadata Indexing Exclude\",\n info=\"Optional list of metadata fields to exclude from the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"collection_indexing_policy\",\n display_name=\"Collection Indexing Policy\",\n info='Optional JSON string for the \"indexing\" field of the collection. '\n \"See https://docs.datastax.com/en/astra-db-serverless/api-reference/collections.html#the-indexing-option\",\n advanced=True,\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Results\",\n info=\"Number of results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\n \"Similarity\",\n \"Similarity with score threshold\",\n \"MMR (Max Marginal Relevance)\",\n \"Graph Traversal\",\n \"MMR (Max Marginal Relevance) Graph Traversal\",\n ],\n value=\"MMR (Max Marginal Relevance) Graph Traversal\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n DictInput(\n name=\"search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n is_list=True,\n ),\n ]\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBGraphVectorStore\n from langchain_astradb.utils.astradb import SetupMode\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n try:\n if not self.setup_mode:\n self.setup_mode = self._inputs[\"setup_mode\"].options[0]\n\n setup_mode_value = SetupMode[self.setup_mode.upper()]\n except KeyError as e:\n msg = f\"Invalid setup mode: {self.setup_mode}\"\n raise ValueError(msg) from e\n\n try:\n self.log(f\"Initializing Graph Vector Store {self.collection_name}\")\n\n vector_store = AstraDBGraphVectorStore(\n embedding=self.embedding_model,\n collection_name=self.collection_name,\n metadata_incoming_links_key=self.metadata_incoming_links_key or \"incoming_links\",\n token=self.token,\n api_endpoint=self.api_endpoint,\n namespace=self.keyspace or None,\n environment=parse_api_endpoint(self.api_endpoint).environment if self.api_endpoint else None,\n metric=self.metric or None,\n batch_size=self.batch_size or None,\n bulk_insert_batch_concurrency=self.bulk_insert_batch_concurrency or None,\n bulk_insert_overwrite_concurrency=self.bulk_insert_overwrite_concurrency or None,\n bulk_delete_concurrency=self.bulk_delete_concurrency or None,\n setup_mode=setup_mode_value,\n pre_delete_collection=self.pre_delete_collection,\n metadata_indexing_include=[s for s in self.metadata_indexing_include if s] or None,\n metadata_indexing_exclude=[s for s in self.metadata_indexing_exclude if s] or None,\n collection_indexing_policy=orjson.loads(self.collection_indexing_policy.encode(\"utf-8\"))\n if self.collection_indexing_policy\n else None,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Vector Store initialized: {vector_store.astra_env.collection_name}\")\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n match self.search_type:\n case \"Similarity\":\n return \"similarity\"\n case \"Similarity with score threshold\":\n return \"similarity_score_threshold\"\n case \"MMR (Max Marginal Relevance)\":\n return \"mmr\"\n case \"Graph Traversal\":\n return \"traversal\"\n case \"MMR (Max Marginal Relevance) Graph Traversal\":\n return \"mmr_traversal\"\n case _:\n return \"similarity\"\n\n def _build_search_args(self):\n args = {\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n\n if self.search_filter:\n clean_filter = {k: v for k, v in self.search_filter.items() if k and v}\n if len(clean_filter) > 0:\n args[\"filter\"] = clean_filter\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n if not vector_store:\n vector_store = self.build_vector_store()\n\n self.log(\"Searching for documents in AstraDBGraphVectorStore.\")\n self.log(f\"Search query: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n if self.search_query and isinstance(self.search_query, str) and self.search_query.strip():\n try:\n search_type = self._map_search_type()\n search_args = self._build_search_args()\n\n docs = vector_store.search(query=self.search_query, search_type=search_type, **search_args)\n\n # Drop links from the metadata. At this point the links don't add any value for building the\n # context and haven't been restored to json which causes the conversion to fail.\n self.log(\"Removing links from metadata.\")\n for doc in docs:\n if \"links\" in doc.metadata:\n doc.metadata.pop(\"links\")\n\n except Exception as e:\n msg = f\"Error performing search in AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n\n self.log(f\"Converted documents to data: {len(data)}\")\n\n self.status = data\n return data\n self.log(\"No search input provided. Skipping search.\")\n return []\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" }, "collection_indexing_policy": { "_input_type": "StrInput", @@ -1068,9 +1069,10 @@ "dynamic": false, "info": "", "input_types": [ - "Data" + "Data", + "DataFrame" ], - "list": false, + "list": true, "name": "ingest_data", "placeholder": "", "required": false, @@ -3669,7 +3671,7 @@ "show": true, "title_case": false, "type": "code", - "value": "import os\n\nimport orjson\nfrom astrapy.admin import parse_api_endpoint\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import (\n BoolInput,\n DictInput,\n DropdownInput,\n FloatInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\n\n\nclass AstraDBGraphVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB Graph\"\n description: str = \"Implementation of Graph Vector Store using Astra DB\"\n name = \"AstraDBGraph\"\n icon: str = \"AstraDB\"\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n advanced=os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\",\n ),\n SecretStrInput(\n name=\"api_endpoint\",\n display_name=\"Database\" if os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\" else \"API Endpoint\",\n info=\"API endpoint URL for the Astra DB service.\",\n value=\"ASTRA_DB_API_ENDPOINT\",\n required=True,\n ),\n StrInput(\n name=\"collection_name\",\n display_name=\"Collection Name\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n ),\n StrInput(\n name=\"metadata_incoming_links_key\",\n display_name=\"Metadata incoming links key\",\n info=\"Metadata key used for incoming links.\",\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs,\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Allows an embedding model configuration.\",\n ),\n DropdownInput(\n name=\"metric\",\n display_name=\"Metric\",\n info=\"Optional distance metric for vector comparisons in the vector store.\",\n options=[\"cosine\", \"dot_product\", \"euclidean\"],\n value=\"cosine\",\n advanced=True,\n ),\n IntInput(\n name=\"batch_size\",\n display_name=\"Batch Size\",\n info=\"Optional number of data to process in a single batch.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_batch_concurrency\",\n display_name=\"Bulk Insert Batch Concurrency\",\n info=\"Optional concurrency level for bulk insert operations.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_overwrite_concurrency\",\n display_name=\"Bulk Insert Overwrite Concurrency\",\n info=\"Optional concurrency level for bulk insert operations that overwrite existing data.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_delete_concurrency\",\n display_name=\"Bulk Delete Concurrency\",\n info=\"Optional concurrency level for bulk delete operations.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"setup_mode\",\n display_name=\"Setup Mode\",\n info=\"Configuration mode for setting up the vector store, with options like 'Sync', or 'Off'.\",\n options=[\"Sync\", \"Off\"],\n advanced=True,\n value=\"Sync\",\n ),\n BoolInput(\n name=\"pre_delete_collection\",\n display_name=\"Pre Delete Collection\",\n info=\"Boolean flag to determine whether to delete the collection before creating a new one.\",\n advanced=True,\n value=False,\n ),\n StrInput(\n name=\"metadata_indexing_include\",\n display_name=\"Metadata Indexing Include\",\n info=\"Optional list of metadata fields to include in the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"metadata_indexing_exclude\",\n display_name=\"Metadata Indexing Exclude\",\n info=\"Optional list of metadata fields to exclude from the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"collection_indexing_policy\",\n display_name=\"Collection Indexing Policy\",\n info='Optional JSON string for the \"indexing\" field of the collection. '\n \"See https://docs.datastax.com/en/astra-db-serverless/api-reference/collections.html#the-indexing-option\",\n advanced=True,\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Results\",\n info=\"Number of results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\n \"Similarity\",\n \"Similarity with score threshold\",\n \"MMR (Max Marginal Relevance)\",\n \"Graph Traversal\",\n \"MMR (Max Marginal Relevance) Graph Traversal\",\n ],\n value=\"MMR (Max Marginal Relevance) Graph Traversal\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n DictInput(\n name=\"search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n is_list=True,\n ),\n ]\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBGraphVectorStore\n from langchain_astradb.utils.astradb import SetupMode\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n try:\n if not self.setup_mode:\n self.setup_mode = self._inputs[\"setup_mode\"].options[0]\n\n setup_mode_value = SetupMode[self.setup_mode.upper()]\n except KeyError as e:\n msg = f\"Invalid setup mode: {self.setup_mode}\"\n raise ValueError(msg) from e\n\n try:\n self.log(f\"Initializing Graph Vector Store {self.collection_name}\")\n\n vector_store = AstraDBGraphVectorStore(\n embedding=self.embedding_model,\n collection_name=self.collection_name,\n metadata_incoming_links_key=self.metadata_incoming_links_key or \"incoming_links\",\n token=self.token,\n api_endpoint=self.api_endpoint,\n namespace=self.keyspace or None,\n environment=parse_api_endpoint(self.api_endpoint).environment if self.api_endpoint else None,\n metric=self.metric or None,\n batch_size=self.batch_size or None,\n bulk_insert_batch_concurrency=self.bulk_insert_batch_concurrency or None,\n bulk_insert_overwrite_concurrency=self.bulk_insert_overwrite_concurrency or None,\n bulk_delete_concurrency=self.bulk_delete_concurrency or None,\n setup_mode=setup_mode_value,\n pre_delete_collection=self.pre_delete_collection,\n metadata_indexing_include=[s for s in self.metadata_indexing_include if s] or None,\n metadata_indexing_exclude=[s for s in self.metadata_indexing_exclude if s] or None,\n collection_indexing_policy=orjson.loads(self.collection_indexing_policy.encode(\"utf-8\"))\n if self.collection_indexing_policy\n else None,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Vector Store initialized: {vector_store.astra_env.collection_name}\")\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n match self.search_type:\n case \"Similarity\":\n return \"similarity\"\n case \"Similarity with score threshold\":\n return \"similarity_score_threshold\"\n case \"MMR (Max Marginal Relevance)\":\n return \"mmr\"\n case \"Graph Traversal\":\n return \"traversal\"\n case \"MMR (Max Marginal Relevance) Graph Traversal\":\n return \"mmr_traversal\"\n case _:\n return \"similarity\"\n\n def _build_search_args(self):\n args = {\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n\n if self.search_filter:\n clean_filter = {k: v for k, v in self.search_filter.items() if k and v}\n if len(clean_filter) > 0:\n args[\"filter\"] = clean_filter\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n if not vector_store:\n vector_store = self.build_vector_store()\n\n self.log(\"Searching for documents in AstraDBGraphVectorStore.\")\n self.log(f\"Search query: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n if self.search_query and isinstance(self.search_query, str) and self.search_query.strip():\n try:\n search_type = self._map_search_type()\n search_args = self._build_search_args()\n\n docs = vector_store.search(query=self.search_query, search_type=search_type, **search_args)\n\n # Drop links from the metadata. At this point the links don't add any value for building the\n # context and haven't been restored to json which causes the conversion to fail.\n self.log(\"Removing links from metadata.\")\n for doc in docs:\n if \"links\" in doc.metadata:\n doc.metadata.pop(\"links\")\n\n except Exception as e:\n msg = f\"Error performing search in AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n\n self.log(f\"Converted documents to data: {len(data)}\")\n\n self.status = data\n return data\n self.log(\"No search input provided. Skipping search.\")\n return []\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" + "value": "import os\n\nimport orjson\nfrom astrapy.admin import parse_api_endpoint\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import (\n BoolInput,\n DictInput,\n DropdownInput,\n FloatInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\n\n\nclass AstraDBGraphVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB Graph\"\n description: str = \"Implementation of Graph Vector Store using Astra DB\"\n name = \"AstraDBGraph\"\n icon: str = \"AstraDB\"\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n advanced=os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\",\n ),\n SecretStrInput(\n name=\"api_endpoint\",\n display_name=\"Database\" if os.getenv(\"ASTRA_ENHANCED\", \"false\").lower() == \"true\" else \"API Endpoint\",\n info=\"API endpoint URL for the Astra DB service.\",\n value=\"ASTRA_DB_API_ENDPOINT\",\n required=True,\n ),\n StrInput(\n name=\"collection_name\",\n display_name=\"Collection Name\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n ),\n StrInput(\n name=\"metadata_incoming_links_key\",\n display_name=\"Metadata incoming links key\",\n info=\"Metadata key used for incoming links.\",\n advanced=True,\n ),\n *LCVectorStoreComponent.inputs,\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Allows an embedding model configuration.\",\n ),\n DropdownInput(\n name=\"metric\",\n display_name=\"Metric\",\n info=\"Optional distance metric for vector comparisons in the vector store.\",\n options=[\"cosine\", \"dot_product\", \"euclidean\"],\n value=\"cosine\",\n advanced=True,\n ),\n IntInput(\n name=\"batch_size\",\n display_name=\"Batch Size\",\n info=\"Optional number of data to process in a single batch.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_batch_concurrency\",\n display_name=\"Bulk Insert Batch Concurrency\",\n info=\"Optional concurrency level for bulk insert operations.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_insert_overwrite_concurrency\",\n display_name=\"Bulk Insert Overwrite Concurrency\",\n info=\"Optional concurrency level for bulk insert operations that overwrite existing data.\",\n advanced=True,\n ),\n IntInput(\n name=\"bulk_delete_concurrency\",\n display_name=\"Bulk Delete Concurrency\",\n info=\"Optional concurrency level for bulk delete operations.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"setup_mode\",\n display_name=\"Setup Mode\",\n info=\"Configuration mode for setting up the vector store, with options like 'Sync', or 'Off'.\",\n options=[\"Sync\", \"Off\"],\n advanced=True,\n value=\"Sync\",\n ),\n BoolInput(\n name=\"pre_delete_collection\",\n display_name=\"Pre Delete Collection\",\n info=\"Boolean flag to determine whether to delete the collection before creating a new one.\",\n advanced=True,\n value=False,\n ),\n StrInput(\n name=\"metadata_indexing_include\",\n display_name=\"Metadata Indexing Include\",\n info=\"Optional list of metadata fields to include in the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"metadata_indexing_exclude\",\n display_name=\"Metadata Indexing Exclude\",\n info=\"Optional list of metadata fields to exclude from the indexing.\",\n advanced=True,\n list=True,\n ),\n StrInput(\n name=\"collection_indexing_policy\",\n display_name=\"Collection Indexing Policy\",\n info='Optional JSON string for the \"indexing\" field of the collection. '\n \"See https://docs.datastax.com/en/astra-db-serverless/api-reference/collections.html#the-indexing-option\",\n advanced=True,\n ),\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Results\",\n info=\"Number of results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\n \"Similarity\",\n \"Similarity with score threshold\",\n \"MMR (Max Marginal Relevance)\",\n \"Graph Traversal\",\n \"MMR (Max Marginal Relevance) Graph Traversal\",\n ],\n value=\"MMR (Max Marginal Relevance) Graph Traversal\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n DictInput(\n name=\"search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n is_list=True,\n ),\n ]\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBGraphVectorStore\n from langchain_astradb.utils.astradb import SetupMode\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n try:\n if not self.setup_mode:\n self.setup_mode = self._inputs[\"setup_mode\"].options[0]\n\n setup_mode_value = SetupMode[self.setup_mode.upper()]\n except KeyError as e:\n msg = f\"Invalid setup mode: {self.setup_mode}\"\n raise ValueError(msg) from e\n\n try:\n self.log(f\"Initializing Graph Vector Store {self.collection_name}\")\n\n vector_store = AstraDBGraphVectorStore(\n embedding=self.embedding_model,\n collection_name=self.collection_name,\n metadata_incoming_links_key=self.metadata_incoming_links_key or \"incoming_links\",\n token=self.token,\n api_endpoint=self.api_endpoint,\n namespace=self.keyspace or None,\n environment=parse_api_endpoint(self.api_endpoint).environment if self.api_endpoint else None,\n metric=self.metric or None,\n batch_size=self.batch_size or None,\n bulk_insert_batch_concurrency=self.bulk_insert_batch_concurrency or None,\n bulk_insert_overwrite_concurrency=self.bulk_insert_overwrite_concurrency or None,\n bulk_delete_concurrency=self.bulk_delete_concurrency or None,\n setup_mode=setup_mode_value,\n pre_delete_collection=self.pre_delete_collection,\n metadata_indexing_include=[s for s in self.metadata_indexing_include if s] or None,\n metadata_indexing_exclude=[s for s in self.metadata_indexing_exclude if s] or None,\n collection_indexing_policy=orjson.loads(self.collection_indexing_policy.encode(\"utf-8\"))\n if self.collection_indexing_policy\n else None,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Vector Store initialized: {vector_store.astra_env.collection_name}\")\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n match self.search_type:\n case \"Similarity\":\n return \"similarity\"\n case \"Similarity with score threshold\":\n return \"similarity_score_threshold\"\n case \"MMR (Max Marginal Relevance)\":\n return \"mmr\"\n case \"Graph Traversal\":\n return \"traversal\"\n case \"MMR (Max Marginal Relevance) Graph Traversal\":\n return \"mmr_traversal\"\n case _:\n return \"similarity\"\n\n def _build_search_args(self):\n args = {\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n\n if self.search_filter:\n clean_filter = {k: v for k, v in self.search_filter.items() if k and v}\n if len(clean_filter) > 0:\n args[\"filter\"] = clean_filter\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n if not vector_store:\n vector_store = self.build_vector_store()\n\n self.log(\"Searching for documents in AstraDBGraphVectorStore.\")\n self.log(f\"Search query: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n if self.search_query and isinstance(self.search_query, str) and self.search_query.strip():\n try:\n search_type = self._map_search_type()\n search_args = self._build_search_args()\n\n docs = vector_store.search(query=self.search_query, search_type=search_type, **search_args)\n\n # Drop links from the metadata. At this point the links don't add any value for building the\n # context and haven't been restored to json which causes the conversion to fail.\n self.log(\"Removing links from metadata.\")\n for doc in docs:\n if \"links\" in doc.metadata:\n doc.metadata.pop(\"links\")\n\n except Exception as e:\n msg = f\"Error performing search in AstraDBGraphVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n\n self.log(f\"Converted documents to data: {len(data)}\")\n\n self.status = data\n return data\n self.log(\"No search input provided. Skipping search.\")\n return []\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" }, "collection_indexing_policy": { "_input_type": "StrInput", @@ -3736,9 +3738,10 @@ "dynamic": false, "info": "", "input_types": [ - "Data" + "Data", + "DataFrame" ], - "list": false, + "list": true, "list_add_label": "Add More", "name": "ingest_data", "placeholder": "", diff --git a/src/backend/base/langflow/initial_setup/starter_projects/Vector Store RAG.json b/src/backend/base/langflow/initial_setup/starter_projects/Vector Store RAG.json index ea4b97759..8f3e575e2 100644 --- a/src/backend/base/langflow/initial_setup/starter_projects/Vector Store RAG.json +++ b/src/backend/base/langflow/initial_setup/starter_projects/Vector Store RAG.json @@ -185,7 +185,8 @@ "fieldName": "ingest_data", "id": "AstraDB-PEjRy", "inputTypes": [ - "Data" + "Data", + "DataFrame" ], "type": "other" } @@ -194,7 +195,7 @@ "source": "SplitText-vaZeR", "sourceHandle": "{œdataTypeœ: œSplitTextœ, œidœ: œSplitText-vaZeRœ, œnameœ: œchunksœ, œoutput_typesœ: [œDataœ]}", "target": "AstraDB-PEjRy", - "targetHandle": "{œfieldNameœ: œingest_dataœ, œidœ: œAstraDB-PEjRyœ, œinputTypesœ: [œDataœ], œtypeœ: œotherœ}" + "targetHandle": "{œfieldNameœ: œingest_dataœ, œidœ: œAstraDB-PEjRyœ, œinputTypesœ: [œDataœ, œDataFrameœ], œtypeœ: œotherœ}" }, { "data": { @@ -3479,7 +3480,7 @@ "show": true, "title_case": false, "type": "code", - "value": "import re\nfrom collections import defaultdict\nfrom dataclasses import asdict, dataclass, field\n\nfrom astrapy import AstraDBAdmin, DataAPIClient, Database\nfrom astrapy.info import CollectionDescriptor\nfrom langchain_astradb import AstraDBVectorStore, CollectionVectorServiceOptions\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import FloatInput, NestedDictInput\nfrom langflow.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\nfrom langflow.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.datastax.com/en/langflow/astra-components.html\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n _cached_vector_store: AstraDBVectorStore | None = None\n\n @dataclass\n class NewDatabaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_database\",\n \"description\": \"Please allow several minutes for creation to complete.\",\n \"display_name\": \"Create new database\",\n \"field_order\": [\"01_new_database_name\", \"02_cloud_provider\", \"03_region\"],\n \"template\": {\n \"01_new_database_name\": StrInput(\n name=\"new_database_name\",\n display_name=\"Name\",\n info=\"Name of the new database to create in Astra DB.\",\n required=True,\n ),\n \"02_cloud_provider\": DropdownInput(\n name=\"cloud_provider\",\n display_name=\"Cloud provider\",\n info=\"Cloud provider for the new database.\",\n options=[],\n required=True,\n real_time_refresh=True,\n ),\n \"03_region\": DropdownInput(\n name=\"region\",\n display_name=\"Region\",\n info=\"Region for the new database.\",\n options=[],\n required=True,\n ),\n },\n },\n }\n }\n )\n\n @dataclass\n class NewCollectionInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_collection\",\n \"description\": \"Please allow several seconds for creation to complete.\",\n \"display_name\": \"Create new collection\",\n \"field_order\": [\n \"01_new_collection_name\",\n \"02_embedding_generation_provider\",\n \"03_embedding_generation_model\",\n \"04_dimension\",\n ],\n \"template\": {\n \"01_new_collection_name\": StrInput(\n name=\"new_collection_name\",\n display_name=\"Name\",\n info=\"Name of the new collection to create in Astra DB.\",\n required=True,\n ),\n \"02_embedding_generation_provider\": DropdownInput(\n name=\"embedding_generation_provider\",\n display_name=\"Embedding generation method\",\n info=\"Provider to use for generating embeddings.\",\n helper_text=(\n \"To create collections with more embedding provider options, go to \"\n 'your database in Astra DB'\n ),\n real_time_refresh=True,\n required=True,\n options=[],\n ),\n \"03_embedding_generation_model\": DropdownInput(\n name=\"embedding_generation_model\",\n display_name=\"Embedding model\",\n info=\"Model to use for generating embeddings.\",\n real_time_refresh=True,\n options=[],\n ),\n \"04_dimension\": IntInput(\n name=\"dimension\",\n display_name=\"Dimensions\",\n info=\"Dimensions of the embeddings to generate.\",\n value=None,\n ),\n },\n },\n }\n }\n )\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n real_time_refresh=True,\n input_types=[],\n ),\n StrInput(\n name=\"environment\",\n display_name=\"Environment\",\n info=\"The environment for the Astra DB API Endpoint.\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"database_name\",\n display_name=\"Database\",\n info=\"The Database name for the Astra DB instance.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewDatabaseInput()),\n combobox=True,\n ),\n StrInput(\n name=\"api_endpoint\",\n display_name=\"Astra DB API Endpoint\",\n info=\"The API Endpoint for the Astra DB instance. Supercedes database selection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"collection_name\",\n display_name=\"Collection\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewCollectionInput()),\n combobox=True,\n advanced=True,\n ),\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"embedding_choice\",\n display_name=\"Embedding Model or Astra Vectorize\",\n info=\"Choose an embedding model or use Astra Vectorize.\",\n options=[\"Embedding Model\", \"Astra Vectorize\"],\n value=\"Embedding Model\",\n advanced=True,\n real_time_refresh=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n ),\n *LCVectorStoreComponent.inputs,\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n BoolInput(\n name=\"autodetect_collection\",\n display_name=\"Autodetect Collection\",\n info=\"Boolean flag to determine whether to autodetect the collection.\",\n advanced=True,\n value=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n ]\n\n @classmethod\n def map_cloud_providers(cls):\n # TODO: Programmatically fetch the regions for each cloud provider\n return {\n \"dev\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n # TODO: Check test regions\n \"test\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n \"prod\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-east-2\", \"ap-south-1\", \"eu-west-1\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-east1\"],\n },\n \"Microsoft Azure\": {\n \"id\": \"azure\",\n \"regions\": [\"westus3\"],\n },\n },\n }\n\n @classmethod\n def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):\n try:\n # Get the admin object\n admin = AstraDBAdmin(token=token, environment=environment)\n db_admin = admin.get_database_admin(api_endpoint=api_endpoint)\n\n # Get the list of embedding providers\n embedding_providers = db_admin.find_embedding_providers().as_dict()\n\n vectorize_providers_mapping = {}\n # Map the provider display name to the provider key and models\n for provider_key, provider_data in embedding_providers[\"embeddingProviders\"].items():\n # Get the provider display name and models\n display_name = provider_data[\"displayName\"]\n models = [model[\"name\"] for model in provider_data[\"models\"]]\n\n # Build our mapping\n vectorize_providers_mapping[display_name] = [provider_key, models]\n\n # Sort the resulting dictionary\n return defaultdict(list, dict(sorted(vectorize_providers_mapping.items())))\n except Exception as _: # noqa: BLE001\n return {}\n\n @classmethod\n async def create_database_api(\n cls,\n new_database_name: str,\n cloud_provider: str,\n region: str,\n token: str,\n environment: str | None = None,\n keyspace: str | None = None,\n ):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the environment, set to prod if null like\n my_env = environment or \"prod\"\n\n # Raise a value error if name isn't provided\n if not new_database_name:\n msg = \"Database name is required to create a new database.\"\n raise ValueError(msg)\n\n # Call the create database function\n return await admin_client.async_create_database(\n name=new_database_name,\n cloud_provider=cls.map_cloud_providers()[my_env][cloud_provider][\"id\"],\n region=region,\n keyspace=keyspace,\n wait_until_active=False,\n )\n\n @classmethod\n async def create_collection_api(\n cls,\n new_collection_name: str,\n token: str,\n api_endpoint: str,\n environment: str | None = None,\n keyspace: str | None = None,\n dimension: int | None = None,\n embedding_generation_provider: str | None = None,\n embedding_generation_model: str | None = None,\n ):\n # Create the data API client\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the database object\n database = client.get_async_database(api_endpoint=api_endpoint, token=token)\n\n # Build vectorize options, if needed\n vectorize_options = None\n if not dimension:\n vectorize_options = CollectionVectorServiceOptions(\n provider=cls.get_vectorize_providers(\n token=token, environment=environment, api_endpoint=api_endpoint\n ).get(embedding_generation_provider, [None, []])[0],\n model_name=embedding_generation_model,\n )\n\n # Raise a value error if name isn't provided\n if not new_collection_name:\n msg = \"Collection name is required to create a new collection.\"\n raise ValueError(msg)\n\n # Create the collection\n return await database.create_collection(\n name=new_collection_name,\n keyspace=keyspace,\n dimension=dimension,\n service=vectorize_options,\n )\n\n @classmethod\n def get_database_list_static(cls, token: str, environment: str | None = None):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the list of databases\n db_list = list(admin_client.list_databases())\n\n # Set the environment properly\n env_string = \"\"\n if environment and environment != \"prod\":\n env_string = f\"-{environment}\"\n\n # Generate the api endpoint for each database\n db_info_dict = {}\n for db in db_list:\n try:\n # Get the API endpoint for the database\n api_endpoint = f\"https://{db.info.id}-{db.info.region}.apps.astra{env_string}.datastax.com\"\n\n # Get the number of collections\n try:\n num_collections = len(\n list(\n client.get_database(\n api_endpoint=api_endpoint, token=token, keyspace=db.info.keyspace\n ).list_collection_names(keyspace=db.info.keyspace)\n )\n )\n except Exception: # noqa: BLE001\n if db.status != \"PENDING\":\n continue\n num_collections = 0\n\n # Add the database to the dictionary\n db_info_dict[db.info.name] = {\n \"api_endpoint\": api_endpoint,\n \"collections\": num_collections,\n \"status\": db.status if db.status != \"ACTIVE\" else None,\n \"org_id\": db.org_id if db.org_id else None,\n }\n except Exception: # noqa: BLE001, S110\n pass\n\n return db_info_dict\n\n def get_database_list(self):\n return self.get_database_list_static(token=self.token, environment=self.environment)\n\n @classmethod\n def get_api_endpoint_static(\n cls,\n token: str,\n environment: str | None = None,\n api_endpoint: str | None = None,\n database_name: str | None = None,\n ):\n # If the api_endpoint is set, return it\n if api_endpoint:\n return api_endpoint\n\n # Check if the database_name is like a url\n if database_name and database_name.startswith(\"https://\"):\n return database_name\n\n # If the database is not set, nothing we can do.\n if not database_name:\n return None\n\n # Grab the database object\n db = cls.get_database_list_static(token=token, environment=environment).get(database_name)\n if not db:\n return None\n\n # Otherwise, get the URL from the database list\n return db.get(\"api_endpoint\")\n\n def get_api_endpoint(self):\n return self.get_api_endpoint_static(\n token=self.token,\n environment=self.environment,\n api_endpoint=self.api_endpoint,\n database_name=self.database_name,\n )\n\n @classmethod\n def get_database_id_static(cls, api_endpoint: str) -> str | None:\n # Pattern matches standard UUID format: 8-4-4-4-12 hexadecimal characters\n uuid_pattern = r\"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\"\n match = re.search(uuid_pattern, api_endpoint)\n\n return match.group(0) if match else None\n\n def get_database_id(self):\n return self.get_database_id_static(api_endpoint=self.get_api_endpoint())\n\n def get_keyspace(self):\n keyspace = self.keyspace\n\n if keyspace:\n return keyspace.strip()\n\n return None\n\n def get_database_object(self, api_endpoint: str | None = None):\n try:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n return client.get_database(\n api_endpoint=api_endpoint or self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n except Exception as e:\n msg = f\"Error fetching database object: {e}\"\n raise ValueError(msg) from e\n\n def collection_data(self, collection_name: str, database: Database | None = None):\n try:\n if not database:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n database = client.get_database(\n api_endpoint=self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n\n collection = database.get_collection(collection_name, keyspace=self.get_keyspace())\n\n return collection.estimated_document_count()\n except Exception as e: # noqa: BLE001\n self.log(f\"Error checking collection data: {e}\")\n\n return None\n\n def _initialize_database_options(self):\n try:\n return [\n {\n \"name\": name,\n \"status\": info[\"status\"],\n \"collections\": info[\"collections\"],\n \"api_endpoint\": info[\"api_endpoint\"],\n \"org_id\": info[\"org_id\"],\n }\n for name, info in self.get_database_list().items()\n ]\n except Exception as e:\n msg = f\"Error fetching database options: {e}\"\n raise ValueError(msg) from e\n\n @classmethod\n def get_provider_icon(cls, collection: CollectionDescriptor | None = None, provider_name: str | None = None) -> str:\n # Get the provider name from the collection\n provider_name = provider_name or (\n collection.options.vector.service.provider\n if collection and collection.options and collection.options.vector and collection.options.vector.service\n else None\n )\n\n # If there is no provider, use the vector store icon\n if not provider_name or provider_name == \"Bring your own\":\n return \"vectorstores\"\n\n # Map provider casings\n case_map = {\n \"nvidia\": \"NVIDIA\",\n \"openai\": \"OpenAI\",\n \"amazon bedrock\": \"AmazonBedrockEmbeddings\",\n \"azure openai\": \"AzureOpenAiEmbeddings\",\n \"cohere\": \"Cohere\",\n \"jina ai\": \"JinaAI\",\n \"mistral ai\": \"MistralAI\",\n \"upstage\": \"Upstage\",\n \"voyage ai\": \"VoyageAI\",\n }\n\n # Adjust the casing on some like nvidia\n return case_map[provider_name.lower()] if provider_name.lower() in case_map else provider_name.title()\n\n def _initialize_collection_options(self, api_endpoint: str | None = None):\n # Nothing to generate if we don't have an API endpoint yet\n api_endpoint = api_endpoint or self.get_api_endpoint()\n if not api_endpoint:\n return []\n\n # Retrieve the database object\n database = self.get_database_object(api_endpoint=api_endpoint)\n\n # Get the list of collections\n collection_list = list(database.list_collections(keyspace=self.get_keyspace()))\n\n # Return the list of collections and metadata associated\n return [\n {\n \"name\": col.name,\n \"records\": self.collection_data(collection_name=col.name, database=database),\n \"provider\": (\n col.options.vector.service.provider if col.options.vector and col.options.vector.service else None\n ),\n \"icon\": self.get_provider_icon(collection=col),\n \"model\": (\n col.options.vector.service.model_name if col.options.vector and col.options.vector.service else None\n ),\n }\n for col in collection_list\n ]\n\n def reset_provider_options(self, build_config: dict) -> dict:\n \"\"\"Reset provider options and related configurations in the build_config dictionary.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get vectorize providers\n vectorize_providers_api = self.get_vectorize_providers(\n token=self.token,\n environment=self.environment,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n )\n\n # Create a new dictionary with \"Bring your own\" first\n vectorize_providers: dict[str, list[list[str]]] = {\"Bring your own\": [[], []]}\n\n # Add the remaining items (only Nvidia) from the original dictionary\n vectorize_providers.update(\n {\n k: v\n for k, v in vectorize_providers_api.items()\n if k.lower() in [\"nvidia\"] # TODO: Eventually support more\n }\n )\n\n # Set provider options\n provider_field = \"02_embedding_generation_provider\"\n template[provider_field][\"options\"] = list(vectorize_providers.keys())\n\n # Add metadata for each provider option\n template[provider_field][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=provider)} for provider in template[provider_field][\"options\"]\n ]\n\n # Get selected embedding provider\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure embedding model field\n model_field = \"03_embedding_generation_model\"\n template[model_field].update(\n {\n \"options\": vectorize_providers.get(embedding_provider, [[], []])[1],\n \"placeholder\": \"Bring your own\" if is_bring_your_own else None,\n \"readonly\": is_bring_your_own,\n \"required\": not is_bring_your_own,\n \"value\": None,\n }\n )\n\n # If this is a bring your own, set dimensions to 0\n return self.reset_dimension_field(build_config)\n\n def reset_dimension_field(self, build_config: dict) -> dict:\n \"\"\"Reset dimension field options based on provided configuration.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get selected embedding model\n provider_field = \"02_embedding_generation_provider\"\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure dimension field\n dimension_field = \"04_dimension\"\n dimension_value = 1024 if not is_bring_your_own else None # TODO: Dynamically figure this out\n template[dimension_field].update(\n {\n \"placeholder\": dimension_value,\n \"value\": dimension_value,\n \"readonly\": not is_bring_your_own,\n \"required\": is_bring_your_own,\n }\n )\n\n return build_config\n\n def reset_collection_list(self, build_config: dict) -> dict:\n \"\"\"Reset collection list options based on provided configuration.\"\"\"\n # Get collection options\n collection_options = self._initialize_collection_options(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n\n # Update collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update(\n {\n \"options\": [col[\"name\"] for col in collection_options],\n \"options_metadata\": [{k: v for k, v in col.items() if k != \"name\"} for col in collection_options],\n }\n )\n\n # Reset selected collection if not in options\n if collection_config[\"value\"] not in collection_config[\"options\"]:\n collection_config[\"value\"] = \"\"\n\n # Set advanced status based on database selection\n collection_config[\"advanced\"] = not build_config[\"database_name\"][\"value\"]\n\n return build_config\n\n def reset_database_list(self, build_config: dict) -> dict:\n \"\"\"Reset database list options and related configurations.\"\"\"\n # Get database options\n database_options = self._initialize_database_options()\n\n # Update cloud provider options\n env = self.environment or \"prod\"\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_cloud_provider\"][\"options\"] = list(self.map_cloud_providers()[env].keys())\n\n # Update database configuration\n database_config = build_config[\"database_name\"]\n database_config.update(\n {\n \"options\": [db[\"name\"] for db in database_options],\n \"options_metadata\": [{k: v for k, v in db.items() if k != \"name\"} for db in database_options],\n }\n )\n\n # Reset selections if value not in options\n if database_config[\"value\"] not in database_config[\"options\"]:\n database_config[\"value\"] = \"\"\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n build_config[\"collection_name\"][\"advanced\"] = True\n\n # Set advanced status based on token presence\n database_config[\"advanced\"] = not build_config[\"token\"][\"value\"]\n\n return build_config\n\n def reset_build_config(self, build_config: dict) -> dict:\n \"\"\"Reset all build configuration options to default empty state.\"\"\"\n # Reset database configuration\n database_config = build_config[\"database_name\"]\n database_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n\n # Reset collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n\n return build_config\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build configuration based on field name and value.\"\"\"\n # Early return if no token provided\n if not self.token:\n return self.reset_build_config(build_config)\n\n # Database creation callback\n if field_name == \"database_name\" and isinstance(field_value, dict):\n if \"01_new_database_name\" in field_value:\n await self._create_new_database(build_config, field_value)\n return self.reset_collection_list(build_config)\n return self._update_cloud_regions(build_config, field_value)\n\n # Collection creation callback\n if field_name == \"collection_name\" and isinstance(field_value, dict):\n # Case 1: New collection creation\n if \"01_new_collection_name\" in field_value:\n await self._create_new_collection(build_config, field_value)\n return build_config\n\n # Case 2: Update embedding provider options\n if \"02_embedding_generation_provider\" in field_value:\n return self.reset_provider_options(build_config)\n\n # Case 3: Update dimension field\n if \"03_embedding_generation_model\" in field_value:\n return self.reset_dimension_field(build_config)\n\n # Initial execution or token/environment change\n first_run = field_name == \"collection_name\" and not field_value and not build_config[\"database_name\"][\"options\"]\n if first_run or field_name in {\"token\", \"environment\"}:\n return self.reset_database_list(build_config)\n\n # Database selection change\n if field_name == \"database_name\" and not isinstance(field_value, dict):\n return self._handle_database_selection(build_config, field_value)\n\n # Collection selection change\n if field_name == \"collection_name\" and not isinstance(field_value, dict):\n return self._handle_collection_selection(build_config, field_value)\n\n return build_config\n\n async def _create_new_database(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new database and update build config options.\"\"\"\n try:\n await self.create_database_api(\n new_database_name=field_value[\"01_new_database_name\"],\n token=self.token,\n keyspace=self.get_keyspace(),\n environment=self.environment,\n cloud_provider=field_value[\"02_cloud_provider\"],\n region=field_value[\"03_region\"],\n )\n except Exception as e:\n msg = f\"Error creating database: {e}\"\n raise ValueError(msg) from e\n\n build_config[\"database_name\"][\"options\"].append(field_value[\"01_new_database_name\"])\n build_config[\"database_name\"][\"options_metadata\"].append(\n {\n \"status\": \"PENDING\",\n \"collections\": 0,\n \"api_endpoint\": None,\n \"org_id\": None,\n }\n )\n\n def _update_cloud_regions(self, build_config: dict, field_value: dict) -> dict:\n \"\"\"Update cloud provider regions in build config.\"\"\"\n env = self.environment or \"prod\"\n cloud_provider = field_value[\"02_cloud_provider\"]\n\n # Update the region options based on the selected cloud provider\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"03_region\"][\"options\"] = self.map_cloud_providers()[env][cloud_provider][\"regions\"]\n\n # Reset the the 03_region value if it's not in the new options\n if template[\"03_region\"][\"value\"] not in template[\"03_region\"][\"options\"]:\n template[\"03_region\"][\"value\"] = None\n\n return build_config\n\n async def _create_new_collection(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new collection and update build config options.\"\"\"\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n try:\n await self.create_collection_api(\n new_collection_name=field_value[\"01_new_collection_name\"],\n token=self.token,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n environment=self.environment,\n keyspace=self.get_keyspace(),\n dimension=field_value.get(\"04_dimension\") if embedding_provider == \"Bring your own\" else None,\n embedding_generation_provider=embedding_provider,\n embedding_generation_model=field_value.get(\"03_embedding_generation_model\"),\n )\n except Exception as e:\n msg = f\"Error creating collection: {e}\"\n raise ValueError(msg) from e\n\n provider = embedding_provider.lower() if embedding_provider and embedding_provider != \"Bring your own\" else None\n build_config[\"collection_name\"].update(\n {\n \"value\": field_value[\"01_new_collection_name\"],\n \"options\": build_config[\"collection_name\"][\"options\"] + [field_value[\"01_new_collection_name\"]],\n }\n )\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": provider,\n \"icon\": self.get_provider_icon(provider_name=embedding_provider),\n \"model\": field_value.get(\"03_embedding_generation_model\"),\n }\n )\n\n def _handle_database_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle database selection and update related configurations.\"\"\"\n build_config = self.reset_database_list(build_config)\n\n # Reset collection list if database selection changes\n if field_value not in build_config[\"database_name\"][\"options\"]:\n build_config[\"database_name\"][\"value\"] = \"\"\n return build_config\n\n # Get the api endpoint for the selected database\n index = build_config[\"database_name\"][\"options\"].index(field_value)\n build_config[\"api_endpoint\"][\"value\"] = build_config[\"database_name\"][\"options_metadata\"][index][\"api_endpoint\"]\n\n # Get the org_id for the selected database\n org_id = build_config[\"database_name\"][\"options_metadata\"][index][\"org_id\"]\n if not org_id:\n return build_config\n\n # Get the database id for the selected database\n db_id = self.get_database_id_static(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n keyspace = self.get_keyspace() or \"default_keyspace\"\n\n # Update the helper text for the embedding provider field\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_embedding_generation_provider\"][\"helper_text\"] = (\n \"To create collections with more embedding provider options, go to \"\n f''\n \"your database in Astra DB.\"\n )\n\n # Reset provider options\n build_config = self.reset_provider_options(build_config)\n\n return self.reset_collection_list(build_config)\n\n def _handle_collection_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle collection selection and update embedding options.\"\"\"\n build_config[\"autodetect_collection\"][\"value\"] = True\n build_config = self.reset_collection_list(build_config)\n\n if field_value and field_value not in build_config[\"collection_name\"][\"options\"]:\n build_config[\"collection_name\"][\"options\"].append(field_value)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": None,\n \"icon\": \"vectorstores\",\n \"model\": None,\n }\n )\n build_config[\"autodetect_collection\"][\"value\"] = False\n\n if not field_value:\n return build_config\n\n index = build_config[\"collection_name\"][\"options\"].index(field_value)\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n return build_config\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = (\n {\"embedding\": self.embedding_model}\n if self.embedding_model and self.embedding_choice == \"Embedding Model\"\n else {}\n )\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except Exception as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n try:\n search_args = self._build_search_args()\n except Exception as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except Exception as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" + "value": "import re\nfrom collections import defaultdict\nfrom dataclasses import asdict, dataclass, field\n\nfrom astrapy import AstraDBAdmin, DataAPIClient, Database\nfrom astrapy.info import CollectionDescriptor\nfrom langchain_astradb import AstraDBVectorStore, CollectionVectorServiceOptions\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import FloatInput, NestedDictInput\nfrom langflow.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\nfrom langflow.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.datastax.com/en/langflow/astra-components.html\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n _cached_vector_store: AstraDBVectorStore | None = None\n\n @dataclass\n class NewDatabaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_database\",\n \"description\": \"Please allow several minutes for creation to complete.\",\n \"display_name\": \"Create new database\",\n \"field_order\": [\"01_new_database_name\", \"02_cloud_provider\", \"03_region\"],\n \"template\": {\n \"01_new_database_name\": StrInput(\n name=\"new_database_name\",\n display_name=\"Name\",\n info=\"Name of the new database to create in Astra DB.\",\n required=True,\n ),\n \"02_cloud_provider\": DropdownInput(\n name=\"cloud_provider\",\n display_name=\"Cloud provider\",\n info=\"Cloud provider for the new database.\",\n options=[],\n required=True,\n real_time_refresh=True,\n ),\n \"03_region\": DropdownInput(\n name=\"region\",\n display_name=\"Region\",\n info=\"Region for the new database.\",\n options=[],\n required=True,\n ),\n },\n },\n }\n }\n )\n\n @dataclass\n class NewCollectionInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_collection\",\n \"description\": \"Please allow several seconds for creation to complete.\",\n \"display_name\": \"Create new collection\",\n \"field_order\": [\n \"01_new_collection_name\",\n \"02_embedding_generation_provider\",\n \"03_embedding_generation_model\",\n \"04_dimension\",\n ],\n \"template\": {\n \"01_new_collection_name\": StrInput(\n name=\"new_collection_name\",\n display_name=\"Name\",\n info=\"Name of the new collection to create in Astra DB.\",\n required=True,\n ),\n \"02_embedding_generation_provider\": DropdownInput(\n name=\"embedding_generation_provider\",\n display_name=\"Embedding generation method\",\n info=\"Provider to use for generating embeddings.\",\n helper_text=(\n \"To create collections with more embedding provider options, go to \"\n 'your database in Astra DB'\n ),\n real_time_refresh=True,\n required=True,\n options=[],\n ),\n \"03_embedding_generation_model\": DropdownInput(\n name=\"embedding_generation_model\",\n display_name=\"Embedding model\",\n info=\"Model to use for generating embeddings.\",\n real_time_refresh=True,\n options=[],\n ),\n \"04_dimension\": IntInput(\n name=\"dimension\",\n display_name=\"Dimensions\",\n info=\"Dimensions of the embeddings to generate.\",\n value=None,\n ),\n },\n },\n }\n }\n )\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n real_time_refresh=True,\n input_types=[],\n ),\n StrInput(\n name=\"environment\",\n display_name=\"Environment\",\n info=\"The environment for the Astra DB API Endpoint.\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"database_name\",\n display_name=\"Database\",\n info=\"The Database name for the Astra DB instance.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewDatabaseInput()),\n combobox=True,\n ),\n StrInput(\n name=\"api_endpoint\",\n display_name=\"Astra DB API Endpoint\",\n info=\"The API Endpoint for the Astra DB instance. Supercedes database selection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"collection_name\",\n display_name=\"Collection\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewCollectionInput()),\n combobox=True,\n advanced=True,\n ),\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"embedding_choice\",\n display_name=\"Embedding Model or Astra Vectorize\",\n info=\"Choose an embedding model or use Astra Vectorize.\",\n options=[\"Embedding Model\", \"Astra Vectorize\"],\n value=\"Embedding Model\",\n advanced=True,\n real_time_refresh=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n ),\n *LCVectorStoreComponent.inputs,\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n BoolInput(\n name=\"autodetect_collection\",\n display_name=\"Autodetect Collection\",\n info=\"Boolean flag to determine whether to autodetect the collection.\",\n advanced=True,\n value=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n ]\n\n @classmethod\n def map_cloud_providers(cls):\n # TODO: Programmatically fetch the regions for each cloud provider\n return {\n \"dev\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n # TODO: Check test regions\n \"test\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n \"prod\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-east-2\", \"ap-south-1\", \"eu-west-1\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-east1\"],\n },\n \"Microsoft Azure\": {\n \"id\": \"azure\",\n \"regions\": [\"westus3\"],\n },\n },\n }\n\n @classmethod\n def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):\n try:\n # Get the admin object\n admin = AstraDBAdmin(token=token, environment=environment)\n db_admin = admin.get_database_admin(api_endpoint=api_endpoint)\n\n # Get the list of embedding providers\n embedding_providers = db_admin.find_embedding_providers().as_dict()\n\n vectorize_providers_mapping = {}\n # Map the provider display name to the provider key and models\n for provider_key, provider_data in embedding_providers[\"embeddingProviders\"].items():\n # Get the provider display name and models\n display_name = provider_data[\"displayName\"]\n models = [model[\"name\"] for model in provider_data[\"models\"]]\n\n # Build our mapping\n vectorize_providers_mapping[display_name] = [provider_key, models]\n\n # Sort the resulting dictionary\n return defaultdict(list, dict(sorted(vectorize_providers_mapping.items())))\n except Exception as _: # noqa: BLE001\n return {}\n\n @classmethod\n async def create_database_api(\n cls,\n new_database_name: str,\n cloud_provider: str,\n region: str,\n token: str,\n environment: str | None = None,\n keyspace: str | None = None,\n ):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the environment, set to prod if null like\n my_env = environment or \"prod\"\n\n # Raise a value error if name isn't provided\n if not new_database_name:\n msg = \"Database name is required to create a new database.\"\n raise ValueError(msg)\n\n # Call the create database function\n return await admin_client.async_create_database(\n name=new_database_name,\n cloud_provider=cls.map_cloud_providers()[my_env][cloud_provider][\"id\"],\n region=region,\n keyspace=keyspace,\n wait_until_active=False,\n )\n\n @classmethod\n async def create_collection_api(\n cls,\n new_collection_name: str,\n token: str,\n api_endpoint: str,\n environment: str | None = None,\n keyspace: str | None = None,\n dimension: int | None = None,\n embedding_generation_provider: str | None = None,\n embedding_generation_model: str | None = None,\n ):\n # Create the data API client\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the database object\n database = client.get_async_database(api_endpoint=api_endpoint, token=token)\n\n # Build vectorize options, if needed\n vectorize_options = None\n if not dimension:\n vectorize_options = CollectionVectorServiceOptions(\n provider=cls.get_vectorize_providers(\n token=token, environment=environment, api_endpoint=api_endpoint\n ).get(embedding_generation_provider, [None, []])[0],\n model_name=embedding_generation_model,\n )\n\n # Raise a value error if name isn't provided\n if not new_collection_name:\n msg = \"Collection name is required to create a new collection.\"\n raise ValueError(msg)\n\n # Create the collection\n return await database.create_collection(\n name=new_collection_name,\n keyspace=keyspace,\n dimension=dimension,\n service=vectorize_options,\n )\n\n @classmethod\n def get_database_list_static(cls, token: str, environment: str | None = None):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the list of databases\n db_list = list(admin_client.list_databases())\n\n # Set the environment properly\n env_string = \"\"\n if environment and environment != \"prod\":\n env_string = f\"-{environment}\"\n\n # Generate the api endpoint for each database\n db_info_dict = {}\n for db in db_list:\n try:\n # Get the API endpoint for the database\n api_endpoint = f\"https://{db.info.id}-{db.info.region}.apps.astra{env_string}.datastax.com\"\n\n # Get the number of collections\n try:\n num_collections = len(\n list(\n client.get_database(\n api_endpoint=api_endpoint, token=token, keyspace=db.info.keyspace\n ).list_collection_names(keyspace=db.info.keyspace)\n )\n )\n except Exception: # noqa: BLE001\n if db.status != \"PENDING\":\n continue\n num_collections = 0\n\n # Add the database to the dictionary\n db_info_dict[db.info.name] = {\n \"api_endpoint\": api_endpoint,\n \"collections\": num_collections,\n \"status\": db.status if db.status != \"ACTIVE\" else None,\n \"org_id\": db.org_id if db.org_id else None,\n }\n except Exception: # noqa: BLE001, S110\n pass\n\n return db_info_dict\n\n def get_database_list(self):\n return self.get_database_list_static(token=self.token, environment=self.environment)\n\n @classmethod\n def get_api_endpoint_static(\n cls,\n token: str,\n environment: str | None = None,\n api_endpoint: str | None = None,\n database_name: str | None = None,\n ):\n # If the api_endpoint is set, return it\n if api_endpoint:\n return api_endpoint\n\n # Check if the database_name is like a url\n if database_name and database_name.startswith(\"https://\"):\n return database_name\n\n # If the database is not set, nothing we can do.\n if not database_name:\n return None\n\n # Grab the database object\n db = cls.get_database_list_static(token=token, environment=environment).get(database_name)\n if not db:\n return None\n\n # Otherwise, get the URL from the database list\n return db.get(\"api_endpoint\")\n\n def get_api_endpoint(self):\n return self.get_api_endpoint_static(\n token=self.token,\n environment=self.environment,\n api_endpoint=self.api_endpoint,\n database_name=self.database_name,\n )\n\n @classmethod\n def get_database_id_static(cls, api_endpoint: str) -> str | None:\n # Pattern matches standard UUID format: 8-4-4-4-12 hexadecimal characters\n uuid_pattern = r\"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\"\n match = re.search(uuid_pattern, api_endpoint)\n\n return match.group(0) if match else None\n\n def get_database_id(self):\n return self.get_database_id_static(api_endpoint=self.get_api_endpoint())\n\n def get_keyspace(self):\n keyspace = self.keyspace\n\n if keyspace:\n return keyspace.strip()\n\n return None\n\n def get_database_object(self, api_endpoint: str | None = None):\n try:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n return client.get_database(\n api_endpoint=api_endpoint or self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n except Exception as e:\n msg = f\"Error fetching database object: {e}\"\n raise ValueError(msg) from e\n\n def collection_data(self, collection_name: str, database: Database | None = None):\n try:\n if not database:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n database = client.get_database(\n api_endpoint=self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n\n collection = database.get_collection(collection_name, keyspace=self.get_keyspace())\n\n return collection.estimated_document_count()\n except Exception as e: # noqa: BLE001\n self.log(f\"Error checking collection data: {e}\")\n\n return None\n\n def _initialize_database_options(self):\n try:\n return [\n {\n \"name\": name,\n \"status\": info[\"status\"],\n \"collections\": info[\"collections\"],\n \"api_endpoint\": info[\"api_endpoint\"],\n \"org_id\": info[\"org_id\"],\n }\n for name, info in self.get_database_list().items()\n ]\n except Exception as e:\n msg = f\"Error fetching database options: {e}\"\n raise ValueError(msg) from e\n\n @classmethod\n def get_provider_icon(cls, collection: CollectionDescriptor | None = None, provider_name: str | None = None) -> str:\n # Get the provider name from the collection\n provider_name = provider_name or (\n collection.options.vector.service.provider\n if collection and collection.options and collection.options.vector and collection.options.vector.service\n else None\n )\n\n # If there is no provider, use the vector store icon\n if not provider_name or provider_name == \"Bring your own\":\n return \"vectorstores\"\n\n # Map provider casings\n case_map = {\n \"nvidia\": \"NVIDIA\",\n \"openai\": \"OpenAI\",\n \"amazon bedrock\": \"AmazonBedrockEmbeddings\",\n \"azure openai\": \"AzureOpenAiEmbeddings\",\n \"cohere\": \"Cohere\",\n \"jina ai\": \"JinaAI\",\n \"mistral ai\": \"MistralAI\",\n \"upstage\": \"Upstage\",\n \"voyage ai\": \"VoyageAI\",\n }\n\n # Adjust the casing on some like nvidia\n return case_map[provider_name.lower()] if provider_name.lower() in case_map else provider_name.title()\n\n def _initialize_collection_options(self, api_endpoint: str | None = None):\n # Nothing to generate if we don't have an API endpoint yet\n api_endpoint = api_endpoint or self.get_api_endpoint()\n if not api_endpoint:\n return []\n\n # Retrieve the database object\n database = self.get_database_object(api_endpoint=api_endpoint)\n\n # Get the list of collections\n collection_list = list(database.list_collections(keyspace=self.get_keyspace()))\n\n # Return the list of collections and metadata associated\n return [\n {\n \"name\": col.name,\n \"records\": self.collection_data(collection_name=col.name, database=database),\n \"provider\": (\n col.options.vector.service.provider if col.options.vector and col.options.vector.service else None\n ),\n \"icon\": self.get_provider_icon(collection=col),\n \"model\": (\n col.options.vector.service.model_name if col.options.vector and col.options.vector.service else None\n ),\n }\n for col in collection_list\n ]\n\n def reset_provider_options(self, build_config: dict) -> dict:\n \"\"\"Reset provider options and related configurations in the build_config dictionary.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get vectorize providers\n vectorize_providers_api = self.get_vectorize_providers(\n token=self.token,\n environment=self.environment,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n )\n\n # Create a new dictionary with \"Bring your own\" first\n vectorize_providers: dict[str, list[list[str]]] = {\"Bring your own\": [[], []]}\n\n # Add the remaining items (only Nvidia) from the original dictionary\n vectorize_providers.update(\n {\n k: v\n for k, v in vectorize_providers_api.items()\n if k.lower() in [\"nvidia\"] # TODO: Eventually support more\n }\n )\n\n # Set provider options\n provider_field = \"02_embedding_generation_provider\"\n template[provider_field][\"options\"] = list(vectorize_providers.keys())\n\n # Add metadata for each provider option\n template[provider_field][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=provider)} for provider in template[provider_field][\"options\"]\n ]\n\n # Get selected embedding provider\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure embedding model field\n model_field = \"03_embedding_generation_model\"\n template[model_field].update(\n {\n \"options\": vectorize_providers.get(embedding_provider, [[], []])[1],\n \"placeholder\": \"Bring your own\" if is_bring_your_own else None,\n \"readonly\": is_bring_your_own,\n \"required\": not is_bring_your_own,\n \"value\": None,\n }\n )\n\n # If this is a bring your own, set dimensions to 0\n return self.reset_dimension_field(build_config)\n\n def reset_dimension_field(self, build_config: dict) -> dict:\n \"\"\"Reset dimension field options based on provided configuration.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get selected embedding model\n provider_field = \"02_embedding_generation_provider\"\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure dimension field\n dimension_field = \"04_dimension\"\n dimension_value = 1024 if not is_bring_your_own else None # TODO: Dynamically figure this out\n template[dimension_field].update(\n {\n \"placeholder\": dimension_value,\n \"value\": dimension_value,\n \"readonly\": not is_bring_your_own,\n \"required\": is_bring_your_own,\n }\n )\n\n return build_config\n\n def reset_collection_list(self, build_config: dict) -> dict:\n \"\"\"Reset collection list options based on provided configuration.\"\"\"\n # Get collection options\n collection_options = self._initialize_collection_options(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n\n # Update collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update(\n {\n \"options\": [col[\"name\"] for col in collection_options],\n \"options_metadata\": [{k: v for k, v in col.items() if k != \"name\"} for col in collection_options],\n }\n )\n\n # Reset selected collection if not in options\n if collection_config[\"value\"] not in collection_config[\"options\"]:\n collection_config[\"value\"] = \"\"\n\n # Set advanced status based on database selection\n collection_config[\"advanced\"] = not build_config[\"database_name\"][\"value\"]\n\n return build_config\n\n def reset_database_list(self, build_config: dict) -> dict:\n \"\"\"Reset database list options and related configurations.\"\"\"\n # Get database options\n database_options = self._initialize_database_options()\n\n # Update cloud provider options\n env = self.environment or \"prod\"\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_cloud_provider\"][\"options\"] = list(self.map_cloud_providers()[env].keys())\n\n # Update database configuration\n database_config = build_config[\"database_name\"]\n database_config.update(\n {\n \"options\": [db[\"name\"] for db in database_options],\n \"options_metadata\": [{k: v for k, v in db.items() if k != \"name\"} for db in database_options],\n }\n )\n\n # Reset selections if value not in options\n if database_config[\"value\"] not in database_config[\"options\"]:\n database_config[\"value\"] = \"\"\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n build_config[\"collection_name\"][\"advanced\"] = True\n\n # Set advanced status based on token presence\n database_config[\"advanced\"] = not build_config[\"token\"][\"value\"]\n\n return build_config\n\n def reset_build_config(self, build_config: dict) -> dict:\n \"\"\"Reset all build configuration options to default empty state.\"\"\"\n # Reset database configuration\n database_config = build_config[\"database_name\"]\n database_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n\n # Reset collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n\n return build_config\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build configuration based on field name and value.\"\"\"\n # Early return if no token provided\n if not self.token:\n return self.reset_build_config(build_config)\n\n # Database creation callback\n if field_name == \"database_name\" and isinstance(field_value, dict):\n if \"01_new_database_name\" in field_value:\n await self._create_new_database(build_config, field_value)\n return self.reset_collection_list(build_config)\n return self._update_cloud_regions(build_config, field_value)\n\n # Collection creation callback\n if field_name == \"collection_name\" and isinstance(field_value, dict):\n # Case 1: New collection creation\n if \"01_new_collection_name\" in field_value:\n await self._create_new_collection(build_config, field_value)\n return build_config\n\n # Case 2: Update embedding provider options\n if \"02_embedding_generation_provider\" in field_value:\n return self.reset_provider_options(build_config)\n\n # Case 3: Update dimension field\n if \"03_embedding_generation_model\" in field_value:\n return self.reset_dimension_field(build_config)\n\n # Initial execution or token/environment change\n first_run = field_name == \"collection_name\" and not field_value and not build_config[\"database_name\"][\"options\"]\n if first_run or field_name in {\"token\", \"environment\"}:\n return self.reset_database_list(build_config)\n\n # Database selection change\n if field_name == \"database_name\" and not isinstance(field_value, dict):\n return self._handle_database_selection(build_config, field_value)\n\n # Collection selection change\n if field_name == \"collection_name\" and not isinstance(field_value, dict):\n return self._handle_collection_selection(build_config, field_value)\n\n return build_config\n\n async def _create_new_database(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new database and update build config options.\"\"\"\n try:\n await self.create_database_api(\n new_database_name=field_value[\"01_new_database_name\"],\n token=self.token,\n keyspace=self.get_keyspace(),\n environment=self.environment,\n cloud_provider=field_value[\"02_cloud_provider\"],\n region=field_value[\"03_region\"],\n )\n except Exception as e:\n msg = f\"Error creating database: {e}\"\n raise ValueError(msg) from e\n\n build_config[\"database_name\"][\"options\"].append(field_value[\"01_new_database_name\"])\n build_config[\"database_name\"][\"options_metadata\"].append(\n {\n \"status\": \"PENDING\",\n \"collections\": 0,\n \"api_endpoint\": None,\n \"org_id\": None,\n }\n )\n\n def _update_cloud_regions(self, build_config: dict, field_value: dict) -> dict:\n \"\"\"Update cloud provider regions in build config.\"\"\"\n env = self.environment or \"prod\"\n cloud_provider = field_value[\"02_cloud_provider\"]\n\n # Update the region options based on the selected cloud provider\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"03_region\"][\"options\"] = self.map_cloud_providers()[env][cloud_provider][\"regions\"]\n\n # Reset the the 03_region value if it's not in the new options\n if template[\"03_region\"][\"value\"] not in template[\"03_region\"][\"options\"]:\n template[\"03_region\"][\"value\"] = None\n\n return build_config\n\n async def _create_new_collection(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new collection and update build config options.\"\"\"\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n try:\n await self.create_collection_api(\n new_collection_name=field_value[\"01_new_collection_name\"],\n token=self.token,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n environment=self.environment,\n keyspace=self.get_keyspace(),\n dimension=field_value.get(\"04_dimension\") if embedding_provider == \"Bring your own\" else None,\n embedding_generation_provider=embedding_provider,\n embedding_generation_model=field_value.get(\"03_embedding_generation_model\"),\n )\n except Exception as e:\n msg = f\"Error creating collection: {e}\"\n raise ValueError(msg) from e\n\n provider = embedding_provider.lower() if embedding_provider and embedding_provider != \"Bring your own\" else None\n build_config[\"collection_name\"].update(\n {\n \"value\": field_value[\"01_new_collection_name\"],\n \"options\": build_config[\"collection_name\"][\"options\"] + [field_value[\"01_new_collection_name\"]],\n }\n )\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": provider,\n \"icon\": self.get_provider_icon(provider_name=embedding_provider),\n \"model\": field_value.get(\"03_embedding_generation_model\"),\n }\n )\n\n def _handle_database_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle database selection and update related configurations.\"\"\"\n build_config = self.reset_database_list(build_config)\n\n # Reset collection list if database selection changes\n if field_value not in build_config[\"database_name\"][\"options\"]:\n build_config[\"database_name\"][\"value\"] = \"\"\n return build_config\n\n # Get the api endpoint for the selected database\n index = build_config[\"database_name\"][\"options\"].index(field_value)\n build_config[\"api_endpoint\"][\"value\"] = build_config[\"database_name\"][\"options_metadata\"][index][\"api_endpoint\"]\n\n # Get the org_id for the selected database\n org_id = build_config[\"database_name\"][\"options_metadata\"][index][\"org_id\"]\n if not org_id:\n return build_config\n\n # Get the database id for the selected database\n db_id = self.get_database_id_static(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n keyspace = self.get_keyspace() or \"default_keyspace\"\n\n # Update the helper text for the embedding provider field\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_embedding_generation_provider\"][\"helper_text\"] = (\n \"To create collections with more embedding provider options, go to \"\n f''\n \"your database in Astra DB.\"\n )\n\n # Reset provider options\n build_config = self.reset_provider_options(build_config)\n\n return self.reset_collection_list(build_config)\n\n def _handle_collection_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle collection selection and update embedding options.\"\"\"\n build_config[\"autodetect_collection\"][\"value\"] = True\n build_config = self.reset_collection_list(build_config)\n\n if field_value and field_value not in build_config[\"collection_name\"][\"options\"]:\n build_config[\"collection_name\"][\"options\"].append(field_value)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": None,\n \"icon\": \"vectorstores\",\n \"model\": None,\n }\n )\n build_config[\"autodetect_collection\"][\"value\"] = False\n\n if not field_value:\n return build_config\n\n index = build_config[\"collection_name\"][\"options\"].index(field_value)\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n return build_config\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = (\n {\"embedding\": self.embedding_model}\n if self.embedding_model and self.embedding_choice == \"Embedding Model\"\n else {}\n )\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except Exception as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n try:\n search_args = self._build_search_args()\n except Exception as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except Exception as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" }, "collection_name": { "_input_type": "DropdownInput", @@ -3831,9 +3832,10 @@ "dynamic": false, "info": "", "input_types": [ - "Data" + "Data", + "DataFrame" ], - "list": false, + "list": true, "list_add_label": "Add More", "name": "ingest_data", "placeholder": "", @@ -4189,7 +4191,7 @@ "show": true, "title_case": false, "type": "code", - "value": "import re\nfrom collections import defaultdict\nfrom dataclasses import asdict, dataclass, field\n\nfrom astrapy import AstraDBAdmin, DataAPIClient, Database\nfrom astrapy.info import CollectionDescriptor\nfrom langchain_astradb import AstraDBVectorStore, CollectionVectorServiceOptions\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import FloatInput, NestedDictInput\nfrom langflow.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\nfrom langflow.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.datastax.com/en/langflow/astra-components.html\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n _cached_vector_store: AstraDBVectorStore | None = None\n\n @dataclass\n class NewDatabaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_database\",\n \"description\": \"Please allow several minutes for creation to complete.\",\n \"display_name\": \"Create new database\",\n \"field_order\": [\"01_new_database_name\", \"02_cloud_provider\", \"03_region\"],\n \"template\": {\n \"01_new_database_name\": StrInput(\n name=\"new_database_name\",\n display_name=\"Name\",\n info=\"Name of the new database to create in Astra DB.\",\n required=True,\n ),\n \"02_cloud_provider\": DropdownInput(\n name=\"cloud_provider\",\n display_name=\"Cloud provider\",\n info=\"Cloud provider for the new database.\",\n options=[],\n required=True,\n real_time_refresh=True,\n ),\n \"03_region\": DropdownInput(\n name=\"region\",\n display_name=\"Region\",\n info=\"Region for the new database.\",\n options=[],\n required=True,\n ),\n },\n },\n }\n }\n )\n\n @dataclass\n class NewCollectionInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_collection\",\n \"description\": \"Please allow several seconds for creation to complete.\",\n \"display_name\": \"Create new collection\",\n \"field_order\": [\n \"01_new_collection_name\",\n \"02_embedding_generation_provider\",\n \"03_embedding_generation_model\",\n \"04_dimension\",\n ],\n \"template\": {\n \"01_new_collection_name\": StrInput(\n name=\"new_collection_name\",\n display_name=\"Name\",\n info=\"Name of the new collection to create in Astra DB.\",\n required=True,\n ),\n \"02_embedding_generation_provider\": DropdownInput(\n name=\"embedding_generation_provider\",\n display_name=\"Embedding generation method\",\n info=\"Provider to use for generating embeddings.\",\n helper_text=(\n \"To create collections with more embedding provider options, go to \"\n 'your database in Astra DB'\n ),\n real_time_refresh=True,\n required=True,\n options=[],\n ),\n \"03_embedding_generation_model\": DropdownInput(\n name=\"embedding_generation_model\",\n display_name=\"Embedding model\",\n info=\"Model to use for generating embeddings.\",\n real_time_refresh=True,\n options=[],\n ),\n \"04_dimension\": IntInput(\n name=\"dimension\",\n display_name=\"Dimensions\",\n info=\"Dimensions of the embeddings to generate.\",\n value=None,\n ),\n },\n },\n }\n }\n )\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n real_time_refresh=True,\n input_types=[],\n ),\n StrInput(\n name=\"environment\",\n display_name=\"Environment\",\n info=\"The environment for the Astra DB API Endpoint.\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"database_name\",\n display_name=\"Database\",\n info=\"The Database name for the Astra DB instance.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewDatabaseInput()),\n combobox=True,\n ),\n StrInput(\n name=\"api_endpoint\",\n display_name=\"Astra DB API Endpoint\",\n info=\"The API Endpoint for the Astra DB instance. Supercedes database selection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"collection_name\",\n display_name=\"Collection\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewCollectionInput()),\n combobox=True,\n advanced=True,\n ),\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"embedding_choice\",\n display_name=\"Embedding Model or Astra Vectorize\",\n info=\"Choose an embedding model or use Astra Vectorize.\",\n options=[\"Embedding Model\", \"Astra Vectorize\"],\n value=\"Embedding Model\",\n advanced=True,\n real_time_refresh=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n ),\n *LCVectorStoreComponent.inputs,\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n BoolInput(\n name=\"autodetect_collection\",\n display_name=\"Autodetect Collection\",\n info=\"Boolean flag to determine whether to autodetect the collection.\",\n advanced=True,\n value=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n ]\n\n @classmethod\n def map_cloud_providers(cls):\n # TODO: Programmatically fetch the regions for each cloud provider\n return {\n \"dev\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n # TODO: Check test regions\n \"test\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n \"prod\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-east-2\", \"ap-south-1\", \"eu-west-1\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-east1\"],\n },\n \"Microsoft Azure\": {\n \"id\": \"azure\",\n \"regions\": [\"westus3\"],\n },\n },\n }\n\n @classmethod\n def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):\n try:\n # Get the admin object\n admin = AstraDBAdmin(token=token, environment=environment)\n db_admin = admin.get_database_admin(api_endpoint=api_endpoint)\n\n # Get the list of embedding providers\n embedding_providers = db_admin.find_embedding_providers().as_dict()\n\n vectorize_providers_mapping = {}\n # Map the provider display name to the provider key and models\n for provider_key, provider_data in embedding_providers[\"embeddingProviders\"].items():\n # Get the provider display name and models\n display_name = provider_data[\"displayName\"]\n models = [model[\"name\"] for model in provider_data[\"models\"]]\n\n # Build our mapping\n vectorize_providers_mapping[display_name] = [provider_key, models]\n\n # Sort the resulting dictionary\n return defaultdict(list, dict(sorted(vectorize_providers_mapping.items())))\n except Exception as _: # noqa: BLE001\n return {}\n\n @classmethod\n async def create_database_api(\n cls,\n new_database_name: str,\n cloud_provider: str,\n region: str,\n token: str,\n environment: str | None = None,\n keyspace: str | None = None,\n ):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the environment, set to prod if null like\n my_env = environment or \"prod\"\n\n # Raise a value error if name isn't provided\n if not new_database_name:\n msg = \"Database name is required to create a new database.\"\n raise ValueError(msg)\n\n # Call the create database function\n return await admin_client.async_create_database(\n name=new_database_name,\n cloud_provider=cls.map_cloud_providers()[my_env][cloud_provider][\"id\"],\n region=region,\n keyspace=keyspace,\n wait_until_active=False,\n )\n\n @classmethod\n async def create_collection_api(\n cls,\n new_collection_name: str,\n token: str,\n api_endpoint: str,\n environment: str | None = None,\n keyspace: str | None = None,\n dimension: int | None = None,\n embedding_generation_provider: str | None = None,\n embedding_generation_model: str | None = None,\n ):\n # Create the data API client\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the database object\n database = client.get_async_database(api_endpoint=api_endpoint, token=token)\n\n # Build vectorize options, if needed\n vectorize_options = None\n if not dimension:\n vectorize_options = CollectionVectorServiceOptions(\n provider=cls.get_vectorize_providers(\n token=token, environment=environment, api_endpoint=api_endpoint\n ).get(embedding_generation_provider, [None, []])[0],\n model_name=embedding_generation_model,\n )\n\n # Raise a value error if name isn't provided\n if not new_collection_name:\n msg = \"Collection name is required to create a new collection.\"\n raise ValueError(msg)\n\n # Create the collection\n return await database.create_collection(\n name=new_collection_name,\n keyspace=keyspace,\n dimension=dimension,\n service=vectorize_options,\n )\n\n @classmethod\n def get_database_list_static(cls, token: str, environment: str | None = None):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the list of databases\n db_list = list(admin_client.list_databases())\n\n # Set the environment properly\n env_string = \"\"\n if environment and environment != \"prod\":\n env_string = f\"-{environment}\"\n\n # Generate the api endpoint for each database\n db_info_dict = {}\n for db in db_list:\n try:\n # Get the API endpoint for the database\n api_endpoint = f\"https://{db.info.id}-{db.info.region}.apps.astra{env_string}.datastax.com\"\n\n # Get the number of collections\n try:\n num_collections = len(\n list(\n client.get_database(\n api_endpoint=api_endpoint, token=token, keyspace=db.info.keyspace\n ).list_collection_names(keyspace=db.info.keyspace)\n )\n )\n except Exception: # noqa: BLE001\n if db.status != \"PENDING\":\n continue\n num_collections = 0\n\n # Add the database to the dictionary\n db_info_dict[db.info.name] = {\n \"api_endpoint\": api_endpoint,\n \"collections\": num_collections,\n \"status\": db.status if db.status != \"ACTIVE\" else None,\n \"org_id\": db.org_id if db.org_id else None,\n }\n except Exception: # noqa: BLE001, S110\n pass\n\n return db_info_dict\n\n def get_database_list(self):\n return self.get_database_list_static(token=self.token, environment=self.environment)\n\n @classmethod\n def get_api_endpoint_static(\n cls,\n token: str,\n environment: str | None = None,\n api_endpoint: str | None = None,\n database_name: str | None = None,\n ):\n # If the api_endpoint is set, return it\n if api_endpoint:\n return api_endpoint\n\n # Check if the database_name is like a url\n if database_name and database_name.startswith(\"https://\"):\n return database_name\n\n # If the database is not set, nothing we can do.\n if not database_name:\n return None\n\n # Grab the database object\n db = cls.get_database_list_static(token=token, environment=environment).get(database_name)\n if not db:\n return None\n\n # Otherwise, get the URL from the database list\n return db.get(\"api_endpoint\")\n\n def get_api_endpoint(self):\n return self.get_api_endpoint_static(\n token=self.token,\n environment=self.environment,\n api_endpoint=self.api_endpoint,\n database_name=self.database_name,\n )\n\n @classmethod\n def get_database_id_static(cls, api_endpoint: str) -> str | None:\n # Pattern matches standard UUID format: 8-4-4-4-12 hexadecimal characters\n uuid_pattern = r\"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\"\n match = re.search(uuid_pattern, api_endpoint)\n\n return match.group(0) if match else None\n\n def get_database_id(self):\n return self.get_database_id_static(api_endpoint=self.get_api_endpoint())\n\n def get_keyspace(self):\n keyspace = self.keyspace\n\n if keyspace:\n return keyspace.strip()\n\n return None\n\n def get_database_object(self, api_endpoint: str | None = None):\n try:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n return client.get_database(\n api_endpoint=api_endpoint or self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n except Exception as e:\n msg = f\"Error fetching database object: {e}\"\n raise ValueError(msg) from e\n\n def collection_data(self, collection_name: str, database: Database | None = None):\n try:\n if not database:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n database = client.get_database(\n api_endpoint=self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n\n collection = database.get_collection(collection_name, keyspace=self.get_keyspace())\n\n return collection.estimated_document_count()\n except Exception as e: # noqa: BLE001\n self.log(f\"Error checking collection data: {e}\")\n\n return None\n\n def _initialize_database_options(self):\n try:\n return [\n {\n \"name\": name,\n \"status\": info[\"status\"],\n \"collections\": info[\"collections\"],\n \"api_endpoint\": info[\"api_endpoint\"],\n \"org_id\": info[\"org_id\"],\n }\n for name, info in self.get_database_list().items()\n ]\n except Exception as e:\n msg = f\"Error fetching database options: {e}\"\n raise ValueError(msg) from e\n\n @classmethod\n def get_provider_icon(cls, collection: CollectionDescriptor | None = None, provider_name: str | None = None) -> str:\n # Get the provider name from the collection\n provider_name = provider_name or (\n collection.options.vector.service.provider\n if collection and collection.options and collection.options.vector and collection.options.vector.service\n else None\n )\n\n # If there is no provider, use the vector store icon\n if not provider_name or provider_name == \"Bring your own\":\n return \"vectorstores\"\n\n # Map provider casings\n case_map = {\n \"nvidia\": \"NVIDIA\",\n \"openai\": \"OpenAI\",\n \"amazon bedrock\": \"AmazonBedrockEmbeddings\",\n \"azure openai\": \"AzureOpenAiEmbeddings\",\n \"cohere\": \"Cohere\",\n \"jina ai\": \"JinaAI\",\n \"mistral ai\": \"MistralAI\",\n \"upstage\": \"Upstage\",\n \"voyage ai\": \"VoyageAI\",\n }\n\n # Adjust the casing on some like nvidia\n return case_map[provider_name.lower()] if provider_name.lower() in case_map else provider_name.title()\n\n def _initialize_collection_options(self, api_endpoint: str | None = None):\n # Nothing to generate if we don't have an API endpoint yet\n api_endpoint = api_endpoint or self.get_api_endpoint()\n if not api_endpoint:\n return []\n\n # Retrieve the database object\n database = self.get_database_object(api_endpoint=api_endpoint)\n\n # Get the list of collections\n collection_list = list(database.list_collections(keyspace=self.get_keyspace()))\n\n # Return the list of collections and metadata associated\n return [\n {\n \"name\": col.name,\n \"records\": self.collection_data(collection_name=col.name, database=database),\n \"provider\": (\n col.options.vector.service.provider if col.options.vector and col.options.vector.service else None\n ),\n \"icon\": self.get_provider_icon(collection=col),\n \"model\": (\n col.options.vector.service.model_name if col.options.vector and col.options.vector.service else None\n ),\n }\n for col in collection_list\n ]\n\n def reset_provider_options(self, build_config: dict) -> dict:\n \"\"\"Reset provider options and related configurations in the build_config dictionary.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get vectorize providers\n vectorize_providers_api = self.get_vectorize_providers(\n token=self.token,\n environment=self.environment,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n )\n\n # Create a new dictionary with \"Bring your own\" first\n vectorize_providers: dict[str, list[list[str]]] = {\"Bring your own\": [[], []]}\n\n # Add the remaining items (only Nvidia) from the original dictionary\n vectorize_providers.update(\n {\n k: v\n for k, v in vectorize_providers_api.items()\n if k.lower() in [\"nvidia\"] # TODO: Eventually support more\n }\n )\n\n # Set provider options\n provider_field = \"02_embedding_generation_provider\"\n template[provider_field][\"options\"] = list(vectorize_providers.keys())\n\n # Add metadata for each provider option\n template[provider_field][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=provider)} for provider in template[provider_field][\"options\"]\n ]\n\n # Get selected embedding provider\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure embedding model field\n model_field = \"03_embedding_generation_model\"\n template[model_field].update(\n {\n \"options\": vectorize_providers.get(embedding_provider, [[], []])[1],\n \"placeholder\": \"Bring your own\" if is_bring_your_own else None,\n \"readonly\": is_bring_your_own,\n \"required\": not is_bring_your_own,\n \"value\": None,\n }\n )\n\n # If this is a bring your own, set dimensions to 0\n return self.reset_dimension_field(build_config)\n\n def reset_dimension_field(self, build_config: dict) -> dict:\n \"\"\"Reset dimension field options based on provided configuration.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get selected embedding model\n provider_field = \"02_embedding_generation_provider\"\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure dimension field\n dimension_field = \"04_dimension\"\n dimension_value = 1024 if not is_bring_your_own else None # TODO: Dynamically figure this out\n template[dimension_field].update(\n {\n \"placeholder\": dimension_value,\n \"value\": dimension_value,\n \"readonly\": not is_bring_your_own,\n \"required\": is_bring_your_own,\n }\n )\n\n return build_config\n\n def reset_collection_list(self, build_config: dict) -> dict:\n \"\"\"Reset collection list options based on provided configuration.\"\"\"\n # Get collection options\n collection_options = self._initialize_collection_options(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n\n # Update collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update(\n {\n \"options\": [col[\"name\"] for col in collection_options],\n \"options_metadata\": [{k: v for k, v in col.items() if k != \"name\"} for col in collection_options],\n }\n )\n\n # Reset selected collection if not in options\n if collection_config[\"value\"] not in collection_config[\"options\"]:\n collection_config[\"value\"] = \"\"\n\n # Set advanced status based on database selection\n collection_config[\"advanced\"] = not build_config[\"database_name\"][\"value\"]\n\n return build_config\n\n def reset_database_list(self, build_config: dict) -> dict:\n \"\"\"Reset database list options and related configurations.\"\"\"\n # Get database options\n database_options = self._initialize_database_options()\n\n # Update cloud provider options\n env = self.environment or \"prod\"\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_cloud_provider\"][\"options\"] = list(self.map_cloud_providers()[env].keys())\n\n # Update database configuration\n database_config = build_config[\"database_name\"]\n database_config.update(\n {\n \"options\": [db[\"name\"] for db in database_options],\n \"options_metadata\": [{k: v for k, v in db.items() if k != \"name\"} for db in database_options],\n }\n )\n\n # Reset selections if value not in options\n if database_config[\"value\"] not in database_config[\"options\"]:\n database_config[\"value\"] = \"\"\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n build_config[\"collection_name\"][\"advanced\"] = True\n\n # Set advanced status based on token presence\n database_config[\"advanced\"] = not build_config[\"token\"][\"value\"]\n\n return build_config\n\n def reset_build_config(self, build_config: dict) -> dict:\n \"\"\"Reset all build configuration options to default empty state.\"\"\"\n # Reset database configuration\n database_config = build_config[\"database_name\"]\n database_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n\n # Reset collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n\n return build_config\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build configuration based on field name and value.\"\"\"\n # Early return if no token provided\n if not self.token:\n return self.reset_build_config(build_config)\n\n # Database creation callback\n if field_name == \"database_name\" and isinstance(field_value, dict):\n if \"01_new_database_name\" in field_value:\n await self._create_new_database(build_config, field_value)\n return self.reset_collection_list(build_config)\n return self._update_cloud_regions(build_config, field_value)\n\n # Collection creation callback\n if field_name == \"collection_name\" and isinstance(field_value, dict):\n # Case 1: New collection creation\n if \"01_new_collection_name\" in field_value:\n await self._create_new_collection(build_config, field_value)\n return build_config\n\n # Case 2: Update embedding provider options\n if \"02_embedding_generation_provider\" in field_value:\n return self.reset_provider_options(build_config)\n\n # Case 3: Update dimension field\n if \"03_embedding_generation_model\" in field_value:\n return self.reset_dimension_field(build_config)\n\n # Initial execution or token/environment change\n first_run = field_name == \"collection_name\" and not field_value and not build_config[\"database_name\"][\"options\"]\n if first_run or field_name in {\"token\", \"environment\"}:\n return self.reset_database_list(build_config)\n\n # Database selection change\n if field_name == \"database_name\" and not isinstance(field_value, dict):\n return self._handle_database_selection(build_config, field_value)\n\n # Collection selection change\n if field_name == \"collection_name\" and not isinstance(field_value, dict):\n return self._handle_collection_selection(build_config, field_value)\n\n return build_config\n\n async def _create_new_database(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new database and update build config options.\"\"\"\n try:\n await self.create_database_api(\n new_database_name=field_value[\"01_new_database_name\"],\n token=self.token,\n keyspace=self.get_keyspace(),\n environment=self.environment,\n cloud_provider=field_value[\"02_cloud_provider\"],\n region=field_value[\"03_region\"],\n )\n except Exception as e:\n msg = f\"Error creating database: {e}\"\n raise ValueError(msg) from e\n\n build_config[\"database_name\"][\"options\"].append(field_value[\"01_new_database_name\"])\n build_config[\"database_name\"][\"options_metadata\"].append(\n {\n \"status\": \"PENDING\",\n \"collections\": 0,\n \"api_endpoint\": None,\n \"org_id\": None,\n }\n )\n\n def _update_cloud_regions(self, build_config: dict, field_value: dict) -> dict:\n \"\"\"Update cloud provider regions in build config.\"\"\"\n env = self.environment or \"prod\"\n cloud_provider = field_value[\"02_cloud_provider\"]\n\n # Update the region options based on the selected cloud provider\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"03_region\"][\"options\"] = self.map_cloud_providers()[env][cloud_provider][\"regions\"]\n\n # Reset the the 03_region value if it's not in the new options\n if template[\"03_region\"][\"value\"] not in template[\"03_region\"][\"options\"]:\n template[\"03_region\"][\"value\"] = None\n\n return build_config\n\n async def _create_new_collection(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new collection and update build config options.\"\"\"\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n try:\n await self.create_collection_api(\n new_collection_name=field_value[\"01_new_collection_name\"],\n token=self.token,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n environment=self.environment,\n keyspace=self.get_keyspace(),\n dimension=field_value.get(\"04_dimension\") if embedding_provider == \"Bring your own\" else None,\n embedding_generation_provider=embedding_provider,\n embedding_generation_model=field_value.get(\"03_embedding_generation_model\"),\n )\n except Exception as e:\n msg = f\"Error creating collection: {e}\"\n raise ValueError(msg) from e\n\n provider = embedding_provider.lower() if embedding_provider and embedding_provider != \"Bring your own\" else None\n build_config[\"collection_name\"].update(\n {\n \"value\": field_value[\"01_new_collection_name\"],\n \"options\": build_config[\"collection_name\"][\"options\"] + [field_value[\"01_new_collection_name\"]],\n }\n )\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": provider,\n \"icon\": self.get_provider_icon(provider_name=embedding_provider),\n \"model\": field_value.get(\"03_embedding_generation_model\"),\n }\n )\n\n def _handle_database_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle database selection and update related configurations.\"\"\"\n build_config = self.reset_database_list(build_config)\n\n # Reset collection list if database selection changes\n if field_value not in build_config[\"database_name\"][\"options\"]:\n build_config[\"database_name\"][\"value\"] = \"\"\n return build_config\n\n # Get the api endpoint for the selected database\n index = build_config[\"database_name\"][\"options\"].index(field_value)\n build_config[\"api_endpoint\"][\"value\"] = build_config[\"database_name\"][\"options_metadata\"][index][\"api_endpoint\"]\n\n # Get the org_id for the selected database\n org_id = build_config[\"database_name\"][\"options_metadata\"][index][\"org_id\"]\n if not org_id:\n return build_config\n\n # Get the database id for the selected database\n db_id = self.get_database_id_static(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n keyspace = self.get_keyspace() or \"default_keyspace\"\n\n # Update the helper text for the embedding provider field\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_embedding_generation_provider\"][\"helper_text\"] = (\n \"To create collections with more embedding provider options, go to \"\n f''\n \"your database in Astra DB.\"\n )\n\n # Reset provider options\n build_config = self.reset_provider_options(build_config)\n\n return self.reset_collection_list(build_config)\n\n def _handle_collection_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle collection selection and update embedding options.\"\"\"\n build_config[\"autodetect_collection\"][\"value\"] = True\n build_config = self.reset_collection_list(build_config)\n\n if field_value and field_value not in build_config[\"collection_name\"][\"options\"]:\n build_config[\"collection_name\"][\"options\"].append(field_value)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": None,\n \"icon\": \"vectorstores\",\n \"model\": None,\n }\n )\n build_config[\"autodetect_collection\"][\"value\"] = False\n\n if not field_value:\n return build_config\n\n index = build_config[\"collection_name\"][\"options\"].index(field_value)\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n return build_config\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = (\n {\"embedding\": self.embedding_model}\n if self.embedding_model and self.embedding_choice == \"Embedding Model\"\n else {}\n )\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except Exception as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n try:\n search_args = self._build_search_args()\n except Exception as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except Exception as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" + "value": "import re\nfrom collections import defaultdict\nfrom dataclasses import asdict, dataclass, field\n\nfrom astrapy import AstraDBAdmin, DataAPIClient, Database\nfrom astrapy.info import CollectionDescriptor\nfrom langchain_astradb import AstraDBVectorStore, CollectionVectorServiceOptions\n\nfrom langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store\nfrom langflow.base.vectorstores.vector_store_connection_decorator import vector_store_connection\nfrom langflow.helpers import docs_to_data\nfrom langflow.inputs import FloatInput, NestedDictInput\nfrom langflow.io import (\n BoolInput,\n DropdownInput,\n HandleInput,\n IntInput,\n SecretStrInput,\n StrInput,\n)\nfrom langflow.schema import Data\nfrom langflow.utils.version import get_version_info\n\n\n@vector_store_connection\nclass AstraDBVectorStoreComponent(LCVectorStoreComponent):\n display_name: str = \"Astra DB\"\n description: str = \"Ingest and search documents in Astra DB\"\n documentation: str = \"https://docs.datastax.com/en/langflow/astra-components.html\"\n name = \"AstraDB\"\n icon: str = \"AstraDB\"\n\n _cached_vector_store: AstraDBVectorStore | None = None\n\n @dataclass\n class NewDatabaseInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_database\",\n \"description\": \"Please allow several minutes for creation to complete.\",\n \"display_name\": \"Create new database\",\n \"field_order\": [\"01_new_database_name\", \"02_cloud_provider\", \"03_region\"],\n \"template\": {\n \"01_new_database_name\": StrInput(\n name=\"new_database_name\",\n display_name=\"Name\",\n info=\"Name of the new database to create in Astra DB.\",\n required=True,\n ),\n \"02_cloud_provider\": DropdownInput(\n name=\"cloud_provider\",\n display_name=\"Cloud provider\",\n info=\"Cloud provider for the new database.\",\n options=[],\n required=True,\n real_time_refresh=True,\n ),\n \"03_region\": DropdownInput(\n name=\"region\",\n display_name=\"Region\",\n info=\"Region for the new database.\",\n options=[],\n required=True,\n ),\n },\n },\n }\n }\n )\n\n @dataclass\n class NewCollectionInput:\n functionality: str = \"create\"\n fields: dict[str, dict] = field(\n default_factory=lambda: {\n \"data\": {\n \"node\": {\n \"name\": \"create_collection\",\n \"description\": \"Please allow several seconds for creation to complete.\",\n \"display_name\": \"Create new collection\",\n \"field_order\": [\n \"01_new_collection_name\",\n \"02_embedding_generation_provider\",\n \"03_embedding_generation_model\",\n \"04_dimension\",\n ],\n \"template\": {\n \"01_new_collection_name\": StrInput(\n name=\"new_collection_name\",\n display_name=\"Name\",\n info=\"Name of the new collection to create in Astra DB.\",\n required=True,\n ),\n \"02_embedding_generation_provider\": DropdownInput(\n name=\"embedding_generation_provider\",\n display_name=\"Embedding generation method\",\n info=\"Provider to use for generating embeddings.\",\n helper_text=(\n \"To create collections with more embedding provider options, go to \"\n 'your database in Astra DB'\n ),\n real_time_refresh=True,\n required=True,\n options=[],\n ),\n \"03_embedding_generation_model\": DropdownInput(\n name=\"embedding_generation_model\",\n display_name=\"Embedding model\",\n info=\"Model to use for generating embeddings.\",\n real_time_refresh=True,\n options=[],\n ),\n \"04_dimension\": IntInput(\n name=\"dimension\",\n display_name=\"Dimensions\",\n info=\"Dimensions of the embeddings to generate.\",\n value=None,\n ),\n },\n },\n }\n }\n )\n\n inputs = [\n SecretStrInput(\n name=\"token\",\n display_name=\"Astra DB Application Token\",\n info=\"Authentication token for accessing Astra DB.\",\n value=\"ASTRA_DB_APPLICATION_TOKEN\",\n required=True,\n real_time_refresh=True,\n input_types=[],\n ),\n StrInput(\n name=\"environment\",\n display_name=\"Environment\",\n info=\"The environment for the Astra DB API Endpoint.\",\n advanced=True,\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"database_name\",\n display_name=\"Database\",\n info=\"The Database name for the Astra DB instance.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewDatabaseInput()),\n combobox=True,\n ),\n StrInput(\n name=\"api_endpoint\",\n display_name=\"Astra DB API Endpoint\",\n info=\"The API Endpoint for the Astra DB instance. Supercedes database selection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"collection_name\",\n display_name=\"Collection\",\n info=\"The name of the collection within Astra DB where the vectors will be stored.\",\n required=True,\n refresh_button=True,\n real_time_refresh=True,\n dialog_inputs=asdict(NewCollectionInput()),\n combobox=True,\n advanced=True,\n ),\n StrInput(\n name=\"keyspace\",\n display_name=\"Keyspace\",\n info=\"Optional keyspace within Astra DB to use for the collection.\",\n advanced=True,\n ),\n DropdownInput(\n name=\"embedding_choice\",\n display_name=\"Embedding Model or Astra Vectorize\",\n info=\"Choose an embedding model or use Astra Vectorize.\",\n options=[\"Embedding Model\", \"Astra Vectorize\"],\n value=\"Embedding Model\",\n advanced=True,\n real_time_refresh=True,\n ),\n HandleInput(\n name=\"embedding_model\",\n display_name=\"Embedding Model\",\n input_types=[\"Embeddings\"],\n info=\"Specify the Embedding Model. Not required for Astra Vectorize collections.\",\n required=False,\n ),\n *LCVectorStoreComponent.inputs,\n IntInput(\n name=\"number_of_results\",\n display_name=\"Number of Search Results\",\n info=\"Number of search results to return.\",\n advanced=True,\n value=4,\n ),\n DropdownInput(\n name=\"search_type\",\n display_name=\"Search Type\",\n info=\"Search type to use\",\n options=[\"Similarity\", \"Similarity with score threshold\", \"MMR (Max Marginal Relevance)\"],\n value=\"Similarity\",\n advanced=True,\n ),\n FloatInput(\n name=\"search_score_threshold\",\n display_name=\"Search Score Threshold\",\n info=\"Minimum similarity score threshold for search results. \"\n \"(when using 'Similarity with score threshold')\",\n value=0,\n advanced=True,\n ),\n NestedDictInput(\n name=\"advanced_search_filter\",\n display_name=\"Search Metadata Filter\",\n info=\"Optional dictionary of filters to apply to the search query.\",\n advanced=True,\n ),\n BoolInput(\n name=\"autodetect_collection\",\n display_name=\"Autodetect Collection\",\n info=\"Boolean flag to determine whether to autodetect the collection.\",\n advanced=True,\n value=True,\n ),\n StrInput(\n name=\"content_field\",\n display_name=\"Content Field\",\n info=\"Field to use as the text content field for the vector store.\",\n advanced=True,\n ),\n StrInput(\n name=\"deletion_field\",\n display_name=\"Deletion Based On Field\",\n info=\"When this parameter is provided, documents in the target collection with \"\n \"metadata field values matching the input metadata field value will be deleted \"\n \"before new data is loaded.\",\n advanced=True,\n ),\n BoolInput(\n name=\"ignore_invalid_documents\",\n display_name=\"Ignore Invalid Documents\",\n info=\"Boolean flag to determine whether to ignore invalid documents at runtime.\",\n advanced=True,\n ),\n NestedDictInput(\n name=\"astradb_vectorstore_kwargs\",\n display_name=\"AstraDBVectorStore Parameters\",\n info=\"Optional dictionary of additional parameters for the AstraDBVectorStore.\",\n advanced=True,\n ),\n ]\n\n @classmethod\n def map_cloud_providers(cls):\n # TODO: Programmatically fetch the regions for each cloud provider\n return {\n \"dev\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n # TODO: Check test regions\n \"test\": {\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-central1\"],\n },\n },\n \"prod\": {\n \"Amazon Web Services\": {\n \"id\": \"aws\",\n \"regions\": [\"us-east-2\", \"ap-south-1\", \"eu-west-1\"],\n },\n \"Google Cloud Platform\": {\n \"id\": \"gcp\",\n \"regions\": [\"us-east1\"],\n },\n \"Microsoft Azure\": {\n \"id\": \"azure\",\n \"regions\": [\"westus3\"],\n },\n },\n }\n\n @classmethod\n def get_vectorize_providers(cls, token: str, environment: str | None = None, api_endpoint: str | None = None):\n try:\n # Get the admin object\n admin = AstraDBAdmin(token=token, environment=environment)\n db_admin = admin.get_database_admin(api_endpoint=api_endpoint)\n\n # Get the list of embedding providers\n embedding_providers = db_admin.find_embedding_providers().as_dict()\n\n vectorize_providers_mapping = {}\n # Map the provider display name to the provider key and models\n for provider_key, provider_data in embedding_providers[\"embeddingProviders\"].items():\n # Get the provider display name and models\n display_name = provider_data[\"displayName\"]\n models = [model[\"name\"] for model in provider_data[\"models\"]]\n\n # Build our mapping\n vectorize_providers_mapping[display_name] = [provider_key, models]\n\n # Sort the resulting dictionary\n return defaultdict(list, dict(sorted(vectorize_providers_mapping.items())))\n except Exception as _: # noqa: BLE001\n return {}\n\n @classmethod\n async def create_database_api(\n cls,\n new_database_name: str,\n cloud_provider: str,\n region: str,\n token: str,\n environment: str | None = None,\n keyspace: str | None = None,\n ):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the environment, set to prod if null like\n my_env = environment or \"prod\"\n\n # Raise a value error if name isn't provided\n if not new_database_name:\n msg = \"Database name is required to create a new database.\"\n raise ValueError(msg)\n\n # Call the create database function\n return await admin_client.async_create_database(\n name=new_database_name,\n cloud_provider=cls.map_cloud_providers()[my_env][cloud_provider][\"id\"],\n region=region,\n keyspace=keyspace,\n wait_until_active=False,\n )\n\n @classmethod\n async def create_collection_api(\n cls,\n new_collection_name: str,\n token: str,\n api_endpoint: str,\n environment: str | None = None,\n keyspace: str | None = None,\n dimension: int | None = None,\n embedding_generation_provider: str | None = None,\n embedding_generation_model: str | None = None,\n ):\n # Create the data API client\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the database object\n database = client.get_async_database(api_endpoint=api_endpoint, token=token)\n\n # Build vectorize options, if needed\n vectorize_options = None\n if not dimension:\n vectorize_options = CollectionVectorServiceOptions(\n provider=cls.get_vectorize_providers(\n token=token, environment=environment, api_endpoint=api_endpoint\n ).get(embedding_generation_provider, [None, []])[0],\n model_name=embedding_generation_model,\n )\n\n # Raise a value error if name isn't provided\n if not new_collection_name:\n msg = \"Collection name is required to create a new collection.\"\n raise ValueError(msg)\n\n # Create the collection\n return await database.create_collection(\n name=new_collection_name,\n keyspace=keyspace,\n dimension=dimension,\n service=vectorize_options,\n )\n\n @classmethod\n def get_database_list_static(cls, token: str, environment: str | None = None):\n client = DataAPIClient(token=token, environment=environment)\n\n # Get the admin object\n admin_client = client.get_admin(token=token)\n\n # Get the list of databases\n db_list = list(admin_client.list_databases())\n\n # Set the environment properly\n env_string = \"\"\n if environment and environment != \"prod\":\n env_string = f\"-{environment}\"\n\n # Generate the api endpoint for each database\n db_info_dict = {}\n for db in db_list:\n try:\n # Get the API endpoint for the database\n api_endpoint = f\"https://{db.info.id}-{db.info.region}.apps.astra{env_string}.datastax.com\"\n\n # Get the number of collections\n try:\n num_collections = len(\n list(\n client.get_database(\n api_endpoint=api_endpoint, token=token, keyspace=db.info.keyspace\n ).list_collection_names(keyspace=db.info.keyspace)\n )\n )\n except Exception: # noqa: BLE001\n if db.status != \"PENDING\":\n continue\n num_collections = 0\n\n # Add the database to the dictionary\n db_info_dict[db.info.name] = {\n \"api_endpoint\": api_endpoint,\n \"collections\": num_collections,\n \"status\": db.status if db.status != \"ACTIVE\" else None,\n \"org_id\": db.org_id if db.org_id else None,\n }\n except Exception: # noqa: BLE001, S110\n pass\n\n return db_info_dict\n\n def get_database_list(self):\n return self.get_database_list_static(token=self.token, environment=self.environment)\n\n @classmethod\n def get_api_endpoint_static(\n cls,\n token: str,\n environment: str | None = None,\n api_endpoint: str | None = None,\n database_name: str | None = None,\n ):\n # If the api_endpoint is set, return it\n if api_endpoint:\n return api_endpoint\n\n # Check if the database_name is like a url\n if database_name and database_name.startswith(\"https://\"):\n return database_name\n\n # If the database is not set, nothing we can do.\n if not database_name:\n return None\n\n # Grab the database object\n db = cls.get_database_list_static(token=token, environment=environment).get(database_name)\n if not db:\n return None\n\n # Otherwise, get the URL from the database list\n return db.get(\"api_endpoint\")\n\n def get_api_endpoint(self):\n return self.get_api_endpoint_static(\n token=self.token,\n environment=self.environment,\n api_endpoint=self.api_endpoint,\n database_name=self.database_name,\n )\n\n @classmethod\n def get_database_id_static(cls, api_endpoint: str) -> str | None:\n # Pattern matches standard UUID format: 8-4-4-4-12 hexadecimal characters\n uuid_pattern = r\"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\"\n match = re.search(uuid_pattern, api_endpoint)\n\n return match.group(0) if match else None\n\n def get_database_id(self):\n return self.get_database_id_static(api_endpoint=self.get_api_endpoint())\n\n def get_keyspace(self):\n keyspace = self.keyspace\n\n if keyspace:\n return keyspace.strip()\n\n return None\n\n def get_database_object(self, api_endpoint: str | None = None):\n try:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n return client.get_database(\n api_endpoint=api_endpoint or self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n except Exception as e:\n msg = f\"Error fetching database object: {e}\"\n raise ValueError(msg) from e\n\n def collection_data(self, collection_name: str, database: Database | None = None):\n try:\n if not database:\n client = DataAPIClient(token=self.token, environment=self.environment)\n\n database = client.get_database(\n api_endpoint=self.get_api_endpoint(),\n token=self.token,\n keyspace=self.get_keyspace(),\n )\n\n collection = database.get_collection(collection_name, keyspace=self.get_keyspace())\n\n return collection.estimated_document_count()\n except Exception as e: # noqa: BLE001\n self.log(f\"Error checking collection data: {e}\")\n\n return None\n\n def _initialize_database_options(self):\n try:\n return [\n {\n \"name\": name,\n \"status\": info[\"status\"],\n \"collections\": info[\"collections\"],\n \"api_endpoint\": info[\"api_endpoint\"],\n \"org_id\": info[\"org_id\"],\n }\n for name, info in self.get_database_list().items()\n ]\n except Exception as e:\n msg = f\"Error fetching database options: {e}\"\n raise ValueError(msg) from e\n\n @classmethod\n def get_provider_icon(cls, collection: CollectionDescriptor | None = None, provider_name: str | None = None) -> str:\n # Get the provider name from the collection\n provider_name = provider_name or (\n collection.options.vector.service.provider\n if collection and collection.options and collection.options.vector and collection.options.vector.service\n else None\n )\n\n # If there is no provider, use the vector store icon\n if not provider_name or provider_name == \"Bring your own\":\n return \"vectorstores\"\n\n # Map provider casings\n case_map = {\n \"nvidia\": \"NVIDIA\",\n \"openai\": \"OpenAI\",\n \"amazon bedrock\": \"AmazonBedrockEmbeddings\",\n \"azure openai\": \"AzureOpenAiEmbeddings\",\n \"cohere\": \"Cohere\",\n \"jina ai\": \"JinaAI\",\n \"mistral ai\": \"MistralAI\",\n \"upstage\": \"Upstage\",\n \"voyage ai\": \"VoyageAI\",\n }\n\n # Adjust the casing on some like nvidia\n return case_map[provider_name.lower()] if provider_name.lower() in case_map else provider_name.title()\n\n def _initialize_collection_options(self, api_endpoint: str | None = None):\n # Nothing to generate if we don't have an API endpoint yet\n api_endpoint = api_endpoint or self.get_api_endpoint()\n if not api_endpoint:\n return []\n\n # Retrieve the database object\n database = self.get_database_object(api_endpoint=api_endpoint)\n\n # Get the list of collections\n collection_list = list(database.list_collections(keyspace=self.get_keyspace()))\n\n # Return the list of collections and metadata associated\n return [\n {\n \"name\": col.name,\n \"records\": self.collection_data(collection_name=col.name, database=database),\n \"provider\": (\n col.options.vector.service.provider if col.options.vector and col.options.vector.service else None\n ),\n \"icon\": self.get_provider_icon(collection=col),\n \"model\": (\n col.options.vector.service.model_name if col.options.vector and col.options.vector.service else None\n ),\n }\n for col in collection_list\n ]\n\n def reset_provider_options(self, build_config: dict) -> dict:\n \"\"\"Reset provider options and related configurations in the build_config dictionary.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get vectorize providers\n vectorize_providers_api = self.get_vectorize_providers(\n token=self.token,\n environment=self.environment,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n )\n\n # Create a new dictionary with \"Bring your own\" first\n vectorize_providers: dict[str, list[list[str]]] = {\"Bring your own\": [[], []]}\n\n # Add the remaining items (only Nvidia) from the original dictionary\n vectorize_providers.update(\n {\n k: v\n for k, v in vectorize_providers_api.items()\n if k.lower() in [\"nvidia\"] # TODO: Eventually support more\n }\n )\n\n # Set provider options\n provider_field = \"02_embedding_generation_provider\"\n template[provider_field][\"options\"] = list(vectorize_providers.keys())\n\n # Add metadata for each provider option\n template[provider_field][\"options_metadata\"] = [\n {\"icon\": self.get_provider_icon(provider_name=provider)} for provider in template[provider_field][\"options\"]\n ]\n\n # Get selected embedding provider\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure embedding model field\n model_field = \"03_embedding_generation_model\"\n template[model_field].update(\n {\n \"options\": vectorize_providers.get(embedding_provider, [[], []])[1],\n \"placeholder\": \"Bring your own\" if is_bring_your_own else None,\n \"readonly\": is_bring_your_own,\n \"required\": not is_bring_your_own,\n \"value\": None,\n }\n )\n\n # If this is a bring your own, set dimensions to 0\n return self.reset_dimension_field(build_config)\n\n def reset_dimension_field(self, build_config: dict) -> dict:\n \"\"\"Reset dimension field options based on provided configuration.\"\"\"\n # Extract template path for cleaner access\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n\n # Get selected embedding model\n provider_field = \"02_embedding_generation_provider\"\n embedding_provider = template[provider_field][\"value\"]\n is_bring_your_own = embedding_provider and embedding_provider == \"Bring your own\"\n\n # Configure dimension field\n dimension_field = \"04_dimension\"\n dimension_value = 1024 if not is_bring_your_own else None # TODO: Dynamically figure this out\n template[dimension_field].update(\n {\n \"placeholder\": dimension_value,\n \"value\": dimension_value,\n \"readonly\": not is_bring_your_own,\n \"required\": is_bring_your_own,\n }\n )\n\n return build_config\n\n def reset_collection_list(self, build_config: dict) -> dict:\n \"\"\"Reset collection list options based on provided configuration.\"\"\"\n # Get collection options\n collection_options = self._initialize_collection_options(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n\n # Update collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update(\n {\n \"options\": [col[\"name\"] for col in collection_options],\n \"options_metadata\": [{k: v for k, v in col.items() if k != \"name\"} for col in collection_options],\n }\n )\n\n # Reset selected collection if not in options\n if collection_config[\"value\"] not in collection_config[\"options\"]:\n collection_config[\"value\"] = \"\"\n\n # Set advanced status based on database selection\n collection_config[\"advanced\"] = not build_config[\"database_name\"][\"value\"]\n\n return build_config\n\n def reset_database_list(self, build_config: dict) -> dict:\n \"\"\"Reset database list options and related configurations.\"\"\"\n # Get database options\n database_options = self._initialize_database_options()\n\n # Update cloud provider options\n env = self.environment or \"prod\"\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_cloud_provider\"][\"options\"] = list(self.map_cloud_providers()[env].keys())\n\n # Update database configuration\n database_config = build_config[\"database_name\"]\n database_config.update(\n {\n \"options\": [db[\"name\"] for db in database_options],\n \"options_metadata\": [{k: v for k, v in db.items() if k != \"name\"} for db in database_options],\n }\n )\n\n # Reset selections if value not in options\n if database_config[\"value\"] not in database_config[\"options\"]:\n database_config[\"value\"] = \"\"\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n build_config[\"collection_name\"][\"advanced\"] = True\n\n # Set advanced status based on token presence\n database_config[\"advanced\"] = not build_config[\"token\"][\"value\"]\n\n return build_config\n\n def reset_build_config(self, build_config: dict) -> dict:\n \"\"\"Reset all build configuration options to default empty state.\"\"\"\n # Reset database configuration\n database_config = build_config[\"database_name\"]\n database_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n build_config[\"api_endpoint\"][\"value\"] = \"\"\n\n # Reset collection configuration\n collection_config = build_config[\"collection_name\"]\n collection_config.update({\"options\": [], \"options_metadata\": [], \"value\": \"\", \"advanced\": True})\n\n return build_config\n\n async def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build configuration based on field name and value.\"\"\"\n # Early return if no token provided\n if not self.token:\n return self.reset_build_config(build_config)\n\n # Database creation callback\n if field_name == \"database_name\" and isinstance(field_value, dict):\n if \"01_new_database_name\" in field_value:\n await self._create_new_database(build_config, field_value)\n return self.reset_collection_list(build_config)\n return self._update_cloud_regions(build_config, field_value)\n\n # Collection creation callback\n if field_name == \"collection_name\" and isinstance(field_value, dict):\n # Case 1: New collection creation\n if \"01_new_collection_name\" in field_value:\n await self._create_new_collection(build_config, field_value)\n return build_config\n\n # Case 2: Update embedding provider options\n if \"02_embedding_generation_provider\" in field_value:\n return self.reset_provider_options(build_config)\n\n # Case 3: Update dimension field\n if \"03_embedding_generation_model\" in field_value:\n return self.reset_dimension_field(build_config)\n\n # Initial execution or token/environment change\n first_run = field_name == \"collection_name\" and not field_value and not build_config[\"database_name\"][\"options\"]\n if first_run or field_name in {\"token\", \"environment\"}:\n return self.reset_database_list(build_config)\n\n # Database selection change\n if field_name == \"database_name\" and not isinstance(field_value, dict):\n return self._handle_database_selection(build_config, field_value)\n\n # Collection selection change\n if field_name == \"collection_name\" and not isinstance(field_value, dict):\n return self._handle_collection_selection(build_config, field_value)\n\n return build_config\n\n async def _create_new_database(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new database and update build config options.\"\"\"\n try:\n await self.create_database_api(\n new_database_name=field_value[\"01_new_database_name\"],\n token=self.token,\n keyspace=self.get_keyspace(),\n environment=self.environment,\n cloud_provider=field_value[\"02_cloud_provider\"],\n region=field_value[\"03_region\"],\n )\n except Exception as e:\n msg = f\"Error creating database: {e}\"\n raise ValueError(msg) from e\n\n build_config[\"database_name\"][\"options\"].append(field_value[\"01_new_database_name\"])\n build_config[\"database_name\"][\"options_metadata\"].append(\n {\n \"status\": \"PENDING\",\n \"collections\": 0,\n \"api_endpoint\": None,\n \"org_id\": None,\n }\n )\n\n def _update_cloud_regions(self, build_config: dict, field_value: dict) -> dict:\n \"\"\"Update cloud provider regions in build config.\"\"\"\n env = self.environment or \"prod\"\n cloud_provider = field_value[\"02_cloud_provider\"]\n\n # Update the region options based on the selected cloud provider\n template = build_config[\"database_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"03_region\"][\"options\"] = self.map_cloud_providers()[env][cloud_provider][\"regions\"]\n\n # Reset the the 03_region value if it's not in the new options\n if template[\"03_region\"][\"value\"] not in template[\"03_region\"][\"options\"]:\n template[\"03_region\"][\"value\"] = None\n\n return build_config\n\n async def _create_new_collection(self, build_config: dict, field_value: dict) -> None:\n \"\"\"Create a new collection and update build config options.\"\"\"\n embedding_provider = field_value.get(\"02_embedding_generation_provider\")\n try:\n await self.create_collection_api(\n new_collection_name=field_value[\"01_new_collection_name\"],\n token=self.token,\n api_endpoint=build_config[\"api_endpoint\"][\"value\"],\n environment=self.environment,\n keyspace=self.get_keyspace(),\n dimension=field_value.get(\"04_dimension\") if embedding_provider == \"Bring your own\" else None,\n embedding_generation_provider=embedding_provider,\n embedding_generation_model=field_value.get(\"03_embedding_generation_model\"),\n )\n except Exception as e:\n msg = f\"Error creating collection: {e}\"\n raise ValueError(msg) from e\n\n provider = embedding_provider.lower() if embedding_provider and embedding_provider != \"Bring your own\" else None\n build_config[\"collection_name\"].update(\n {\n \"value\": field_value[\"01_new_collection_name\"],\n \"options\": build_config[\"collection_name\"][\"options\"] + [field_value[\"01_new_collection_name\"]],\n }\n )\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": provider,\n \"icon\": self.get_provider_icon(provider_name=embedding_provider),\n \"model\": field_value.get(\"03_embedding_generation_model\"),\n }\n )\n\n def _handle_database_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle database selection and update related configurations.\"\"\"\n build_config = self.reset_database_list(build_config)\n\n # Reset collection list if database selection changes\n if field_value not in build_config[\"database_name\"][\"options\"]:\n build_config[\"database_name\"][\"value\"] = \"\"\n return build_config\n\n # Get the api endpoint for the selected database\n index = build_config[\"database_name\"][\"options\"].index(field_value)\n build_config[\"api_endpoint\"][\"value\"] = build_config[\"database_name\"][\"options_metadata\"][index][\"api_endpoint\"]\n\n # Get the org_id for the selected database\n org_id = build_config[\"database_name\"][\"options_metadata\"][index][\"org_id\"]\n if not org_id:\n return build_config\n\n # Get the database id for the selected database\n db_id = self.get_database_id_static(api_endpoint=build_config[\"api_endpoint\"][\"value\"])\n keyspace = self.get_keyspace() or \"default_keyspace\"\n\n # Update the helper text for the embedding provider field\n template = build_config[\"collection_name\"][\"dialog_inputs\"][\"fields\"][\"data\"][\"node\"][\"template\"]\n template[\"02_embedding_generation_provider\"][\"helper_text\"] = (\n \"To create collections with more embedding provider options, go to \"\n f''\n \"your database in Astra DB.\"\n )\n\n # Reset provider options\n build_config = self.reset_provider_options(build_config)\n\n return self.reset_collection_list(build_config)\n\n def _handle_collection_selection(self, build_config: dict, field_value: str) -> dict:\n \"\"\"Handle collection selection and update embedding options.\"\"\"\n build_config[\"autodetect_collection\"][\"value\"] = True\n build_config = self.reset_collection_list(build_config)\n\n if field_value and field_value not in build_config[\"collection_name\"][\"options\"]:\n build_config[\"collection_name\"][\"options\"].append(field_value)\n build_config[\"collection_name\"][\"options_metadata\"].append(\n {\n \"records\": 0,\n \"provider\": None,\n \"icon\": \"vectorstores\",\n \"model\": None,\n }\n )\n build_config[\"autodetect_collection\"][\"value\"] = False\n\n if not field_value:\n return build_config\n\n index = build_config[\"collection_name\"][\"options\"].index(field_value)\n provider = build_config[\"collection_name\"][\"options_metadata\"][index][\"provider\"]\n build_config[\"embedding_model\"][\"advanced\"] = bool(provider)\n build_config[\"embedding_choice\"][\"value\"] = \"Astra Vectorize\" if provider else \"Embedding Model\"\n return build_config\n\n @check_cached_vector_store\n def build_vector_store(self):\n try:\n from langchain_astradb import AstraDBVectorStore\n except ImportError as e:\n msg = (\n \"Could not import langchain Astra DB integration package. \"\n \"Please install it with `pip install langchain-astradb`.\"\n )\n raise ImportError(msg) from e\n\n # Get the embedding model and additional params\n embedding_params = (\n {\"embedding\": self.embedding_model}\n if self.embedding_model and self.embedding_choice == \"Embedding Model\"\n else {}\n )\n\n # Get the additional parameters\n additional_params = self.astradb_vectorstore_kwargs or {}\n\n # Get Langflow version and platform information\n __version__ = get_version_info()[\"version\"]\n langflow_prefix = \"\"\n # if os.getenv(\"AWS_EXECUTION_ENV\") == \"AWS_ECS_FARGATE\": # TODO: More precise way of detecting\n # langflow_prefix = \"ds-\"\n\n # Get the database object\n database = self.get_database_object()\n autodetect = self.collection_name in database.list_collection_names() and self.autodetect_collection\n\n # Bundle up the auto-detect parameters\n autodetect_params = {\n \"autodetect_collection\": autodetect,\n \"content_field\": (\n self.content_field\n if self.content_field and embedding_params\n else (\n \"page_content\"\n if embedding_params\n and self.collection_data(collection_name=self.collection_name, database=database) == 0\n else None\n )\n ),\n \"ignore_invalid_documents\": self.ignore_invalid_documents,\n }\n\n # Attempt to build the Vector Store object\n try:\n vector_store = AstraDBVectorStore(\n # Astra DB Authentication Parameters\n token=self.token,\n api_endpoint=database.api_endpoint,\n namespace=database.keyspace,\n collection_name=self.collection_name,\n environment=self.environment,\n # Astra DB Usage Tracking Parameters\n ext_callers=[(f\"{langflow_prefix}langflow\", __version__)],\n # Astra DB Vector Store Parameters\n **autodetect_params,\n **embedding_params,\n **additional_params,\n )\n except Exception as e:\n msg = f\"Error initializing AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n # Add documents to the vector store\n self._add_documents_to_vector_store(vector_store)\n\n return vector_store\n\n def _add_documents_to_vector_store(self, vector_store) -> None:\n self.ingest_data = self._prepare_ingest_data()\n\n documents = []\n for _input in self.ingest_data or []:\n if isinstance(_input, Data):\n documents.append(_input.to_lc_document())\n else:\n msg = \"Vector Store Inputs must be Data objects.\"\n raise TypeError(msg)\n\n if documents and self.deletion_field:\n self.log(f\"Deleting documents where {self.deletion_field}\")\n try:\n database = self.get_database_object()\n collection = database.get_collection(self.collection_name, keyspace=database.keyspace)\n delete_values = list({doc.metadata[self.deletion_field] for doc in documents})\n self.log(f\"Deleting documents where {self.deletion_field} matches {delete_values}.\")\n collection.delete_many({f\"metadata.{self.deletion_field}\": {\"$in\": delete_values}})\n except Exception as e:\n msg = f\"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}\"\n raise ValueError(msg) from e\n\n if documents:\n self.log(f\"Adding {len(documents)} documents to the Vector Store.\")\n try:\n vector_store.add_documents(documents)\n except Exception as e:\n msg = f\"Error adding documents to AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n else:\n self.log(\"No documents to add to the Vector Store.\")\n\n def _map_search_type(self) -> str:\n search_type_mapping = {\n \"Similarity with score threshold\": \"similarity_score_threshold\",\n \"MMR (Max Marginal Relevance)\": \"mmr\",\n }\n\n return search_type_mapping.get(self.search_type, \"similarity\")\n\n def _build_search_args(self):\n query = self.search_query if isinstance(self.search_query, str) and self.search_query.strip() else None\n\n if query:\n args = {\n \"query\": query,\n \"search_type\": self._map_search_type(),\n \"k\": self.number_of_results,\n \"score_threshold\": self.search_score_threshold,\n }\n elif self.advanced_search_filter:\n args = {\n \"n\": self.number_of_results,\n }\n else:\n return {}\n\n filter_arg = self.advanced_search_filter or {}\n if filter_arg:\n args[\"filter\"] = filter_arg\n\n return args\n\n def search_documents(self, vector_store=None) -> list[Data]:\n vector_store = vector_store or self.build_vector_store()\n\n self.log(f\"Search input: {self.search_query}\")\n self.log(f\"Search type: {self.search_type}\")\n self.log(f\"Number of results: {self.number_of_results}\")\n\n try:\n search_args = self._build_search_args()\n except Exception as e:\n msg = f\"Error in AstraDBVectorStore._build_search_args: {e}\"\n raise ValueError(msg) from e\n\n if not search_args:\n self.log(\"No search input or filters provided. Skipping search.\")\n return []\n\n docs = []\n search_method = \"search\" if \"query\" in search_args else \"metadata_search\"\n\n try:\n self.log(f\"Calling vector_store.{search_method} with args: {search_args}\")\n docs = getattr(vector_store, search_method)(**search_args)\n except Exception as e:\n msg = f\"Error performing {search_method} in AstraDBVectorStore: {e}\"\n raise ValueError(msg) from e\n\n self.log(f\"Retrieved documents: {len(docs)}\")\n\n data = docs_to_data(docs)\n self.log(f\"Converted documents to data: {len(data)}\")\n self.status = data\n\n return data\n\n def get_retriever_kwargs(self):\n search_args = self._build_search_args()\n\n return {\n \"search_type\": self._map_search_type(),\n \"search_kwargs\": search_args,\n }\n" }, "collection_name": { "_input_type": "DropdownInput", @@ -4541,9 +4543,10 @@ "dynamic": false, "info": "", "input_types": [ - "Data" + "Data", + "DataFrame" ], - "list": false, + "list": true, "list_add_label": "Add More", "name": "ingest_data", "placeholder": "", diff --git a/src/backend/base/langflow/schema/dataframe.py b/src/backend/base/langflow/schema/dataframe.py index 4d7373690..9316e09a1 100644 --- a/src/backend/base/langflow/schema/dataframe.py +++ b/src/backend/base/langflow/schema/dataframe.py @@ -168,3 +168,13 @@ class DataFrame(pandas_DataFrame): DataFrame: A new DataFrame with the converted Documents """ return DataFrame(docs) + + def __eq__(self, other): + """Override equality to handle comparison with empty DataFrames and non-DataFrame objects.""" + if self.empty: + return False + if isinstance(other, list) and not other: # Empty list case + return False + if not isinstance(other, DataFrame | pd.DataFrame): # Non-DataFrame case + return False + return super().__eq__(other)