diff --git a/poetry.lock b/poetry.lock index ddbf3a41f..668b7d9d6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4621,19 +4621,22 @@ langchain-core = ">=0.2.26,<0.3.0" [[package]] name = "langchain-astradb" -version = "0.3.3" +version = "0.3.5" description = "An integration package connecting Astra DB and LangChain" optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langchain_astradb-0.3.3-py3-none-any.whl", hash = "sha256:39deef1253947ef1bfaf3c27881ecdf07621d96c2cf37814aed9e506a9bee217"}, - {file = "langchain_astradb-0.3.3.tar.gz", hash = "sha256:f9a996ec4bef134896195430adeb7f264389c368a03d2ea91356837e8ddde091"}, + {file = "langchain_astradb-0.3.5-py3-none-any.whl", hash = "sha256:c86db219ec7b93548b23c06bf4303fadf0cb90b07e27222956c97ae27a14860c"}, + {file = "langchain_astradb-0.3.5.tar.gz", hash = "sha256:9377fed7f380b7ece363ef5acd6788f787bbacc4de825860c710c401047f4ece"}, ] [package.dependencies] -astrapy = ">=1.2,<2.0" +astrapy = ">=1.4,<2.0" langchain-core = ">=0.1.31,<0.3" -numpy = ">=1,<2" +numpy = [ + {version = ">=1.24.0,<2.0.0", markers = "python_version < \"3.12\""}, + {version = ">=1.26.0,<2.0.0", markers = "python_version >= \"3.12\""}, +] [[package]] name = "langchain-aws" @@ -5052,7 +5055,7 @@ cachetools = "^5.3.1" chardet = "^5.2.0" clickhouse-connect = "0.7.19" crewai = "^0.36.0" -cryptography = "^42.0.5" +cryptography = ">=42.0.5,<44.0.0" diskcache = "^5.6.3" docstring-parser = "^0.16" duckdb = "^1.0.0" @@ -12084,4 +12087,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "38d0bb488f9d89b6fc67c04fe2242a5fc1e5796aa67a045f37f1496e5134cc23" +content-hash = "22fc0755bfc3ae10abe8d225f832b22dcb81c4d52f52accc8ce4a27a4d06fdb9" diff --git a/pyproject.toml b/pyproject.toml index 92f6c33a6..a5e7e516e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ assemblyai = "^0.26.0" litellm = "^1.44.0" chromadb = "^0.4" langchain-anthropic = "^0.1.23" -langchain-astradb = "^0.3.3" +langchain-astradb = "^0.3.5" langchain-openai = "0.1.22" zep-python = { version = "^2.0.0rc5", allow-prereleases = true } langchain-google-vertexai = "1.0.10" diff --git a/src/backend/base/langflow/components/vectorstores/HCD.py b/src/backend/base/langflow/components/vectorstores/HCD.py new file mode 100644 index 000000000..d15a01af7 --- /dev/null +++ b/src/backend/base/langflow/components/vectorstores/HCD.py @@ -0,0 +1,320 @@ +from loguru import logger + +from langflow.base.vectorstores.model import LCVectorStoreComponent, check_cached_vector_store +from langflow.helpers import docs_to_data +from langflow.inputs import DictInput, FloatInput +from langflow.io import ( + BoolInput, + DataInput, + DropdownInput, + HandleInput, + IntInput, + MultilineInput, + SecretStrInput, + StrInput, +) +from langflow.schema import Data + + +class HCDVectorStoreComponent(LCVectorStoreComponent): + display_name: str = "Hyper-Converged Database" + description: str = "Implementation of Vector Store using Hyper-Converged Database (HCD) with search capabilities" + documentation: str = "https://python.langchain.com/docs/integrations/vectorstores/astradb" + name = "HCD" + icon: str = "HCD" + + inputs = [ + StrInput( + name="collection_name", + display_name="Collection Name", + info="The name of the collection within HCD where the vectors will be stored.", + required=True, + ), + StrInput( + name="username", + display_name="HCD Username", + info="Authentication username for accessing HCD.", + value="hcd-superuser", + required=True, + ), + SecretStrInput( + name="password", + display_name="HCD Password", + info="Authentication password for accessing HCD.", + value="HCD_PASSWORD", + required=True, + ), + SecretStrInput( + name="api_endpoint", + display_name="HCD API Endpoint", + info="API endpoint URL for the HCD service.", + value="HCD_API_ENDPOINT", + required=True, + ), + MultilineInput( + name="search_input", + display_name="Search Input", + ), + DataInput( + name="ingest_data", + display_name="Ingest Data", + is_list=True, + ), + StrInput( + name="namespace", + display_name="Namespace", + info="Optional namespace within HCD to use for the collection.", + value="default_namespace", + advanced=True, + ), + MultilineInput( + name="ca_certificate", + display_name="CA Certificate", + info="Optional CA certificate for TLS connections to HCD.", + advanced=True, + ), + DropdownInput( + name="metric", + display_name="Metric", + info="Optional distance metric for vector comparisons in the vector store.", + options=["cosine", "dot_product", "euclidean"], + advanced=True, + ), + IntInput( + name="batch_size", + display_name="Batch Size", + info="Optional number of data to process in a single batch.", + advanced=True, + ), + IntInput( + name="bulk_insert_batch_concurrency", + display_name="Bulk Insert Batch Concurrency", + info="Optional concurrency level for bulk insert operations.", + advanced=True, + ), + IntInput( + name="bulk_insert_overwrite_concurrency", + display_name="Bulk Insert Overwrite Concurrency", + info="Optional concurrency level for bulk insert operations that overwrite existing data.", + advanced=True, + ), + IntInput( + name="bulk_delete_concurrency", + display_name="Bulk Delete Concurrency", + info="Optional concurrency level for bulk delete operations.", + advanced=True, + ), + DropdownInput( + name="setup_mode", + display_name="Setup Mode", + info="Configuration mode for setting up the vector store, with options like 'Sync', 'Async', or 'Off'.", + options=["Sync", "Async", "Off"], + advanced=True, + value="Sync", + ), + BoolInput( + name="pre_delete_collection", + display_name="Pre Delete Collection", + info="Boolean flag to determine whether to delete the collection before creating a new one.", + advanced=True, + ), + StrInput( + name="metadata_indexing_include", + display_name="Metadata Indexing Include", + info="Optional list of metadata fields to include in the indexing.", + advanced=True, + ), + HandleInput( + name="embedding", + display_name="Embedding or Astra Vectorize", + input_types=["Embeddings", "dict"], + info="Allows either an embedding model or an Astra Vectorize configuration.", # TODO: This should be optional, but need to refactor langchain-astradb first. + ), + StrInput( + name="metadata_indexing_exclude", + display_name="Metadata Indexing Exclude", + info="Optional list of metadata fields to exclude from the indexing.", + advanced=True, + ), + StrInput( + name="collection_indexing_policy", + display_name="Collection Indexing Policy", + info="Optional dictionary defining the indexing policy for the collection.", + advanced=True, + ), + IntInput( + name="number_of_results", + display_name="Number of Results", + info="Number of results to return.", + advanced=True, + value=4, + ), + DropdownInput( + name="search_type", + display_name="Search Type", + info="Search type to use", + options=["Similarity", "Similarity with score threshold", "MMR (Max Marginal Relevance)"], + value="Similarity", + advanced=True, + ), + FloatInput( + name="search_score_threshold", + display_name="Search Score Threshold", + info="Minimum similarity score threshold for search results. (when using 'Similarity with score threshold')", + value=0, + advanced=True, + ), + DictInput( + name="search_filter", + display_name="Search Metadata Filter", + info="Optional dictionary of filters to apply to the search query.", + advanced=True, + is_list=True, + ), + ] + + @check_cached_vector_store + def build_vector_store(self): + try: + from langchain_astradb import AstraDBVectorStore + from langchain_astradb.utils.astradb import SetupMode + except ImportError: + raise ImportError( + "Could not import langchain Astra DB integration package. " + "Please install it with `pip install langchain-astradb`." + ) + + try: + from astrapy.constants import Environment + from astrapy.authentication import UsernamePasswordTokenProvider + except ImportError: + raise ImportError( + "Could not import astrapy integration package. " "Please install it with `pip install astrapy`." + ) + + try: + if not self.setup_mode: + self.setup_mode = self._inputs["setup_mode"].options[0] + + setup_mode_value = SetupMode[self.setup_mode.upper()] + except KeyError: + raise ValueError(f"Invalid setup mode: {self.setup_mode}") + + if not isinstance(self.embedding, dict): + embedding_dict = {"embedding": self.embedding} + else: + from astrapy.info import CollectionVectorServiceOptions + + dict_options = self.embedding.get("collection_vector_service_options", {}) + dict_options["authentication"] = { + k: v for k, v in dict_options.get("authentication", {}).items() if k and v + } + dict_options["parameters"] = {k: v for k, v in dict_options.get("parameters", {}).items() if k and v} + embedding_dict = { + "collection_vector_service_options": CollectionVectorServiceOptions.from_dict(dict_options) + } + collection_embedding_api_key = self.embedding.get("collection_embedding_api_key") + if collection_embedding_api_key: + embedding_dict["collection_embedding_api_key"] = collection_embedding_api_key + + token_provider = UsernamePasswordTokenProvider(self.username, self.password) + vector_store_kwargs = { + **embedding_dict, + "collection_name": self.collection_name, + "token": token_provider, + "api_endpoint": self.api_endpoint, + "namespace": self.namespace, + "metric": self.metric or None, + "batch_size": self.batch_size or None, + "bulk_insert_batch_concurrency": self.bulk_insert_batch_concurrency or None, + "bulk_insert_overwrite_concurrency": self.bulk_insert_overwrite_concurrency or None, + "bulk_delete_concurrency": self.bulk_delete_concurrency or None, + "setup_mode": setup_mode_value, + "pre_delete_collection": self.pre_delete_collection or False, + "environment": Environment.HCD, + } + + if self.metadata_indexing_include: + vector_store_kwargs["metadata_indexing_include"] = self.metadata_indexing_include + elif self.metadata_indexing_exclude: + vector_store_kwargs["metadata_indexing_exclude"] = self.metadata_indexing_exclude + elif self.collection_indexing_policy: + vector_store_kwargs["collection_indexing_policy"] = self.collection_indexing_policy + + try: + vector_store = AstraDBVectorStore(**vector_store_kwargs) + except Exception as e: + raise ValueError(f"Error initializing AstraDBVectorStore: {str(e)}") from e + + self._add_documents_to_vector_store(vector_store) + return vector_store + + def _add_documents_to_vector_store(self, vector_store): + documents = [] + for _input in self.ingest_data or []: + if isinstance(_input, Data): + documents.append(_input.to_lc_document()) + else: + raise ValueError("Vector Store Inputs must be Data objects.") + + if documents: + logger.debug(f"Adding {len(documents)} documents to the Vector Store.") + try: + vector_store.add_documents(documents) + except Exception as e: + raise ValueError(f"Error adding documents to AstraDBVectorStore: {str(e)}") from e + else: + logger.debug("No documents to add to the Vector Store.") + + def _map_search_type(self): + if self.search_type == "Similarity with score threshold": + return "similarity_score_threshold" + elif self.search_type == "MMR (Max Marginal Relevance)": + return "mmr" + else: + return "similarity" + + def _build_search_args(self): + args = { + "k": self.number_of_results, + "score_threshold": self.search_score_threshold, + } + + if self.search_filter: + clean_filter = {k: v for k, v in self.search_filter.items() if k and v} + if len(clean_filter) > 0: + args["filter"] = clean_filter + return args + + def search_documents(self) -> list[Data]: + vector_store = self.build_vector_store() + + logger.debug(f"Search input: {self.search_input}") + logger.debug(f"Search type: {self.search_type}") + logger.debug(f"Number of results: {self.number_of_results}") + + if self.search_input and isinstance(self.search_input, str) and self.search_input.strip(): + try: + search_type = self._map_search_type() + search_args = self._build_search_args() + + docs = vector_store.search(query=self.search_input, search_type=search_type, **search_args) + except Exception as e: + raise ValueError(f"Error performing search in AstraDBVectorStore: {str(e)}") from e + + logger.debug(f"Retrieved documents: {len(docs)}") + + data = docs_to_data(docs) + logger.debug(f"Converted documents to data: {len(data)}") + self.status = data + return data + else: + logger.debug("No search input provided. Skipping search.") + return [] + + def get_retriever_kwargs(self): + search_args = self._build_search_args() + return { + "search_type": self._map_search_type(), + "search_kwargs": search_args, + } diff --git a/src/frontend/src/icons/HCD/Favicon.svg b/src/frontend/src/icons/HCD/Favicon.svg new file mode 100644 index 000000000..7fe145b1b --- /dev/null +++ b/src/frontend/src/icons/HCD/Favicon.svg @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/src/frontend/src/icons/HCD/HCD.jsx b/src/frontend/src/icons/HCD/HCD.jsx new file mode 100644 index 000000000..3aae62735 --- /dev/null +++ b/src/frontend/src/icons/HCD/HCD.jsx @@ -0,0 +1,28 @@ +const HCDSVG = (props) => ( + + + {/* */} + + + + + + + + + +); +export default HCDSVG; diff --git a/src/frontend/src/icons/HCD/index.tsx b/src/frontend/src/icons/HCD/index.tsx new file mode 100644 index 000000000..1503f9a07 --- /dev/null +++ b/src/frontend/src/icons/HCD/index.tsx @@ -0,0 +1,8 @@ +import React, { forwardRef } from "react"; +import HCDSVG from "./HCD"; + +export const HCDIcon = forwardRef>( + (props, ref) => { + return ; + }, +); diff --git a/src/frontend/src/utils/styleUtils.ts b/src/frontend/src/utils/styleUtils.ts index 26256fd84..5dc074836 100644 --- a/src/frontend/src/utils/styleUtils.ts +++ b/src/frontend/src/utils/styleUtils.ts @@ -196,6 +196,7 @@ import { GradientUngroup, } from "../icons/GradientSparkles"; import { GroqIcon } from "../icons/Groq"; +import { HCDIcon } from "../icons/HCD"; import { HuggingFaceIcon } from "../icons/HuggingFace"; import { IFixIcon } from "../icons/IFixIt"; import { LangChainIcon } from "../icons/LangChain"; @@ -399,6 +400,7 @@ export const nodeIconsLucide: iconsType = { Google: GoogleIcon, GoogleGenerativeAI: GoogleGenerativeAIIcon, Groq: GroqIcon, + HCD: HCDIcon, HNLoader: HackerNewsIcon, Unstructured: UnstructuredIcon, HuggingFaceHub: HuggingFaceIcon,