FIX: Clean up the advanced parameters in Astra DB Vector Store Component (#5298)

* fix: Clean up the list of params in AstraDB

* Clean up some more parameters

* Update Vector Store RAG.json

* [autofix.ci] apply automated fixes

* Update Vector Store RAG.json

* [autofix.ci] apply automated fixes

* Update Vector Store RAG.json

* Update astradb.py

* [autofix.ci] apply automated fixes

* Update Vector Store RAG.json

* [autofix.ci] apply automated fixes

* Update Vector Store RAG.json

* [autofix.ci] apply automated fixes

* Error if no file provided

* Fix base file value to be empty

* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Eric Hare 2024-12-16 18:33:07 -08:00 committed by GitHub
commit a3d238c280
5 changed files with 240 additions and 715 deletions
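
The thrust of the cleanup, before the per-file diffs: the component's long tail of advanced constructor options (metric, batch sizes, bulk concurrency, setup mode, indexing options) no longer exists as individual inputs, and anything beyond the core parameters now reaches AstraDBVectorStore through one pass-through dict. A minimal sketch of that pattern, with placeholder credentials and illustrative values (FakeEmbeddings stands in for a real embedding model):

    from langchain_astradb import AstraDBVectorStore
    from langchain_core.embeddings import FakeEmbeddings

    # Illustrative values; in the component these come from the single
    # "AstraDBVectorStore Parameters" (astradb_vectorstore_kwargs) input.
    extra_kwargs = {
        "batch_size": 20,                # formerly a dedicated IntInput
        "pre_delete_collection": False,  # formerly a dedicated BoolInput
    }

    store = AstraDBVectorStore(
        token="AstraCS:...",  # placeholder credentials
        api_endpoint="https://<db-id>-<region>.apps.astra.datastax.com",
        collection_name="my_collection",
        embedding=FakeEmbeddings(size=8),  # stand-in for a real Embeddings instance
        **extra_kwargs,
    )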

View file

@@ -115,6 +115,7 @@ class BaseFileComponent(Component, ABC):
fileTypes=[], # Dynamically set in __init__
info="", # Dynamically set in __init__
required=False,
value="",
),
HandleInput(
name="file_path",

View file

@@ -68,8 +68,8 @@ class FileComponent(BaseFileComponent):
return None
if not file_list:
self.log("No files to process.")
return file_list
msg = "No files to process."
raise ValueError(msg)
concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)
file_count = len(file_list)
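
Behavioral note on the hunk above: an empty file list used to be logged and returned unchanged; it now raises, so a flow fails fast when no file is attached (which is also why the base input's default value becomes "" in the first hunk). A minimal sketch of the new contract, with the component plumbing stripped away:

    # Sketch only; the real method lives on FileComponent (see diff above).
    def process_files(file_list: list) -> list:
        if not file_list:
            msg = "No files to process."
            raise ValueError(msg)  # previously: log the message, return file_list
        return file_list

    try:
        process_files([])
    except ValueError as e:
        print(f"Surfaced to the user instead of silently continuing: {e}")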

View file

@@ -1,7 +1,6 @@
import os
from collections import defaultdict
import orjson
from astrapy import DataAPIClient
from astrapy.admin import parse_api_endpoint
from langchain_astradb import AstraDBVectorStore
@@ -113,15 +112,33 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
info="Optional keyspace within Astra DB to use for the collection.",
advanced=True,
),
DropdownInput(
name="embedding_choice",
display_name="Embedding Model or Astra Vectorize",
info="Determines whether to use Astra Vectorize for the collection.",
options=["Embedding Model", "Astra Vectorize"],
real_time_refresh=True,
value="Embedding Model",
),
HandleInput(
name="embedding_model",
display_name="Embedding Model",
input_types=["Embeddings"],
info="Allows an embedding model configuration.",
),
DataInput(
name="ingest_data",
display_name="Ingest Data",
),
MultilineInput(
name="search_input",
display_name="Search Input",
display_name="Search Query",
tool_mode=True,
),
IntInput(
name="number_of_results",
display_name="Number of Results",
info="Number of results to return.",
display_name="Number of Search Results",
info="Number of search results to return.",
advanced=True,
value=4,
),
@@ -147,98 +164,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
info="Optional dictionary of filters to apply to the search query.",
advanced=True,
),
DictInput(
name="search_filter",
display_name="[DEPRECATED] Search Metadata Filter",
info="Deprecated: use advanced_search_filter. Optional dictionary of filters to apply to the search query.",
advanced=True,
list=True,
),
DataInput(
name="ingest_data",
display_name="Ingest Data",
),
DropdownInput(
name="embedding_choice",
display_name="Embedding Model or Astra Vectorize",
info="Determines whether to use Astra Vectorize for the collection.",
options=["Embedding Model", "Astra Vectorize"],
real_time_refresh=True,
value="Embedding Model",
),
HandleInput(
name="embedding_model",
display_name="Embedding Model",
input_types=["Embeddings"],
info="Allows an embedding model configuration.",
),
DropdownInput(
name="metric",
display_name="Metric",
info="Optional distance metric for vector comparisons in the vector store.",
options=["cosine", "dot_product", "euclidean"],
value="cosine",
advanced=True,
),
IntInput(
name="batch_size",
display_name="Batch Size",
info="Optional number of data to process in a single batch.",
advanced=True,
),
IntInput(
name="bulk_insert_batch_concurrency",
display_name="Bulk Insert Batch Concurrency",
info="Optional concurrency level for bulk insert operations.",
advanced=True,
),
IntInput(
name="bulk_insert_overwrite_concurrency",
display_name="Bulk Insert Overwrite Concurrency",
info="Optional concurrency level for bulk insert operations that overwrite existing data.",
advanced=True,
),
IntInput(
name="bulk_delete_concurrency",
display_name="Bulk Delete Concurrency",
info="Optional concurrency level for bulk delete operations.",
advanced=True,
),
DropdownInput(
name="setup_mode",
display_name="Setup Mode",
info="Configuration mode for setting up the vector store, with options like 'Sync' or 'Off'.",
options=["Sync", "Off"],
advanced=True,
value="Sync",
),
BoolInput(
name="pre_delete_collection",
display_name="Pre Delete Collection",
info="Boolean flag to determine whether to delete the collection before creating a new one.",
advanced=True,
),
StrInput(
name="metadata_indexing_include",
display_name="Metadata Indexing Include",
info="Optional list of metadata fields to include in the indexing.",
list=True,
advanced=True,
),
StrInput(
name="metadata_indexing_exclude",
display_name="Metadata Indexing Exclude",
info="Optional list of metadata fields to exclude from the indexing.",
list=True,
advanced=True,
),
StrInput(
name="collection_indexing_policy",
display_name="Collection Indexing Policy",
info='Optional JSON string for the "indexing" field of the collection. '
"See https://docs.datastax.com/en/astra-db-serverless/api-reference/collections.html#the-indexing-option",
advanced=True,
),
StrInput(
name="content_field",
display_name="Content Field",
@@ -251,6 +176,12 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
info="Boolean flag to determine whether to ignore invalid documents at runtime.",
advanced=True,
),
NestedDictInput(
name="astradb_vectorstore_kwargs",
display_name="AstraDBVectorStore Parameters",
info="Optional dictionary of additional parameters for the AstraDBVectorStore.",
advanced=True,
),
]
def del_fields(self, build_config, field_list):
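
For flows that used the deleted advanced inputs, the replacement is the NestedDictInput added just above: its contents are forwarded verbatim to the AstraDBVectorStore constructor via **self.astradb_vectorstore_kwargs (visible later in this diff). A migration sketch with illustrative values; the keys mirror the constructor arguments the old inputs were forwarded to, and setup_mode is the one to watch because the constructor takes a SetupMode enum rather than the old "Sync"/"Off" strings:

    from langchain_astradb.utils.astradb import SetupMode

    # Old dedicated inputs -> entries in the new kwargs dict.
    astradb_vectorstore_kwargs = {
        "metric": "cosine",
        "batch_size": 20,
        "bulk_insert_batch_concurrency": 10,
        "pre_delete_collection": False,
        "metadata_indexing_include": ["source", "title"],
        "setup_mode": SetupMode.SYNC,  # enum, not the old "Sync" string
    }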
@@ -586,7 +517,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
def build_vector_store(self, vectorize_options=None):
try:
from langchain_astradb import AstraDBVectorStore
from langchain_astradb.utils.astradb import SetupMode
except ImportError as e:
msg = (
"Could not import langchain Astra DB integration package. "
@@ -594,49 +524,14 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
)
raise ImportError(msg) from e
try:
if not self.setup_mode:
self.setup_mode = self._inputs["setup_mode"].options[0]
setup_mode_value = SetupMode[self.setup_mode.upper()]
except KeyError as e:
msg = f"Invalid setup mode: {self.setup_mode}"
raise ValueError(msg) from e
# Initialize parameters based on the collection name
is_new_collection = self.collection_name == "+ Create new collection"
# Build the list of autodetect parameters
autodetect_params = {
"autodetect": not is_new_collection,
"metric_value": self.metric if is_new_collection else None,
"metadata_indexing_include": (
[s for s in self.metadata_indexing_include if s] or None if is_new_collection else None
),
"metadata_indexing_exclude": (
[s for s in self.metadata_indexing_exclude if s] or None if is_new_collection else None
),
"collection_indexing_policy": (
orjson.dumps(self.collection_indexing_policy)
if is_new_collection and self.collection_indexing_policy
else None
),
"setup_mode": setup_mode_value if is_new_collection else None,
}
# Unpack parameters
autodetect = autodetect_params["autodetect"]
metric_value = autodetect_params["metric_value"]
metadata_indexing_include = autodetect_params["metadata_indexing_include"]
metadata_indexing_exclude = autodetect_params["metadata_indexing_exclude"]
collection_indexing_policy = autodetect_params["collection_indexing_policy"]
setup_mode = autodetect_params["setup_mode"]
# Get the embedding model
embedding_dict = {"embedding": self.embedding_model} if self.embedding_choice == "Embedding Model" else {}
embedding_params = {"embedding": self.embedding_model} if self.embedding_choice == "Embedding Model" else {}
# Use the embedding model if the choice is set to "Embedding Model"
if self.embedding_choice == "Astra Vectorize" and not autodetect:
if self.embedding_choice == "Astra Vectorize" and is_new_collection:
from astrapy.info import CollectionVectorServiceOptions
# Build the vectorize options dictionary
@@ -650,7 +545,7 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
)
# Set the embedding dictionary
embedding_dict = {
embedding_params = {
"collection_vector_service_options": CollectionVectorServiceOptions.from_dict(
dict_options.get("collection_vector_service_options")
),
@@ -670,29 +565,28 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
if os.getenv("LANGFLOW_HOST") is not None:
langflow_prefix = "ds-"
# Bundle up the auto-detect parameters
autodetect_params = {
"autodetect_collection": not is_new_collection, # TODO: May want to expose this option
"content_field": self.content_field or None,
"ignore_invalid_documents": self.ignore_invalid_documents,
}
# Attempt to build the Vector Store object
try:
vector_store = AstraDBVectorStore(
# Astra DB Authentication Parameters
token=self.token,
api_endpoint=self.api_endpoint,
namespace=self.keyspace or None,
collection_name=self.get_collection_choice(),
autodetect_collection=autodetect,
content_field=self.content_field or None,
ignore_invalid_documents=self.ignore_invalid_documents,
environment=environment,
metric=metric_value,
batch_size=self.batch_size or None,
bulk_insert_batch_concurrency=self.bulk_insert_batch_concurrency or None,
bulk_insert_overwrite_concurrency=self.bulk_insert_overwrite_concurrency or None,
bulk_delete_concurrency=self.bulk_delete_concurrency or None,
setup_mode=setup_mode,
pre_delete_collection=self.pre_delete_collection,
metadata_indexing_include=metadata_indexing_include,
metadata_indexing_exclude=metadata_indexing_exclude,
collection_indexing_policy=collection_indexing_policy,
# Astra DB Usage Tracking Parameters
ext_callers=[(f"{langflow_prefix}langflow", __version__)],
**embedding_dict,
# Astra DB Vector Store Parameters
**autodetect_params,
**embedding_params,
**self.astradb_vectorstore_kwargs,
)
except Exception as e:
msg = f"Error initializing AstraDBVectorStore: {e}"
@@ -730,9 +624,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
def _build_search_args(self):
query = self.search_input if isinstance(self.search_input, str) and self.search_input.strip() else None
search_filter = (
{k: v for k, v in self.search_filter.items() if k and v and k.strip()} if self.search_filter else None
)
if query:
args = {
@@ -741,7 +632,7 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
"k": self.number_of_results,
"score_threshold": self.search_score_threshold,
}
elif self.advanced_search_filter or search_filter:
elif self.advanced_search_filter:
args = {
"n": self.number_of_results,
}
@@ -749,11 +640,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
return {}
filter_arg = self.advanced_search_filter or {}
if search_filter:
self.log(self.log(f"`search_filter` is deprecated. Use `advanced_search_filter`. Cleaned: {search_filter}"))
filter_arg.update(search_filter)
if filter_arg:
args["filter"] = filter_arg

View file

@@ -1267,7 +1267,7 @@
"show": true,
"title_case": false,
"type": "code",
"value": "from langflow.base.data import BaseFileComponent\nfrom langflow.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data\nfrom langflow.io import BoolInput, IntInput\nfrom langflow.schema import Data\n\n\nclass FileComponent(BaseFileComponent):\n \"\"\"Handles loading and processing of individual or zipped text files.\n\n This component supports processing multiple valid files within a zip archive,\n resolving paths, validating file types, and optionally using multithreading for processing.\n \"\"\"\n\n display_name = \"File\"\n description = \"Load a file to be used in your project.\"\n icon = \"file-text\"\n name = \"File\"\n\n VALID_EXTENSIONS = TEXT_FILE_TYPES\n\n inputs = [\n *BaseFileComponent._base_inputs,\n BoolInput(\n name=\"use_multithreading\",\n display_name=\"[Deprecated] Use Multithreading\",\n advanced=True,\n value=True,\n info=\"Set 'Processing Concurrency' greater than 1 to enable multithreading.\",\n ),\n IntInput(\n name=\"concurrency_multithreading\",\n display_name=\"Processing Concurrency\",\n advanced=False,\n info=\"When multiple files are being processed, the number of files to process concurrently.\",\n value=1,\n ),\n ]\n\n outputs = [\n *BaseFileComponent._base_outputs,\n ]\n\n def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:\n \"\"\"Processes files either sequentially or in parallel, depending on concurrency settings.\n\n Args:\n file_list (list[BaseFileComponent.BaseFile]): List of files to process.\n\n Returns:\n list[BaseFileComponent.BaseFile]: Updated list of files with merged data.\n \"\"\"\n\n def process_file(file_path: str, *, silent_errors: bool = False) -> Data | None:\n \"\"\"Processes a single file and returns its Data object.\"\"\"\n try:\n return parse_text_file_to_data(file_path, silent_errors=silent_errors)\n except FileNotFoundError as e:\n msg = f\"File not found: {file_path}. Error: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n except Exception as e:\n msg = f\"Unexpected error processing {file_path}: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n\n if not file_list:\n self.log(\"No files to process.\")\n return file_list\n\n concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)\n file_count = len(file_list)\n\n parallel_processing_threshold = 2\n if concurrency < parallel_processing_threshold or file_count < parallel_processing_threshold:\n if file_count > 1:\n self.log(f\"Processing {file_count} files sequentially.\")\n processed_data = [process_file(str(file.path), silent_errors=self.silent_errors) for file in file_list]\n else:\n self.log(f\"Starting parallel processing of {file_count} files with concurrency: {concurrency}.\")\n file_paths = [str(file.path) for file in file_list]\n processed_data = parallel_load_data(\n file_paths,\n silent_errors=self.silent_errors,\n load_function=process_file,\n max_concurrency=concurrency,\n )\n\n # Use rollup_basefile_data to merge processed data with BaseFile objects\n return self.rollup_data(file_list, processed_data)\n"
"value": "from langflow.base.data import BaseFileComponent\nfrom langflow.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data\nfrom langflow.io import BoolInput, IntInput\nfrom langflow.schema import Data\n\n\nclass FileComponent(BaseFileComponent):\n \"\"\"Handles loading and processing of individual or zipped text files.\n\n This component supports processing multiple valid files within a zip archive,\n resolving paths, validating file types, and optionally using multithreading for processing.\n \"\"\"\n\n display_name = \"File\"\n description = \"Load a file to be used in your project.\"\n icon = \"file-text\"\n name = \"File\"\n\n VALID_EXTENSIONS = TEXT_FILE_TYPES\n\n inputs = [\n *BaseFileComponent._base_inputs,\n BoolInput(\n name=\"use_multithreading\",\n display_name=\"[Deprecated] Use Multithreading\",\n advanced=True,\n value=True,\n info=\"Set 'Processing Concurrency' greater than 1 to enable multithreading.\",\n ),\n IntInput(\n name=\"concurrency_multithreading\",\n display_name=\"Processing Concurrency\",\n advanced=False,\n info=\"When multiple files are being processed, the number of files to process concurrently.\",\n value=1,\n ),\n ]\n\n outputs = [\n *BaseFileComponent._base_outputs,\n ]\n\n def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:\n \"\"\"Processes files either sequentially or in parallel, depending on concurrency settings.\n\n Args:\n file_list (list[BaseFileComponent.BaseFile]): List of files to process.\n\n Returns:\n list[BaseFileComponent.BaseFile]: Updated list of files with merged data.\n \"\"\"\n\n def process_file(file_path: str, *, silent_errors: bool = False) -> Data | None:\n \"\"\"Processes a single file and returns its Data object.\"\"\"\n try:\n return parse_text_file_to_data(file_path, silent_errors=silent_errors)\n except FileNotFoundError as e:\n msg = f\"File not found: {file_path}. Error: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n except Exception as e:\n msg = f\"Unexpected error processing {file_path}: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n\n if not file_list:\n msg = \"No files to process.\"\n raise ValueError(msg)\n\n concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)\n file_count = len(file_list)\n\n parallel_processing_threshold = 2\n if concurrency < parallel_processing_threshold or file_count < parallel_processing_threshold:\n if file_count > 1:\n self.log(f\"Processing {file_count} files sequentially.\")\n processed_data = [process_file(str(file.path), silent_errors=self.silent_errors) for file in file_list]\n else:\n self.log(f\"Starting parallel processing of {file_count} files with concurrency: {concurrency}.\")\n file_paths = [str(file.path) for file in file_list]\n processed_data = parallel_load_data(\n file_paths,\n silent_errors=self.silent_errors,\n load_function=process_file,\n max_concurrency=concurrency,\n )\n\n # Use rollup_basefile_data to merge processed data with BaseFile objects\n return self.rollup_data(file_list, processed_data)\n"
},
"concurrency_multithreading": {
"_input_type": "IntInput",

File diff suppressed because one or more lines are too long