FIX: Clean up the advanced parameters in Astra DB Vector Store Component (#5298)
* fix: Clean up the list of params in AstraDB * Clean up some more parameters * Update Vector Store RAG.json * [autofix.ci] apply automated fixes * Update Vector Store RAG.json * [autofix.ci] apply automated fixes * Update Vector Store RAG.json * Update astradb.py * [autofix.ci] apply automated fixes * Update Vector Store RAG.json * [autofix.ci] apply automated fixes * Update Vector Store RAG.json * [autofix.ci] apply automated fixes * Error if no file provided * Fix base file value to be empty * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
ba75a15548
commit
a3d238c280
5 changed files with 240 additions and 715 deletions
|
|
@ -115,6 +115,7 @@ class BaseFileComponent(Component, ABC):
|
|||
fileTypes=[], # Dynamically set in __init__
|
||||
info="", # Dynamically set in __init__
|
||||
required=False,
|
||||
value="",
|
||||
),
|
||||
HandleInput(
|
||||
name="file_path",
|
||||
|
|
|
|||
|
|
@ -68,8 +68,8 @@ class FileComponent(BaseFileComponent):
|
|||
return None
|
||||
|
||||
if not file_list:
|
||||
self.log("No files to process.")
|
||||
return file_list
|
||||
msg = "No files to process."
|
||||
raise ValueError(msg)
|
||||
|
||||
concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)
|
||||
file_count = len(file_list)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
import orjson
|
||||
from astrapy import DataAPIClient
|
||||
from astrapy.admin import parse_api_endpoint
|
||||
from langchain_astradb import AstraDBVectorStore
|
||||
|
|
@ -113,15 +112,33 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
info="Optional keyspace within Astra DB to use for the collection.",
|
||||
advanced=True,
|
||||
),
|
||||
DropdownInput(
|
||||
name="embedding_choice",
|
||||
display_name="Embedding Model or Astra Vectorize",
|
||||
info="Determines whether to use Astra Vectorize for the collection.",
|
||||
options=["Embedding Model", "Astra Vectorize"],
|
||||
real_time_refresh=True,
|
||||
value="Embedding Model",
|
||||
),
|
||||
HandleInput(
|
||||
name="embedding_model",
|
||||
display_name="Embedding Model",
|
||||
input_types=["Embeddings"],
|
||||
info="Allows an embedding model configuration.",
|
||||
),
|
||||
DataInput(
|
||||
name="ingest_data",
|
||||
display_name="Ingest Data",
|
||||
),
|
||||
MultilineInput(
|
||||
name="search_input",
|
||||
display_name="Search Input",
|
||||
display_name="Search Query",
|
||||
tool_mode=True,
|
||||
),
|
||||
IntInput(
|
||||
name="number_of_results",
|
||||
display_name="Number of Results",
|
||||
info="Number of results to return.",
|
||||
display_name="Number of Search Results",
|
||||
info="Number of search results to return.",
|
||||
advanced=True,
|
||||
value=4,
|
||||
),
|
||||
|
|
@ -147,98 +164,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
info="Optional dictionary of filters to apply to the search query.",
|
||||
advanced=True,
|
||||
),
|
||||
DictInput(
|
||||
name="search_filter",
|
||||
display_name="[DEPRECATED] Search Metadata Filter",
|
||||
info="Deprecated: use advanced_search_filter. Optional dictionary of filters to apply to the search query.",
|
||||
advanced=True,
|
||||
list=True,
|
||||
),
|
||||
DataInput(
|
||||
name="ingest_data",
|
||||
display_name="Ingest Data",
|
||||
),
|
||||
DropdownInput(
|
||||
name="embedding_choice",
|
||||
display_name="Embedding Model or Astra Vectorize",
|
||||
info="Determines whether to use Astra Vectorize for the collection.",
|
||||
options=["Embedding Model", "Astra Vectorize"],
|
||||
real_time_refresh=True,
|
||||
value="Embedding Model",
|
||||
),
|
||||
HandleInput(
|
||||
name="embedding_model",
|
||||
display_name="Embedding Model",
|
||||
input_types=["Embeddings"],
|
||||
info="Allows an embedding model configuration.",
|
||||
),
|
||||
DropdownInput(
|
||||
name="metric",
|
||||
display_name="Metric",
|
||||
info="Optional distance metric for vector comparisons in the vector store.",
|
||||
options=["cosine", "dot_product", "euclidean"],
|
||||
value="cosine",
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="batch_size",
|
||||
display_name="Batch Size",
|
||||
info="Optional number of data to process in a single batch.",
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="bulk_insert_batch_concurrency",
|
||||
display_name="Bulk Insert Batch Concurrency",
|
||||
info="Optional concurrency level for bulk insert operations.",
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="bulk_insert_overwrite_concurrency",
|
||||
display_name="Bulk Insert Overwrite Concurrency",
|
||||
info="Optional concurrency level for bulk insert operations that overwrite existing data.",
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="bulk_delete_concurrency",
|
||||
display_name="Bulk Delete Concurrency",
|
||||
info="Optional concurrency level for bulk delete operations.",
|
||||
advanced=True,
|
||||
),
|
||||
DropdownInput(
|
||||
name="setup_mode",
|
||||
display_name="Setup Mode",
|
||||
info="Configuration mode for setting up the vector store, with options like 'Sync' or 'Off'.",
|
||||
options=["Sync", "Off"],
|
||||
advanced=True,
|
||||
value="Sync",
|
||||
),
|
||||
BoolInput(
|
||||
name="pre_delete_collection",
|
||||
display_name="Pre Delete Collection",
|
||||
info="Boolean flag to determine whether to delete the collection before creating a new one.",
|
||||
advanced=True,
|
||||
),
|
||||
StrInput(
|
||||
name="metadata_indexing_include",
|
||||
display_name="Metadata Indexing Include",
|
||||
info="Optional list of metadata fields to include in the indexing.",
|
||||
list=True,
|
||||
advanced=True,
|
||||
),
|
||||
StrInput(
|
||||
name="metadata_indexing_exclude",
|
||||
display_name="Metadata Indexing Exclude",
|
||||
info="Optional list of metadata fields to exclude from the indexing.",
|
||||
list=True,
|
||||
advanced=True,
|
||||
),
|
||||
StrInput(
|
||||
name="collection_indexing_policy",
|
||||
display_name="Collection Indexing Policy",
|
||||
info='Optional JSON string for the "indexing" field of the collection. '
|
||||
"See https://docs.datastax.com/en/astra-db-serverless/api-reference/collections.html#the-indexing-option",
|
||||
advanced=True,
|
||||
),
|
||||
StrInput(
|
||||
name="content_field",
|
||||
display_name="Content Field",
|
||||
|
|
@ -251,6 +176,12 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
info="Boolean flag to determine whether to ignore invalid documents at runtime.",
|
||||
advanced=True,
|
||||
),
|
||||
NestedDictInput(
|
||||
name="astradb_vectorstore_kwargs",
|
||||
display_name="AstraDBVectorStore Parameters",
|
||||
info="Optional dictionary of additional parameters for the AstraDBVectorStore.",
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
|
||||
def del_fields(self, build_config, field_list):
|
||||
|
|
@ -586,7 +517,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
def build_vector_store(self, vectorize_options=None):
|
||||
try:
|
||||
from langchain_astradb import AstraDBVectorStore
|
||||
from langchain_astradb.utils.astradb import SetupMode
|
||||
except ImportError as e:
|
||||
msg = (
|
||||
"Could not import langchain Astra DB integration package. "
|
||||
|
|
@ -594,49 +524,14 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
)
|
||||
raise ImportError(msg) from e
|
||||
|
||||
try:
|
||||
if not self.setup_mode:
|
||||
self.setup_mode = self._inputs["setup_mode"].options[0]
|
||||
|
||||
setup_mode_value = SetupMode[self.setup_mode.upper()]
|
||||
except KeyError as e:
|
||||
msg = f"Invalid setup mode: {self.setup_mode}"
|
||||
raise ValueError(msg) from e
|
||||
|
||||
# Initialize parameters based on the collection name
|
||||
is_new_collection = self.collection_name == "+ Create new collection"
|
||||
|
||||
# Build the list of autodetect parameters
|
||||
autodetect_params = {
|
||||
"autodetect": not is_new_collection,
|
||||
"metric_value": self.metric if is_new_collection else None,
|
||||
"metadata_indexing_include": (
|
||||
[s for s in self.metadata_indexing_include if s] or None if is_new_collection else None
|
||||
),
|
||||
"metadata_indexing_exclude": (
|
||||
[s for s in self.metadata_indexing_exclude if s] or None if is_new_collection else None
|
||||
),
|
||||
"collection_indexing_policy": (
|
||||
orjson.dumps(self.collection_indexing_policy)
|
||||
if is_new_collection and self.collection_indexing_policy
|
||||
else None
|
||||
),
|
||||
"setup_mode": setup_mode_value if is_new_collection else None,
|
||||
}
|
||||
|
||||
# Unpack parameters
|
||||
autodetect = autodetect_params["autodetect"]
|
||||
metric_value = autodetect_params["metric_value"]
|
||||
metadata_indexing_include = autodetect_params["metadata_indexing_include"]
|
||||
metadata_indexing_exclude = autodetect_params["metadata_indexing_exclude"]
|
||||
collection_indexing_policy = autodetect_params["collection_indexing_policy"]
|
||||
setup_mode = autodetect_params["setup_mode"]
|
||||
|
||||
# Get the embedding model
|
||||
embedding_dict = {"embedding": self.embedding_model} if self.embedding_choice == "Embedding Model" else {}
|
||||
embedding_params = {"embedding": self.embedding_model} if self.embedding_choice == "Embedding Model" else {}
|
||||
|
||||
# Use the embedding model if the choice is set to "Embedding Model"
|
||||
if self.embedding_choice == "Astra Vectorize" and not autodetect:
|
||||
if self.embedding_choice == "Astra Vectorize" and is_new_collection:
|
||||
from astrapy.info import CollectionVectorServiceOptions
|
||||
|
||||
# Build the vectorize options dictionary
|
||||
|
|
@ -650,7 +545,7 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
)
|
||||
|
||||
# Set the embedding dictionary
|
||||
embedding_dict = {
|
||||
embedding_params = {
|
||||
"collection_vector_service_options": CollectionVectorServiceOptions.from_dict(
|
||||
dict_options.get("collection_vector_service_options")
|
||||
),
|
||||
|
|
@ -670,29 +565,28 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
if os.getenv("LANGFLOW_HOST") is not None:
|
||||
langflow_prefix = "ds-"
|
||||
|
||||
# Bundle up the auto-detect parameters
|
||||
autodetect_params = {
|
||||
"autodetect_collection": not is_new_collection, # TODO: May want to expose this option
|
||||
"content_field": self.content_field or None,
|
||||
"ignore_invalid_documents": self.ignore_invalid_documents,
|
||||
}
|
||||
|
||||
# Attempt to build the Vector Store object
|
||||
try:
|
||||
vector_store = AstraDBVectorStore(
|
||||
# Astra DB Authentication Parameters
|
||||
token=self.token,
|
||||
api_endpoint=self.api_endpoint,
|
||||
namespace=self.keyspace or None,
|
||||
collection_name=self.get_collection_choice(),
|
||||
autodetect_collection=autodetect,
|
||||
content_field=self.content_field or None,
|
||||
ignore_invalid_documents=self.ignore_invalid_documents,
|
||||
environment=environment,
|
||||
metric=metric_value,
|
||||
batch_size=self.batch_size or None,
|
||||
bulk_insert_batch_concurrency=self.bulk_insert_batch_concurrency or None,
|
||||
bulk_insert_overwrite_concurrency=self.bulk_insert_overwrite_concurrency or None,
|
||||
bulk_delete_concurrency=self.bulk_delete_concurrency or None,
|
||||
setup_mode=setup_mode,
|
||||
pre_delete_collection=self.pre_delete_collection,
|
||||
metadata_indexing_include=metadata_indexing_include,
|
||||
metadata_indexing_exclude=metadata_indexing_exclude,
|
||||
collection_indexing_policy=collection_indexing_policy,
|
||||
# Astra DB Usage Tracking Parameters
|
||||
ext_callers=[(f"{langflow_prefix}langflow", __version__)],
|
||||
**embedding_dict,
|
||||
# Astra DB Vector Store Parameters
|
||||
**autodetect_params,
|
||||
**embedding_params,
|
||||
**self.astradb_vectorstore_kwargs,
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Error initializing AstraDBVectorStore: {e}"
|
||||
|
|
@ -730,9 +624,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
|
||||
def _build_search_args(self):
|
||||
query = self.search_input if isinstance(self.search_input, str) and self.search_input.strip() else None
|
||||
search_filter = (
|
||||
{k: v for k, v in self.search_filter.items() if k and v and k.strip()} if self.search_filter else None
|
||||
)
|
||||
|
||||
if query:
|
||||
args = {
|
||||
|
|
@ -741,7 +632,7 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
"k": self.number_of_results,
|
||||
"score_threshold": self.search_score_threshold,
|
||||
}
|
||||
elif self.advanced_search_filter or search_filter:
|
||||
elif self.advanced_search_filter:
|
||||
args = {
|
||||
"n": self.number_of_results,
|
||||
}
|
||||
|
|
@ -749,11 +640,6 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
|
|||
return {}
|
||||
|
||||
filter_arg = self.advanced_search_filter or {}
|
||||
|
||||
if search_filter:
|
||||
self.log(self.log(f"`search_filter` is deprecated. Use `advanced_search_filter`. Cleaned: {search_filter}"))
|
||||
filter_arg.update(search_filter)
|
||||
|
||||
if filter_arg:
|
||||
args["filter"] = filter_arg
|
||||
|
||||
|
|
|
|||
|
|
@ -1267,7 +1267,7 @@
|
|||
"show": true,
|
||||
"title_case": false,
|
||||
"type": "code",
|
||||
"value": "from langflow.base.data import BaseFileComponent\nfrom langflow.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data\nfrom langflow.io import BoolInput, IntInput\nfrom langflow.schema import Data\n\n\nclass FileComponent(BaseFileComponent):\n \"\"\"Handles loading and processing of individual or zipped text files.\n\n This component supports processing multiple valid files within a zip archive,\n resolving paths, validating file types, and optionally using multithreading for processing.\n \"\"\"\n\n display_name = \"File\"\n description = \"Load a file to be used in your project.\"\n icon = \"file-text\"\n name = \"File\"\n\n VALID_EXTENSIONS = TEXT_FILE_TYPES\n\n inputs = [\n *BaseFileComponent._base_inputs,\n BoolInput(\n name=\"use_multithreading\",\n display_name=\"[Deprecated] Use Multithreading\",\n advanced=True,\n value=True,\n info=\"Set 'Processing Concurrency' greater than 1 to enable multithreading.\",\n ),\n IntInput(\n name=\"concurrency_multithreading\",\n display_name=\"Processing Concurrency\",\n advanced=False,\n info=\"When multiple files are being processed, the number of files to process concurrently.\",\n value=1,\n ),\n ]\n\n outputs = [\n *BaseFileComponent._base_outputs,\n ]\n\n def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:\n \"\"\"Processes files either sequentially or in parallel, depending on concurrency settings.\n\n Args:\n file_list (list[BaseFileComponent.BaseFile]): List of files to process.\n\n Returns:\n list[BaseFileComponent.BaseFile]: Updated list of files with merged data.\n \"\"\"\n\n def process_file(file_path: str, *, silent_errors: bool = False) -> Data | None:\n \"\"\"Processes a single file and returns its Data object.\"\"\"\n try:\n return parse_text_file_to_data(file_path, silent_errors=silent_errors)\n except FileNotFoundError as e:\n msg = f\"File not found: {file_path}. Error: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n except Exception as e:\n msg = f\"Unexpected error processing {file_path}: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n\n if not file_list:\n self.log(\"No files to process.\")\n return file_list\n\n concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)\n file_count = len(file_list)\n\n parallel_processing_threshold = 2\n if concurrency < parallel_processing_threshold or file_count < parallel_processing_threshold:\n if file_count > 1:\n self.log(f\"Processing {file_count} files sequentially.\")\n processed_data = [process_file(str(file.path), silent_errors=self.silent_errors) for file in file_list]\n else:\n self.log(f\"Starting parallel processing of {file_count} files with concurrency: {concurrency}.\")\n file_paths = [str(file.path) for file in file_list]\n processed_data = parallel_load_data(\n file_paths,\n silent_errors=self.silent_errors,\n load_function=process_file,\n max_concurrency=concurrency,\n )\n\n # Use rollup_basefile_data to merge processed data with BaseFile objects\n return self.rollup_data(file_list, processed_data)\n"
|
||||
"value": "from langflow.base.data import BaseFileComponent\nfrom langflow.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data\nfrom langflow.io import BoolInput, IntInput\nfrom langflow.schema import Data\n\n\nclass FileComponent(BaseFileComponent):\n \"\"\"Handles loading and processing of individual or zipped text files.\n\n This component supports processing multiple valid files within a zip archive,\n resolving paths, validating file types, and optionally using multithreading for processing.\n \"\"\"\n\n display_name = \"File\"\n description = \"Load a file to be used in your project.\"\n icon = \"file-text\"\n name = \"File\"\n\n VALID_EXTENSIONS = TEXT_FILE_TYPES\n\n inputs = [\n *BaseFileComponent._base_inputs,\n BoolInput(\n name=\"use_multithreading\",\n display_name=\"[Deprecated] Use Multithreading\",\n advanced=True,\n value=True,\n info=\"Set 'Processing Concurrency' greater than 1 to enable multithreading.\",\n ),\n IntInput(\n name=\"concurrency_multithreading\",\n display_name=\"Processing Concurrency\",\n advanced=False,\n info=\"When multiple files are being processed, the number of files to process concurrently.\",\n value=1,\n ),\n ]\n\n outputs = [\n *BaseFileComponent._base_outputs,\n ]\n\n def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:\n \"\"\"Processes files either sequentially or in parallel, depending on concurrency settings.\n\n Args:\n file_list (list[BaseFileComponent.BaseFile]): List of files to process.\n\n Returns:\n list[BaseFileComponent.BaseFile]: Updated list of files with merged data.\n \"\"\"\n\n def process_file(file_path: str, *, silent_errors: bool = False) -> Data | None:\n \"\"\"Processes a single file and returns its Data object.\"\"\"\n try:\n return parse_text_file_to_data(file_path, silent_errors=silent_errors)\n except FileNotFoundError as e:\n msg = f\"File not found: {file_path}. Error: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n except Exception as e:\n msg = f\"Unexpected error processing {file_path}: {e}\"\n self.log(msg)\n if not silent_errors:\n raise\n return None\n\n if not file_list:\n msg = \"No files to process.\"\n raise ValueError(msg)\n\n concurrency = 1 if not self.use_multithreading else max(1, self.concurrency_multithreading)\n file_count = len(file_list)\n\n parallel_processing_threshold = 2\n if concurrency < parallel_processing_threshold or file_count < parallel_processing_threshold:\n if file_count > 1:\n self.log(f\"Processing {file_count} files sequentially.\")\n processed_data = [process_file(str(file.path), silent_errors=self.silent_errors) for file in file_list]\n else:\n self.log(f\"Starting parallel processing of {file_count} files with concurrency: {concurrency}.\")\n file_paths = [str(file.path) for file in file_list]\n processed_data = parallel_load_data(\n file_paths,\n silent_errors=self.silent_errors,\n load_function=process_file,\n max_concurrency=concurrency,\n )\n\n # Use rollup_basefile_data to merge processed data with BaseFile objects\n return self.rollup_data(file_list, processed_data)\n"
|
||||
},
|
||||
"concurrency_multithreading": {
|
||||
"_input_type": "IntInput",
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue