feat: Astra DB chunk deletion based on a metadata field (#5537)

* feat: Add deletion_field parameter to AstraDBVectorStoreComponent for document management

- Introduced a new 'deletion_field' input to specify a metadata field for deleting documents before loading new data.
- Enhanced the _add_documents_to_vector_store method to handle document deletion based on the specified field, improving data management capabilities.

* Merging with main

* [autofix.ci] apply automated fixes

* [autofix.ci] apply automated fixes

* Refine the deletion_field info string and deletion logic

- Enhanced the info string for the 'deletion_field' parameter to improve readability.
- Optimized the deletion logic by using a set comprehension to eliminate duplicates when gathering delete values from documents.

* [autofix.ci] apply automated fixes

* Update src/backend/base/langflow/components/vectorstores/astradb.py

Co-authored-by: Madhavan <msmygit@users.noreply.github.com>

* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Hare <ericrhare@gmail.com>
Co-authored-by: Madhavan <msmygit@users.noreply.github.com>
This commit is contained in:
Samuel Matioli 2025-01-08 15:18:54 -03:00 committed by GitHub
commit 3df81309bf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 60 additions and 3 deletions

View file

@ -140,6 +140,14 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
info="Field to use as the text content field for the vector store.",
advanced=True,
),
StrInput(
name="deletion_field",
display_name="Deletion Based On Field",
info="When this parameter is provided, documents in the target collection with "
"metadata field values matching the input metadata field value will be deleted "
"before new data is loaded.",
advanced=True,
),
BoolInput(
name="ignore_invalid_documents",
display_name="Ignore Invalid Documents",
@ -569,7 +577,8 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
# Bundle up the auto-detect parameters
autodetect_params = {
"autodetect_collection": not is_new_collection, # TODO: May want to expose this option
# TODO: May want to expose this option
"autodetect_collection": not is_new_collection,
"content_field": self.content_field or None,
"ignore_invalid_documents": self.ignore_invalid_documents,
}
@ -607,6 +616,18 @@ class AstraDBVectorStoreComponent(LCVectorStoreComponent):
msg = "Vector Store Inputs must be Data objects."
raise TypeError(msg)
if documents and self.deletion_field:
self.log(f"Deleting documents where {self.deletion_field}")
try:
database = self.get_database()
collection = database.get_collection(self.get_collection_choice(), keyspace=self.keyspace or None)
delete_values = list({doc.metadata[self.deletion_field] for doc in documents})
self.log(f"Deleting documents where {self.deletion_field} matches {delete_values}.")
collection.delete_many({f"metadata.{self.deletion_field}": {"$in": delete_values}})
except Exception as e:
msg = f"Error deleting documents from AstraDBVectorStore based on '{self.deletion_field}': {e}"
raise ValueError(msg) from e
if documents:
self.log(f"Adding {len(documents)} documents to the Vector Store.")
try:

File diff suppressed because one or more lines are too long