Merge remote-tracking branch 'origin/cz/mergeAll' into fix/minor_ui_adjustments

2024-06-10 16:08:18 -03:00 · 2024-06-10 16:08:18 -03:00 · 8b209c2b1b
commit 8b209c2b1b
parent f32277c1ce e6fefa680d
43 changed files with 173 additions and 180 deletions
--- a/src/backend/base/langflow/api/v1/files.py
+++ b/src/backend/base/langflow/api/v1/files.py
@ -110,7 +110,7 @@ async def download_profile_picture(
        extension = file_name.split(".")[-1]
        config_dir = get_storage_service().settings_service.settings.config_dir
        config_path = Path(config_dir)
-        folder_path = config_path / 'profile_pictures' / folder_name
+        folder_path = config_path / "profile_pictures" / folder_name
        content_type = build_content_type_from_extension(extension)
        file_content = await storage_service.get_file(flow_id=folder_path, file_name=file_name)
        return StreamingResponse(BytesIO(file_content), media_type=content_type)
@ -140,7 +140,6 @@ async def list_profile_pictures(storage_service: StorageService = Depends(get_st
        raise HTTPException(status_code=500, detail=str(e))


-
@router.get("/list/{flow_id}")
 async def list_files(
    flow_id: UUID = Depends(get_flow_id), storage_service: StorageService = Depends(get_storage_service)
--- a/src/backend/base/langflow/base/vectorstores/utils.py
+++ b/src/backend/base/langflow/base/vectorstores/utils.py
@ -15,7 +15,7 @@ def chroma_collection_to_records(collection_dict: dict):
    for i, doc in enumerate(collection_dict["documents"]):
        record_dict = {
            "id": collection_dict["ids"][i],
-            "document": doc,
+            "text": doc,
        }
        if "metadatas" in collection_dict:
            for key, value in collection_dict["metadatas"][i].items():
--- a/src/backend/base/langflow/components/retrievers/SelfQueryRetriever.py
+++ b/src/backend/base/langflow/components/retrievers/SelfQueryRetriever.py
@ -4,7 +4,7 @@ from langchain.retrievers.self_query.base import SelfQueryRetriever
 from langchain_core.vectorstores import VectorStore

 from langflow.custom import CustomComponent
-from langflow.field_typing import BaseLanguageModel
+from langflow.field_typing import BaseLanguageModel, Text
 from langflow.schema import Record
 from langflow.schema.message import Message

@ -14,25 +14,54 @@ class SelfQueryRetrieverComponent(CustomComponent):
    description: str = "Retriever that uses a vector store and an LLM to generate the vector store queries."
    icon = "LangChain"

+    def build_config(self):  # field metadata, keyed by the build() parameter names
+        return {
+            "query": {
+                "display_name": "Query",
+                "input_types": ["Message", "Text"],  # build() accepts a Message or a plain str
+                "info": "Query to be passed as input.",
+            },
+            "vectorstore": {
+                "display_name": "Vector Store",
+                "info": "Vector Store to be passed as input.",
+            },
+            "attribute_infos": {  # build() turns each entry into an AttributeInfo
+                "display_name": "Metadata Field Info",
+                "info": "Metadata Field Info to be passed as input.",
+            },
+            "document_content_description": {
+                "display_name": "Document Content Description",
+                "info": "Document Content Description to be passed as input.",
+            },
+            "llm": {
+                "display_name": "LLM",
+                "info": "LLM to be passed as input.",
+            },
+        }
+
    def build(
        self,
        query: Message,
        vectorstore: VectorStore,
-        metadata_field_info: list[AttributeInfo],
-        document_content_description: str,
+        attribute_infos: list[Record],
+        document_content_description: Text,
        llm: BaseLanguageModel,
    ) -> Record:
-        metadata_field_info = [i[0] for i in metadata_field_info]
-
+        metadata_field_infos = [AttributeInfo(**record.data) for record in attribute_infos]
        self_query_retriever = SelfQueryRetriever.from_llm(
-            llm,
-            vectorstore,
-            document_content_description,
-            metadata_field_info,
+            llm=llm,
+            vectorstore=vectorstore,
+            document_contents=document_content_description,
+            metadata_field_info=metadata_field_infos,
            enable_limit=True,
        )

-        input_text = query.text
+        if isinstance(query, Message):
+            input_text = query.text
+        elif isinstance(query, str):
+            input_text = query
+        else:
+            raise ValueError(f"Query type {type(query)} not supported.")
        documents = self_query_retriever.invoke(input=input_text)
        records = [Record.from_document(document) for document in documents]
        self.status = records
--- a/src/backend/base/langflow/components/vectorstores/Chroma.py
+++ b/src/backend/base/langflow/components/vectorstores/Chroma.py
@ -1,3 +1,4 @@
+from copy import deepcopy
 from typing import List, Optional, Union

 import chromadb
@ -6,6 +7,7 @@ from langchain_chroma import Chroma
 from langchain_core.embeddings import Embeddings
 from langchain_core.retrievers import BaseRetriever
 from langchain_core.vectorstores import VectorStore
+
 from langflow.base.vectorstores.utils import chroma_collection_to_records
 from langflow.custom import CustomComponent
 from langflow.schema import Record
@ -48,6 +50,11 @@ class ChromaComponent(CustomComponent):
                "display_name": "Server SSL Enabled",
                "advanced": True,
            },
+            "allow_duplicates": {
+                "display_name": "Allow Duplicates",
+                "advanced": True,
+                "info": "If false, will not add documents that are already in the Vector Store.",
+            },
        }

    def build(
@ -61,6 +68,7 @@ class ChromaComponent(CustomComponent):
        chroma_server_host: Optional[str] = None,
        chroma_server_http_port: Optional[int] = None,
        chroma_server_grpc_port: Optional[int] = None,
+        allow_duplicates: bool = False,
    ) -> Union[VectorStore, BaseRetriever]:
        """
        Builds the Vector Store or BaseRetriever object.
@ -75,6 +83,7 @@ class ChromaComponent(CustomComponent):
        - chroma_server_host (Optional[str]): The host for the Chroma server.
        - chroma_server_http_port (Optional[int]): The HTTP port for the Chroma server.
        - chroma_server_grpc_port (Optional[int]): The gRPC port for the Chroma server.
+        - allow_duplicates (bool): Whether to allow duplicates in the Vector Store.

        Returns:
        - Union[VectorStore, BaseRetriever]: The Vector Store or BaseRetriever object.
@ -93,35 +102,34 @@ class ChromaComponent(CustomComponent):
            )
            client = chromadb.HttpClient(settings=chroma_settings)

-        # If documents, then we need to create a Chroma instance using .from_documents
-
        # Check index_directory and expand it if it is a relative path
        if index_directory is not None:
            index_directory = self.resolve_path(index_directory)

+        chroma = Chroma(
+            persist_directory=index_directory,
+            client=client,
+            embedding_function=embedding,
+            collection_name=collection_name,
+        )
+        _stored_documents_without_id = []  # always bound: the input loop reads it in both modes
+        if allow_duplicates:
+            stored_records = []  # nothing is fetched, so no input is filtered out
+        else:
+            stored_records = chroma_collection_to_records(chroma.get())
+            for record in deepcopy(stored_records):  # copy so stored_records keep their ids
+                del record.id  # compare without the id in the membership test on inputs
+                _stored_documents_without_id.append(record)
        documents = []
        for _input in inputs or []:
            if isinstance(_input, Record):
-                documents.append(_input.to_lc_document())
+                if _input not in _stored_documents_without_id:
+                    documents.append(_input.to_lc_document())
            else:
-                documents.append(_input)
-        if documents is not None and embedding is not None:
-            if len(documents) == 0:
-                raise ValueError("If documents are provided, there must be at least one document.")
-            chroma = Chroma.from_documents(
-                documents=documents,  # type: ignore
-                persist_directory=index_directory,
-                collection_name=collection_name,
-                embedding=embedding,
-                client=client,
-            )
-        else:
-            chroma = Chroma(
-                persist_directory=index_directory,
-                client=client,
-                embedding_function=embedding,
-            )
+                raise ValueError("Inputs must be a Record objects.")

-        store = chroma.get()
-        self.status = chroma_collection_to_records(store)
+        if documents and embedding is not None:
+            chroma.add_documents(documents)
+
+        self.status = stored_records
        return chroma
--- a/src/backend/base/langflow/helpers/folders.py
+++ b/src/backend/base/langflow/helpers/folders.py
@ -20,4 +20,4 @@ def generate_unique_folder_name(folder_name, user_id, session):

        # If a folder with the name already exists, append (n) to the name and increment n
        folder_name = f"{original_name} ({n})"
-        n += 1
+        n += 1