🔒 chore(loading.py): remove code that deletes files after loading in instantiate_documentloader function

The save_uploaded_file function now uses the folder_name parameter instead of file_name to improve semantics. The appdirs library is now used to get the user cache directory and a folder for langflow cache is created. The sha256 hash of the file content is now used as the file name to avoid collisions and improve security. A folder is now created for each flow_id in the save_uploaded_file function. The code that deletes files after loading in the instantiate_documentloader function has been removed as it is unnecessary and can cause issues.
🐛 fix(endpoints.py): change file_name parameter to folder_name in save_uploaded_file function
🔒 chore(utils.py): use appdirs to get user cache directory and create a folder for langflow cache
🔒 chore(utils.py): use sha256 hash of file content as file name to avoid collisions and improve security
🔒 chore(utils.py): create folder for each flow_id in save_uploaded_file function
This commit is contained in:
Gabriel Luiz Freitas Almeida 2023-06-21 15:43:27 -03:00
commit bdd2076deb
3 changed files with 35 additions and 19 deletions

View file

@ -62,7 +62,7 @@ async def predict_flow(
async def create_upload_file(file: UploadFile, flow_id: str):
# Cache file
try:
file_path = save_uploaded_file(file.file, file_name=flow_id)
file_path = save_uploaded_file(file.file, folder_name=flow_id)
return UploadFileResponse(
flowId=flow_id,

View file

@ -8,15 +8,17 @@ import tempfile
from collections import OrderedDict
from pathlib import Path
from typing import Any, Dict
from appdirs import user_cache_dir
CACHE: Dict[str, Any] = {}
CACHE_DIR = user_cache_dir("langflow", "langflow")
def create_cache_folder(func):
def wrapper(*args, **kwargs):
# Get the destination folder
cache_path = Path(tempfile.gettempdir()) / PREFIX
cache_path = Path(CACHE_DIR) / PREFIX
# Create the destination folder if it doesn't exist
os.makedirs(cache_path, exist_ok=True)
@ -118,7 +120,7 @@ def save_binary_file(content: str, file_name: str, accepted_types: list[str]) ->
raise ValueError(f"File {file_name} is not accepted")
# Get the destination folder
cache_path = Path(tempfile.gettempdir()) / PREFIX
cache_path = Path(CACHE_DIR) / PREFIX
if not content:
raise ValueError("Please, reload the file in the loader.")
data = content.split(",")[1]
@ -135,23 +137,44 @@ def save_binary_file(content: str, file_name: str, accepted_types: list[str]) ->
@create_cache_folder
def save_uploaded_file(file, folder_name):
    """
    Save an uploaded file under ``CACHE_DIR/<folder_name>``, naming it by the
    SHA-256 hash of its content.

    Content-addressed names avoid collisions between different uploads and
    make the stored name independent of any attacker-controlled filename;
    identical uploads naturally deduplicate to one cached file.

    Args:
        file: A binary file-like object supporting ``seek``/``read``.
            NOTE(review): the extension is taken from ``file.filename`` —
            plain file objects (e.g. ``UploadFile.file``) may not expose
            that attribute; confirm against the caller.
        folder_name: Name of the subfolder (e.g. a flow id) to save into.

    Returns:
        pathlib.Path: The path to the saved file.
    """
    folder_path = Path(CACHE_DIR) / folder_name
    # parents/exist_ok make this safe when CACHE_DIR is missing or when a
    # concurrent request creates the same folder (avoids the
    # check-then-create race of `if not exists(): mkdir()`).
    folder_path.mkdir(parents=True, exist_ok=True)

    # Hash the content in 8 KB chunks so memory use stays bounded even for
    # large uploads.
    sha256_hash = hashlib.sha256()
    file.seek(0)
    while chunk := file.read(8192):
        sha256_hash.update(chunk)

    # Content-addressed name: <sha256-hex><original extension>.
    file_extension = Path(file.filename).suffix
    file_name = f"{sha256_hash.hexdigest()}{file_extension}"

    # Rewind and copy the payload to its final location in the same
    # chunked fashion.
    file.seek(0)
    file_path = folder_path / file_name
    with open(file_path, "wb") as new_file:
        while chunk := file.read(8192):
            new_file.write(chunk)

    return file_path

View file

@ -11,7 +11,6 @@ from langchain.agents.load_tools import (
_EXTRA_OPTIONAL_TOOLS,
_LLM_TOOLS,
)
from pathlib import Path
from langchain.agents.loading import load_agent_from_config
from langchain.agents.tools import Tool
from langchain.base_language import BaseLanguageModel
@ -171,13 +170,7 @@ def instantiate_vectorstore(class_object, params):
def instantiate_documentloader(class_object, params):
    """Instantiate a document loader with ``params`` and return its documents.

    Args:
        class_object: The document-loader class to instantiate.
        params: Keyword arguments forwarded to the loader's constructor.

    Returns:
        The list of documents produced by the loader's ``load()`` method.

    Note:
        An earlier version deleted every path found in ``params`` after
        loading; that cleanup was removed so cached files remain available
        for subsequent loads.
    """
    return class_object(**params).load()
def instantiate_textsplitter(class_object, params):