From bdd2076debb25ec2c0ddeadc2ebcdc8a8de67ff1 Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Wed, 21 Jun 2023 15:43:27 -0300 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=92=20chore(loading.py):=20remove=20co?= =?UTF-8?q?de=20that=20deletes=20files=20after=20loading=20in=20instantiat?= =?UTF-8?q?e=5Fdocumentloader=20function=20The=20save=5Fuploaded=5Ffile=20?= =?UTF-8?q?function=20now=20uses=20the=20folder=5Fname=20parameter=20inste?= =?UTF-8?q?ad=20of=20file=5Fname=20to=20improve=20semantics.=20The=20appdi?= =?UTF-8?q?rs=20library=20is=20now=20used=20to=20get=20the=20user=20cache?= =?UTF-8?q?=20directory=20and=20a=20folder=20for=20langflow=20cache=20is?= =?UTF-8?q?=20created.=20The=20sha256=20hash=20of=20the=20file=20content?= =?UTF-8?q?=20is=20now=20used=20as=20the=20file=20name=20to=20avoid=20coll?= =?UTF-8?q?isions=20and=20improve=20security.=20A=20folder=20is=20now=20cr?= =?UTF-8?q?eated=20for=20each=20flow=5Fid=20in=20the=20save=5Fuploaded=5Ff?= =?UTF-8?q?ile=20function.=20The=20code=20that=20deletes=20files=20after?= =?UTF-8?q?=20loading=20in=20the=20instantiate=5Fdocumentloader=20function?= =?UTF-8?q?=20has=20been=20removed=20as=20it=20is=20unnecessary=20and=20ca?= =?UTF-8?q?n=20cause=20issues.=20=F0=9F=90=9B=20fix(endpoints.py):=20chang?= =?UTF-8?q?e=20file=5Fname=20parameter=20to=20folder=5Fname=20in=20save=5F?= =?UTF-8?q?uploaded=5Ffile=20function=20=F0=9F=94=92=20chore(utils.py):=20?= =?UTF-8?q?use=20appdirs=20to=20get=20user=20cache=20directory=20and=20cre?= =?UTF-8?q?ate=20a=20folder=20for=20langflow=20cache=20=F0=9F=94=92=20chor?= =?UTF-8?q?e(utils.py):=20use=20sha256=20hash=20of=20file=20content=20as?= =?UTF-8?q?=20file=20name=20to=20avoid=20collisions=20and=20improve=20secu?= =?UTF-8?q?rity=20=F0=9F=94=92=20chore(utils.py):=20create=20folder=20for?= =?UTF-8?q?=20each=20flow=5Fid=20in=20save=5Fuploaded=5Ffile=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/backend/langflow/api/v1/endpoints.py | 2 +- src/backend/langflow/cache/utils.py | 43 +++++++++++++++++------ src/backend/langflow/interface/loading.py | 9 +---- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/src/backend/langflow/api/v1/endpoints.py b/src/backend/langflow/api/v1/endpoints.py index 10f2f4692..a8e5e5b76 100644 --- a/src/backend/langflow/api/v1/endpoints.py +++ b/src/backend/langflow/api/v1/endpoints.py @@ -62,7 +62,7 @@ async def predict_flow( async def create_upload_file(file: UploadFile, flow_id: str): # Cache file try: - file_path = save_uploaded_file(file.file, file_name=flow_id) + file_path = save_uploaded_file(file.file, folder_name=flow_id) return UploadFileResponse( flowId=flow_id, diff --git a/src/backend/langflow/cache/utils.py b/src/backend/langflow/cache/utils.py index 3fa95a3d6..3e456c3d4 100644 --- a/src/backend/langflow/cache/utils.py +++ b/src/backend/langflow/cache/utils.py @@ -8,15 +8,17 @@ import tempfile from collections import OrderedDict from pathlib import Path from typing import Any, Dict - +from appdirs import user_cache_dir CACHE: Dict[str, Any] = {} +CACHE_DIR = user_cache_dir("langflow", "langflow") + def create_cache_folder(func): def wrapper(*args, **kwargs): # Get the destination folder - cache_path = Path(tempfile.gettempdir()) / PREFIX + cache_path = Path(CACHE_DIR) / PREFIX # Create the destination folder if it doesn't exist os.makedirs(cache_path, exist_ok=True) @@ -118,7 +120,7 @@ def save_binary_file(content: str, file_name: str, accepted_types: list[str]) -> raise ValueError(f"File {file_name} is not accepted") # Get the destination folder - cache_path = Path(tempfile.gettempdir()) / PREFIX + cache_path = Path(CACHE_DIR) / PREFIX if not content: raise ValueError("Please, reload the file in the loader.") data = content.split(",")[1] @@ -135,23 +137,44 @@ def save_binary_file(content: str, file_name: str, accepted_types: list[str]) -> @create_cache_folder -def save_uploaded_file(file, file_name): +def save_uploaded_file(file, folder_name): """ - Save an uploaded file to the specified folder. + Save an uploaded file to the specified folder with a hash of its content as the file name. Args: file: The uploaded file object. - file_name: The name of the file, including its extension. + folder_name: The name of the folder to save the file in. Returns: The path to the saved file. """ - cache_path = Path(tempfile.gettempdir()) / PREFIX - file_path = cache_path / file_name + cache_path = Path(CACHE_DIR) + folder_path = cache_path / folder_name + # Create the folder if it doesn't exist + if not folder_path.exists(): + folder_path.mkdir() + + # Create a hash of the file content + sha256_hash = hashlib.sha256() + # Reset the file cursor to the beginning of the file + file.seek(0) + # Iterate over the uploaded file in small chunks to conserve memory + while chunk := file.read(8192): # Read 8KB at a time (adjust as needed) + sha256_hash.update(chunk) + + # Use the hex digest of the hash as the file name + hex_dig = sha256_hash.hexdigest() + file_extension = Path(file.filename).suffix + file_name = f"{hex_dig}{file_extension}" + + # Reset the file cursor to the beginning of the file + file.seek(0) + + # Save the file with the hash as its name + file_path = folder_path / file_name with open(file_path, "wb") as new_file: - # Iterate over the uploaded file in small chunks to conserve memory - while chunk := file.read(8192): # Read 8KB at a time (adjust as needed) + while chunk := file.read(8192): new_file.write(chunk) return file_path diff --git a/src/backend/langflow/interface/loading.py b/src/backend/langflow/interface/loading.py index af1b51678..a765d3b9b 100644 --- a/src/backend/langflow/interface/loading.py +++ b/src/backend/langflow/interface/loading.py @@ -11,7 +11,6 @@ from langchain.agents.load_tools import ( _EXTRA_OPTIONAL_TOOLS, _LLM_TOOLS, ) -from pathlib import Path from langchain.agents.loading import load_agent_from_config from langchain.agents.tools import Tool from langchain.base_language import BaseLanguageModel @@ -171,13 +170,7 @@ def instantiate_vectorstore(class_object, params): def instantiate_documentloader(class_object, params): - documents = class_object(**params).load() - # now that the file is loaded, we can remove the path - for value in params.values(): - path = Path(value) - if path.exists(): - path.unlink() - return documents + return class_object(**params).load() def instantiate_textsplitter(class_object, params):