🔒 chore(loading.py): remove code that deletes files after loading in instantiate_documentloader function

The save_uploaded_file function now uses the folder_name parameter instead of file_name to improve semantics. The appdirs library is now used to get the user cache directory and a folder for langflow cache is created. The sha256 hash of the file content is now used as the file name to avoid collisions and improve security. A folder is now created for each flow_id in the save_uploaded_file function. The code that deletes files after loading in the instantiate_documentloader function has been removed as it is unnecessary and can cause issues.
🐛 fix(endpoints.py): change file_name parameter to folder_name in save_uploaded_file function
🔒 chore(utils.py): use appdirs to get user cache directory and create a folder for langflow cache
🔒 chore(utils.py): use sha256 hash of file content as file name to avoid collisions and improve security
🔒 chore(utils.py): create folder for each flow_id in save_uploaded_file function
This commit is contained in:
Gabriel Luiz Freitas Almeida 2023-06-21 15:43:27 -03:00
commit bdd2076deb
3 changed files with 35 additions and 19 deletions

View file

@ -62,7 +62,7 @@ async def predict_flow(
async def create_upload_file(file: UploadFile, flow_id: str):
# Cache file
try:
file_path = save_uploaded_file(file.file, file_name=flow_id)
file_path = save_uploaded_file(file.file, folder_name=flow_id)
return UploadFileResponse(
flowId=flow_id,

View file

@ -8,15 +8,17 @@ import tempfile
from collections import OrderedDict
from pathlib import Path
from typing import Any, Dict
from appdirs import user_cache_dir
CACHE: Dict[str, Any] = {}
CACHE_DIR = user_cache_dir("langflow", "langflow")
def create_cache_folder(func):
def wrapper(*args, **kwargs):
# Get the destination folder
cache_path = Path(tempfile.gettempdir()) / PREFIX
cache_path = Path(CACHE_DIR) / PREFIX
# Create the destination folder if it doesn't exist
os.makedirs(cache_path, exist_ok=True)
@ -118,7 +120,7 @@ def save_binary_file(content: str, file_name: str, accepted_types: list[str]) ->
raise ValueError(f"File {file_name} is not accepted")
# Get the destination folder
cache_path = Path(tempfile.gettempdir()) / PREFIX
cache_path = Path(CACHE_DIR) / PREFIX
if not content:
raise ValueError("Please, reload the file in the loader.")
data = content.split(",")[1]
@ -135,23 +137,44 @@ def save_binary_file(content: str, file_name: str, accepted_types: list[str]) ->
@create_cache_folder
def save_uploaded_file(file, folder_name):
    """
    Save an uploaded file under ``CACHE_DIR/<folder_name>``, naming it by the
    SHA-256 hash of its content.

    Content-addressed names avoid collisions between different uploads and
    make the stored name independent of any attacker-controlled filename;
    identical uploads naturally deduplicate to one cached file.

    Args:
        file: A binary file-like object supporting ``seek``/``read``.
            NOTE(review): the extension is taken from ``file.filename`` —
            plain file objects (e.g. ``UploadFile.file``) may not expose
            that attribute; confirm against the caller.
        folder_name: Name of the subfolder (e.g. a flow id) to save into.

    Returns:
        pathlib.Path: The path to the saved file.
    """
    folder_path = Path(CACHE_DIR) / folder_name
    # parents/exist_ok make this safe when CACHE_DIR is missing or when a
    # concurrent request creates the same folder (avoids the
    # check-then-create race of `if not exists(): mkdir()`).
    folder_path.mkdir(parents=True, exist_ok=True)

    # Hash the content in 8 KB chunks so memory use stays bounded even for
    # large uploads.
    sha256_hash = hashlib.sha256()
    file.seek(0)
    while chunk := file.read(8192):
        sha256_hash.update(chunk)

    # Content-addressed name: <sha256-hex><original extension>.
    file_extension = Path(file.filename).suffix
    file_name = f"{sha256_hash.hexdigest()}{file_extension}"

    # Rewind and copy the payload to its final location in the same
    # chunked fashion.
    file.seek(0)
    file_path = folder_path / file_name
    with open(file_path, "wb") as new_file:
        while chunk := file.read(8192):
            new_file.write(chunk)

    return file_path

View file

@ -11,7 +11,6 @@ from langchain.agents.load_tools import (
_EXTRA_OPTIONAL_TOOLS,
_LLM_TOOLS,
)
from pathlib import Path
from langchain.agents.loading import load_agent_from_config
from langchain.agents.tools import Tool
from langchain.base_language import BaseLanguageModel
@ -171,13 +170,7 @@ def instantiate_vectorstore(class_object, params):
def instantiate_documentloader(class_object, params):
    """Instantiate a document loader with ``params`` and return its documents.

    Args:
        class_object: The document-loader class to instantiate.
        params: Keyword arguments forwarded to the loader's constructor.

    Returns:
        The list of documents produced by the loader's ``load()`` method.

    Note:
        An earlier version deleted every path found in ``params`` after
        loading; that cleanup was removed so cached files remain available
        for subsequent loads.
    """
    return class_object(**params).load()
def instantiate_textsplitter(class_object, params):