From 15ac75ecc51401687d422abea8ceb2a33e1283ac Mon Sep 17 00:00:00 2001 From: Gabriel Luiz Freitas Almeida Date: Fri, 23 Jun 2023 16:50:11 -0300 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A8=20refactor(loading.py):=20add=20su?= =?UTF-8?q?pport=20for=20filtering=20files=20by=20extension=20in=20documen?= =?UTF-8?q?t=20loader=20The=20`instantiate=5Fdocumentloader`=20function=20?= =?UTF-8?q?now=20supports=20filtering=20files=20by=20extension=20using=20a?= =?UTF-8?q?=20`file=5Ffilter`=20parameter.=20The=20parameter=20is=20a=20st?= =?UTF-8?q?ring=20of=20comma-separated=20extensions,=20and=20the=20functio?= =?UTF-8?q?n=20now=20converts=20it=20into=20a=20lambda=20function=20that?= =?UTF-8?q?=20filters=20files=20based=20on=20whether=20their=20name=20cont?= =?UTF-8?q?ains=20any=20of=20the=20specified=20extensions.=20This=20improv?= =?UTF-8?q?es=20the=20flexibility=20of=20the=20document=20loader=20by=20al?= =?UTF-8?q?lowing=20it=20to=20load=20only=20specific=20types=20of=20files.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../langflow/interface/initialize/loading.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/backend/langflow/interface/initialize/loading.py b/src/backend/langflow/interface/initialize/loading.py index 2d547c6d8..fa695f437 100644 --- a/src/backend/langflow/interface/initialize/loading.py +++ b/src/backend/langflow/interface/initialize/loading.py @@ -159,6 +159,19 @@ def instantiate_vectorstore(class_object, params): def instantiate_documentloader(class_object, params): + + + if "file_filter" in params: + # file_filter will be a string but we need a function + # that will be used to filter the files using file_filter + # like lambda x: x.endswith(".txt") but as we don't know + # anything besides the string, we will simply check if the string is + # in x and if it is, we will return True + file_filter = params.pop("file_filter", None) + extensions = file_filter.split(",") + params["file_filter"] = lambda x: any( + extension.strip() in x for extension in extensions + ) metadata = params.pop("metadata", None) docs = class_object(**params).load() if metadata: @@ -172,6 +185,7 @@ def instantiate_documentloader(class_object, params): for doc in docs: doc.metadata = metadata + return docs