🔨 refactor(loading.py): add support for filtering files by extension in document loader
The `instantiate_documentloader` function now supports filtering files by extension through a `file_filter` parameter. The parameter is a string of comma-separated extensions; the function converts it into a lambda that accepts a file when its name contains any of the specified extensions (a substring match, not a strict suffix check). This makes the document loader more flexible by allowing it to load only specific types of files.
This commit is contained in:
parent
96bab94b14
commit
15ac75ecc5
1 changed file with 14 additions and 0 deletions
|
|
@ -159,6 +159,19 @@ def instantiate_vectorstore(class_object, params):
|
|||
|
||||
|
||||
def instantiate_documentloader(class_object, params):
|
||||
|
||||
|
||||
if "file_filter" in params:
|
||||
# file_filter will be a string but we need a function
|
||||
# that will be used to filter the files using file_filter
|
||||
# like lambda x: x.endswith(".txt") but as we don't know
|
||||
# anything besides the string, we will simply check if the string is
|
||||
# in x and if it is, we will return True
|
||||
file_filter = params.pop("file_filter", None)
|
||||
extensions = file_filter.split(",")
|
||||
params["file_filter"] = lambda x: any(
|
||||
extension.strip() in x for extension in extensions
|
||||
)
|
||||
metadata = params.pop("metadata", None)
|
||||
docs = class_object(**params).load()
|
||||
if metadata:
|
||||
|
|
@ -172,6 +185,7 @@ def instantiate_documentloader(class_object, params):
|
|||
|
||||
for doc in docs:
|
||||
doc.metadata = metadata
|
||||
|
||||
return docs
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue