🐛 fix(loading.py): handle case where metadata is an empty dict and document already has metadata

🐛 fix(loading.py): handle case where separator_type is not provided in params
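The first fix ensures that empty metadata (such as an empty dict) is never applied to the loaded documents. When metadata is provided, a document that has no metadata of its own receives it directly, while a document that already has metadata gets the new entries merged into it via `update` instead of having its metadata replaced. This prevents existing metadata from being discarded.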

The second fix handles the case where `separator_type` is not provided in the params. In this case, the text splitter is instantiated directly from `class_object` with the params as-is, the same way as when `separator_type` is "Text".
This commit is contained in:
Gabriel Luiz Freitas Almeida 2023-06-28 08:59:31 -03:00
commit 7364ba41f8

View file

@ -188,18 +188,22 @@ def instantiate_documentloader(class_object: Type[BaseLoader], params: Dict):
extension.strip() in x for extension in extensions
)
metadata = params.pop("metadata", None)
if metadata and isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except json.JSONDecodeError as exc:
raise ValueError(
"The metadata you provided is not a valid JSON string."
) from exc
docs = class_object(**params).load()
# Now if metadata is an empty dict, we will not add it to the documents
if metadata:
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except json.JSONDecodeError as exc:
raise ValueError(
"The metadata you provided is not a valid JSON string."
) from exc
for doc in docs:
doc.metadata = metadata
# If the document already has metadata, we will not overwrite it
if not doc.metadata:
doc.metadata = metadata
else:
doc.metadata.update(metadata)
return docs
@ -216,13 +220,16 @@ def instantiate_textsplitter(
"Try changing the chunk_size of the Text Splitter."
) from exc
if "separator_type" in params and params["separator_type"] == "Text":
if (
"separator_type" in params
and params["separator_type"] == "Text"
or "separator_type" not in params
):
text_splitter = class_object(**params)
else:
params["language"] = params.pop("separator_type", None)
params.pop("separators", None)
text_splitter = class_object.from_language(**params)
return text_splitter.split_documents(documents)