From 8accf4c64c6b5f92feffbf2435a9dd1057239530 Mon Sep 17 00:00:00 2001
From: Eric Hare
Date: Mon, 19 Aug 2024 15:31:46 -0700
Subject: [PATCH] feat: Add Unstructured Component to Document Loaders (#3308)

* FEAT: Add Unstructured component

* Update Unstructured.py

* [autofix.ci] apply automated fixes

* Switch to FileInput

* Ensure we import the unstructured component

* Small updates to unstructured comp

* Update Unstructured.py

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
 .../documentloaders/Unstructured.py           | 79 +++++++++++++++++++
 .../components/documentloaders/__init__.py    |  3 +-
 2 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 src/backend/base/langflow/components/documentloaders/Unstructured.py

diff --git a/src/backend/base/langflow/components/documentloaders/Unstructured.py b/src/backend/base/langflow/components/documentloaders/Unstructured.py
new file mode 100644
index 000000000..5fdf05c3c
--- /dev/null
+++ b/src/backend/base/langflow/components/documentloaders/Unstructured.py
@@ -0,0 +1,79 @@
+import os
+
+from typing import List
+
+from langflow.custom import Component
+from langflow.inputs import FileInput, SecretStrInput
+from langflow.template import Output
+from langflow.schema import Data
+
+from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
+
+
+class UnstructuredComponent(Component):
+    """Document loader component that parses an uploaded file with Unstructured."""
+
+    display_name = "Unstructured"
+    description = "Unstructured data loader"
+    documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
+    trace_type = "tool"
+    icon = "Unstructured"
+    name = "Unstructured"
+
+    inputs = [
+        FileInput(
+            name="file",
+            display_name="File",
+            required=True,
+            info="The path to the file with which you want to use Unstructured to parse",
+            file_types=["pdf", "docx", "txt"],  # TODO: Support all unstructured file types
+        ),
+        SecretStrInput(
+            name="api_key",
+            display_name="API Key",
+            required=False,
+            info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
+        ),
+    ]
+
+    outputs = [
+        Output(name="data", display_name="Data", method="load_documents"),
+    ]
+
+    def build_unstructured(self) -> UnstructuredFileLoader:
+        """Build the Unstructured loader for the selected file.
+
+        Returns:
+            An ``UnstructuredFileLoader`` created from a one-element list
+            containing ``self.file``.
+
+        Raises:
+            ValueError: If no file has been provided.
+        """
+        if not self.file:
+            raise ValueError("A file is required to load documents with Unstructured.")
+
+        # The API key is optional and ``os.environ`` only accepts ``str`` values,
+        # so a missing key must never be assigned. Clearing the variable instead
+        # keeps a key exported by an earlier run from being reused silently.
+        # NOTE(review): confirm the loader actually reads UNSTRUCTURED_API_KEY.
+        if self.api_key:
+            os.environ["UNSTRUCTURED_API_KEY"] = self.api_key
+        else:
+            os.environ.pop("UNSTRUCTURED_API_KEY", None)
+
+        return UnstructuredFileLoader([self.file])
+
+    def load_documents(self) -> List[Data]:
+        """Load the file with Unstructured and convert each document to ``Data``.
+
+        Returns:
+            One ``Data`` object per loaded document; the same list is also
+            stored in ``self.status``.
+        """
+        documents = self.build_unstructured().load()
+        data = [Data.from_document(doc) for doc in documents]
+
+        self.status = data
+
+        return data
diff --git a/src/backend/base/langflow/components/documentloaders/__init__.py b/src/backend/base/langflow/components/documentloaders/__init__.py
index 8f18cd3db..0bed3869d 100644
--- a/src/backend/base/langflow/components/documentloaders/__init__.py
+++ b/src/backend/base/langflow/components/documentloaders/__init__.py
@@ -1,4 +1,5 @@
 from .Confluence import ConfluenceComponent
 from .GitLoader import GitLoaderComponent
+from .Unstructured import UnstructuredComponent
 
-__all__ = ["ConfluenceComponent", "GitLoaderComponent"]
+__all__ = ["ConfluenceComponent", "GitLoaderComponent", "UnstructuredComponent"]