diff --git a/src/backend/base/langflow/components/documentloaders/Unstructured.py b/src/backend/base/langflow/components/documentloaders/Unstructured.py
new file mode 100644
index 000000000..5fdf05c3c
--- /dev/null
+++ b/src/backend/base/langflow/components/documentloaders/Unstructured.py
@@ -0,0 +1,74 @@
+import os
+
+from typing import List
+
+from langflow.custom import Component
+from langflow.inputs import FileInput, SecretStrInput
+from langflow.template import Output
+from langflow.schema import Data
+
+from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
+
+
+class UnstructuredComponent(Component):
+    """Langflow component that parses a file with Unstructured.
+
+    The file chosen in the ``file`` input is passed to LangChain's
+    ``UnstructuredFileLoader``; each document it returns is converted to a
+    ``Data`` object.
+    """
+
+    display_name = "Unstructured"
+    description = "Unstructured data loader"
+    documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
+    trace_type = "tool"
+    icon = "Unstructured"
+    name = "Unstructured"
+
+    inputs = [
+        FileInput(
+            name="file",
+            display_name="File",
+            required=True,
+            info="The path to the file with which you want to use Unstructured to parse",
+            file_types=["pdf", "docx", "txt"],  # TODO: Support all unstructured file types
+        ),
+        SecretStrInput(
+            name="api_key",
+            display_name="API Key",
+            required=False,
+            info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
+        ),
+    ]
+
+    outputs = [
+        Output(name="data", display_name="Data", method="load_documents"),
+    ]
+
+    def build_unstructured(self) -> UnstructuredFileLoader:
+        """Create the loader for the configured file.
+
+        Returns:
+            An ``UnstructuredFileLoader`` for the path in ``self.file``.
+        """
+        # The API key input is optional. Export it only when it is set:
+        # os.environ rejects None with a TypeError, and writing an empty
+        # value would clobber a key already present in the environment.
+        if self.api_key:
+            os.environ["UNSTRUCTURED_API_KEY"] = self.api_key
+
+        return UnstructuredFileLoader([self.file])
+
+    def load_documents(self) -> List[Data]:
+        """Load the file and convert every parsed document to ``Data``.
+
+        Returns:
+            One ``Data`` per document returned by the loader. The same list
+            is also stored in ``self.status``.
+        """
+        documents = self.build_unstructured().load()
+        data = [Data.from_document(doc) for doc in documents]
+
+        self.status = data
+
+        return data
diff --git a/src/backend/base/langflow/components/documentloaders/__init__.py b/src/backend/base/langflow/components/documentloaders/__init__.py
index 8f18cd3db..0bed3869d 100644
--- a/src/backend/base/langflow/components/documentloaders/__init__.py
+++ b/src/backend/base/langflow/components/documentloaders/__init__.py
@@ -1,4 +1,5 @@
 from .Confluence import ConfluenceComponent
 from .GitLoader import GitLoaderComponent
+from .Unstructured import UnstructuredComponent
 
-__all__ = ["ConfluenceComponent", "GitLoaderComponent"]
+__all__ = ["ConfluenceComponent", "GitLoaderComponent", "UnstructuredComponent"]