feat: Add Unstructured Component to Document Loaders (#3308)

* FEAT: Add Unstructured component

* Update Unstructured.py

* [autofix.ci] apply automated fixes

* Switch to FileInput

* Ensure we import the unstructured component

* Small updates to unstructured comp

* Update Unstructured.py

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Eric Hare 2024-08-19 15:31:46 -07:00 committed by GitHub
commit 8accf4c64c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 60 additions and 1 deletions

View file

@ -0,0 +1,58 @@
import os
from typing import List
from langflow.custom import Component
from langflow.inputs import FileInput, SecretStrInput
from langflow.template import Output
from langflow.schema import Data
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredComponent(Component):
    """Document-loader component that parses one file with Unstructured.

    The component takes a file and an optional Unstructured API key, builds
    an ``UnstructuredFileLoader`` for the file, and exposes the loaded
    documents as a list of ``Data`` objects through its ``data`` output.
    """

    display_name = "Unstructured"
    description = "Unstructured data loader"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
    trace_type = "tool"
    icon = "Unstructured"
    name = "Unstructured"

    inputs = [
        FileInput(
            name="file",
            display_name="File",
            required=True,
            info="The path to the file with which you want to use Unstructured to parse",
            file_types=["pdf", "docx", "txt"],  # TODO: Support all unstructured file types
        ),
        SecretStrInput(
            name="api_key",
            display_name="API Key",
            required=False,
            info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    def build_unstructured(self) -> UnstructuredFileLoader:
        """Create the loader for the configured file.

        When an API key was supplied it is exported as the
        ``UNSTRUCTURED_API_KEY`` environment variable before the loader is
        constructed; the key is not passed to the loader directly.

        Returns:
            An ``UnstructuredFileLoader`` built from a one-element list
            containing ``self.file``.
        """
        # The API key input is optional. os.environ only accepts str values,
        # so assigning a missing key (None) would raise TypeError, and an
        # empty key would clobber a key already present in the environment.
        if self.api_key:
            os.environ["UNSTRUCTURED_API_KEY"] = self.api_key

        file_paths = [self.file]
        return UnstructuredFileLoader(file_paths)

    def load_documents(self) -> List[Data]:
        """Load the configured file and convert every document to ``Data``.

        The resulting list is also stored on ``self.status``.

        Returns:
            One ``Data`` object per document returned by the loader.
        """
        loader = self.build_unstructured()
        documents = loader.load()

        data = [Data.from_document(doc) for doc in documents]
        self.status = data
        return data

View file

@ -1,4 +1,5 @@
from .Confluence import ConfluenceComponent
from .GitLoader import GitLoaderComponent
from .Unstructured import UnstructuredComponent
# Export list without the Unstructured component; it is immediately replaced
# by the assignment on the next statement.
__all__ = ["ConfluenceComponent", "GitLoaderComponent"]
# Effective export list: also exposes UnstructuredComponent, imported above.
__all__ = ["ConfluenceComponent", "GitLoaderComponent", "UnstructuredComponent"]