feat: Add Unstructured Component to Document Loaders (#3308)
* FEAT: Add Unstructured component * Update Unstructured.py * [autofix.ci] apply automated fixes * Switch to FileInput * Ensure we import the unstructured component * Small updates to unstructured comp * Update Unstructured.py --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
d4a1111974
commit
8accf4c64c
2 changed files with 60 additions and 1 deletions
|
|
@ -0,0 +1,58 @@
|
|||
import os
|
||||
|
||||
from typing import List
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.inputs import FileInput, SecretStrInput
|
||||
from langflow.template import Output
|
||||
from langflow.schema import Data
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
|
||||
class UnstructuredComponent(Component):
    """Langflow component that parses a single file with Unstructured.

    The selected file is handed to LangChain's ``UnstructuredFileLoader`` and
    every document the loader returns is converted to a ``Data`` object.
    """

    display_name = "Unstructured"
    description = "Unstructured data loader"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
    trace_type = "tool"
    icon = "Unstructured"
    name = "Unstructured"

    inputs = [
        # File to parse; read back in ``build_unstructured`` as ``self.file``.
        FileInput(
            name="file",
            display_name="File",
            required=True,
            info="The path to the file with which you want to use Unstructured to parse",
            file_types=["pdf", "docx", "txt"],  # TODO: Support all unstructured file types
        ),
        # Optional key; read back in ``build_unstructured`` as ``self.api_key``.
        SecretStrInput(
            name="api_key",
            display_name="API Key",
            required=False,
            info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    def build_unstructured(self) -> UnstructuredFileLoader:
        """Create the loader for the configured file.

        Returns:
            An ``UnstructuredFileLoader`` built for ``[self.file]``.

        Raises:
            ValueError: If no file has been provided.
        """
        if not self.file:
            raise ValueError("A file is required to use the Unstructured component.")

        # The API key is optional. Assigning ``None`` to ``os.environ`` raises
        # ``TypeError``, so only export the key when one was actually supplied,
        # and clear the variable otherwise so that a key left over from an
        # earlier run is not picked up by accident.
        api_key = self.api_key
        if api_key:
            os.environ["UNSTRUCTURED_API_KEY"] = api_key
        else:
            os.environ.pop("UNSTRUCTURED_API_KEY", None)

        return UnstructuredFileLoader([self.file])

    def load_documents(self) -> List[Data]:
        """Load the configured file and return its contents as ``Data`` objects.

        Returns:
            One ``Data`` per document produced by the loader.
        """
        loader = self.build_unstructured()
        documents = loader.load()

        # Convert each loaded document with Data.from_document.
        data = [Data.from_document(doc) for doc in documents]

        # Keep the result on the component as well as returning it
        # (presumably what the UI displays for this node - TODO confirm).
        self.status = data
        return data
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
from .Confluence import ConfluenceComponent
|
||||
from .GitLoader import GitLoaderComponent
|
||||
from .Unstructured import UnstructuredComponent
|
||||
|
||||
__all__ = ["ConfluenceComponent", "GitLoaderComponent"]
|
||||
# Names re-exported by this package; UnstructuredComponent is listed alongside the existing loaders.
__all__ = ["ConfluenceComponent", "GitLoaderComponent", "UnstructuredComponent"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue