feat: Switch to API for Unstructured Component (#3671)

* FEAT: Switch to API for Unstructured Component

* chore: new lock

---------

Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
This commit is contained in:
Eric Hare 2024-09-04 05:51:52 -07:00 committed by GitHub
commit de1fdff742
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 44 additions and 40 deletions

View file

@ -1,5 +1,3 @@
import os
from typing import List
from langflow.custom import Component
@ -7,12 +5,12 @@ from langflow.inputs import FileInput, SecretStrInput
from langflow.template import Output
from langflow.schema import Data
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_unstructured import UnstructuredLoader
class UnstructuredComponent(Component):
display_name = "Unstructured"
description = "Unstructured data loader"
description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT"
documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
trace_type = "tool"
icon = "Unstructured"
@ -23,14 +21,14 @@ class UnstructuredComponent(Component):
name="file",
display_name="File",
required=True,
info="The path to the file with which you want to use Unstructured to parse",
info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT",
file_types=["pdf", "docx", "txt"], # TODO: Support all unstructured file types
),
SecretStrInput(
name="api_key",
display_name="API Key",
required=False,
info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
display_name="Unstructured.io Serverless API Key",
required=True,
info="Unstructured API Key. Create at: https://app.unstructured.io/",
),
]
@ -38,12 +36,14 @@ class UnstructuredComponent(Component):
Output(name="data", display_name="Data", method="load_documents"),
]
def build_unstructured(self) -> UnstructuredFileLoader:
os.environ["UNSTRUCTURED_API_KEY"] = self.api_key
def build_unstructured(self) -> UnstructuredLoader:
file_paths = [self.file]
loader = UnstructuredFileLoader(file_paths)
loader = UnstructuredLoader(
file_paths,
api_key=self.api_key,
partition_via_api=True,
)
return loader

View file

@ -3,7 +3,6 @@ import json
import os
import shutil
import time
import nltk
from collections import defaultdict
from copy import deepcopy
from datetime import datetime, timezone
@ -613,19 +612,3 @@ def initialize_super_user_if_needed():
get_variable_service().initialize_user_variables(super_user.id, session)
create_default_folder_if_it_doesnt_exist(session, super_user.id)
logger.info("Super user initialized")
# Function to download NLTK packages if not already downloaded
def download_nltk_resources():
    """Make sure the NLTK data packages listed below are available locally.

    Each package is looked up under ``<category>/<package>`` with
    ``nltk.data.find``; when that lookup raises ``LookupError`` the package
    is fetched with ``nltk.download``.
    """
    # (category, package) pairs, in the order they are checked.
    required_resources = (
        ("corpora", "wordnet"),
        ("taggers", "averaged_perceptron_tagger"),
        ("tokenizers", "punkt"),
        ("tokenizers", "punkt_tab"),
    )
    for group, resource_name in required_resources:
        resource_path = f"{group}/{resource_name}"
        try:
            nltk.data.find(resource_path)
        except LookupError:
            # Not found locally: download by bare package name.
            nltk.download(resource_name)

View file

@ -180,9 +180,6 @@ def create_app():
FastAPIInstrumentor.instrument_app(app)
# Get necessary NLTK packages
# download_nltk_resources()
return app