feat: Switch to API for Unstructured Component (#3671)
* FEAT: Switch to API for Unstructured Component
* chore: new lock
---------
Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
This commit is contained in:
parent
d93c90760a
commit
de1fdff742
5 changed files with 44 additions and 40 deletions
|
|
@ -1,5 +1,3 @@
|
|||
import os
|
||||
|
||||
from typing import List
|
||||
|
||||
from langflow.custom import Component
|
||||
|
|
@ -7,12 +5,12 @@ from langflow.inputs import FileInput, SecretStrInput
|
|||
from langflow.template import Output
|
||||
from langflow.schema import Data
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
|
||||
|
||||
class UnstructuredComponent(Component):
|
||||
display_name = "Unstructured"
|
||||
description = "Unstructured data loader"
|
||||
description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT"
|
||||
documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
|
||||
trace_type = "tool"
|
||||
icon = "Unstructured"
|
||||
|
|
@ -23,14 +21,14 @@ class UnstructuredComponent(Component):
|
|||
name="file",
|
||||
display_name="File",
|
||||
required=True,
|
||||
info="The path to the file with which you want to use Unstructured to parse",
|
||||
info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT",
|
||||
file_types=["pdf", "docx", "txt"], # TODO: Support all unstructured file types
|
||||
),
|
||||
SecretStrInput(
|
||||
name="api_key",
|
||||
display_name="API Key",
|
||||
required=False,
|
||||
info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
|
||||
display_name="Unstructured.io Serverless API Key",
|
||||
required=True,
|
||||
info="Unstructured API Key. Create at: https://app.unstructured.io/",
|
||||
),
|
||||
]
|
||||
|
||||
|
|
@ -38,12 +36,14 @@ class UnstructuredComponent(Component):
|
|||
Output(name="data", display_name="Data", method="load_documents"),
|
||||
]
|
||||
|
||||
def build_unstructured(self) -> UnstructuredFileLoader:
|
||||
os.environ["UNSTRUCTURED_API_KEY"] = self.api_key
|
||||
|
||||
def build_unstructured(self) -> UnstructuredLoader:
|
||||
file_paths = [self.file]
|
||||
|
||||
loader = UnstructuredFileLoader(file_paths)
|
||||
loader = UnstructuredLoader(
|
||||
file_paths,
|
||||
api_key=self.api_key,
|
||||
partition_via_api=True,
|
||||
)
|
||||
|
||||
return loader
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ import json
|
|||
import os
|
||||
import shutil
|
||||
import time
|
||||
import nltk
|
||||
from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
|
|
@ -613,19 +612,3 @@ def initialize_super_user_if_needed():
|
|||
get_variable_service().initialize_user_variables(super_user.id, session)
|
||||
create_default_folder_if_it_doesnt_exist(session, super_user.id)
|
||||
logger.info("Super user initialized")
|
||||
|
||||
|
||||
# Function to download NLTK packages if not already downloaded
|
||||
def download_nltk_resources():
    """Fetch any of the listed NLTK data packages that cannot be found.

    Each package is looked up with ``nltk.data.find`` under its
    ``<category>/<package>`` path; when that lookup raises ``LookupError``
    the package is requested through ``nltk.download``.
    """
    # (category, package names) pairs, checked in this order.
    required = (
        ("corpora", ("wordnet",)),
        ("taggers", ("averaged_perceptron_tagger",)),
        ("tokenizers", ("punkt", "punkt_tab")),
    )

    for group, names in required:
        for name in names:
            resource_path = f"{group}/{name}"
            try:
                nltk.data.find(resource_path)
            except LookupError:
                # Lookup failed, so ask NLTK to download the package by name.
                nltk.download(name)
|
||||
|
|
|
|||
|
|
@ -180,9 +180,6 @@ def create_app():
|
|||
|
||||
FastAPIInstrumentor.instrument_app(app)
|
||||
|
||||
# Get necessary NLTK packages
|
||||
# download_nltk_resources()
|
||||
|
||||
return app
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue