feat: Switch to API for Unstructured Component (#3671)

* FEAT: Switch to API for Unstructured Component

* chore: new lock

---------

Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
This commit is contained in:
Eric Hare 2024-09-04 05:51:52 -07:00 committed by GitHub
commit de1fdff742
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 44 additions and 40 deletions

39
poetry.lock generated
View file

@ -1190,7 +1190,7 @@ testing = ["pytest (>=7.2.1)", "pytest-cov (>=4.0.0)", "tox (>=4.4.3)"]
name = "clickhouse-connect"
version = "0.7.19"
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
optional = true
optional = false
python-versions = "~=3.8"
files = [
{file = "clickhouse-connect-0.7.19.tar.gz", hash = "sha256:ce8f21f035781c5ef6ff57dc162e8150779c009b59f14030ba61f8c9c10c06d0"},
@ -5025,6 +5025,24 @@ files = [
[package.dependencies]
langchain-core = ">=0.2.10,<0.3.0"
[[package]]
name = "langchain-unstructured"
version = "0.1.2"
description = "An integration package connecting Unstructured and LangChain"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "langchain_unstructured-0.1.2-py3-none-any.whl", hash = "sha256:d8212adcbd9c2ca37ada10b65425042f7b609e9b0588279218bc54a8518ef3c3"},
{file = "langchain_unstructured-0.1.2.tar.gz", hash = "sha256:55f0a0de7829df37f5bc42f83b495fb879cf55b551e49a4ee4704a40614ed071"},
]
[package.dependencies]
langchain-core = ">=0.2.23,<0.3.0"
unstructured-client = ">=0.24.1,<0.25.0"
[package.extras]
local = ["unstructured[all-docs] (>=0.15.7,<0.16.0)"]
[[package]]
name = "langchainhub"
version = "0.1.21"
@ -5071,6 +5089,7 @@ asyncer = "^0.0.5"
bcrypt = "4.0.1"
cachetools = "^5.3.1"
chardet = "^5.2.0"
clickhouse-connect = "0.7.19"
crewai = "^0.36.0"
cryptography = "^42.0.5"
diskcache = "^5.6.3"
@ -5459,7 +5478,7 @@ source = ["Cython (>=3.0.11)"]
name = "lz4"
version = "4.3.3"
description = "LZ4 Bindings for Python"
optional = true
optional = false
python-versions = ">=3.8"
files = [
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
@ -5733,7 +5752,6 @@ python-versions = ">=3.7"
files = [
{file = "milvus_lite-2.4.9-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d3e617b3d68c09ad656d54bc3d8cc4ef6ef56c54015e1563d4fe4bcec6b7c90a"},
{file = "milvus_lite-2.4.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6e7029282d6829b277ebb92f64e2370be72b938e34770e1eb649346bda5d1d7f"},
{file = "milvus_lite-2.4.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9b8e991e4e433596f6a399a165c1a506f823ec9133332e03d7f8a114bff4550d"},
{file = "milvus_lite-2.4.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:7f53e674602101cfbcf0a4a59d19eaa139dfd5580639f3040ad73d901f24fc0b"},
]
@ -10588,6 +10606,11 @@ files = [
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
]
[package.dependencies]
@ -11040,13 +11063,13 @@ xlsx = ["networkx", "openpyxl", "pandas", "xlrd"]
[[package]]
name = "unstructured-client"
version = "0.25.5"
version = "0.24.1"
description = "Python Client SDK for Unstructured API"
optional = false
python-versions = ">=3.8"
files = [
{file = "unstructured-client-0.25.5.tar.gz", hash = "sha256:adb97ea56ce65f8b277d5b05f093e9d13a3320ac8dea7265ffa71f5e13ed5f84"},
{file = "unstructured_client-0.25.5-py3-none-any.whl", hash = "sha256:23537fee984e43d06a75f986a73e420a9659cc92010afb8324fbf67c85962eaf"},
{file = "unstructured-client-0.24.1.tar.gz", hash = "sha256:1bd82a532497783dd77b30ed4e56837d6abfae8cc6d61442acac0bcacbd568c8"},
{file = "unstructured_client-0.24.1-py3-none-any.whl", hash = "sha256:044dab0c3079f908f6adf7088ad44f0e17476b47e2b04e0de608134a482bd0e3"},
]
[package.dependencies]
@ -11973,7 +11996,7 @@ type = ["pytest-mypy"]
name = "zstandard"
version = "0.23.0"
description = "Zstandard bindings for Python"
optional = true
optional = false
python-versions = ">=3.8"
files = [
{file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"},
@ -12090,4 +12113,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
content-hash = "314d59d48b52557fae6ec8918e6cc6f7f4d3af0ddcdfc5737448eee5d28f9a0a"
content-hash = "5d520ff9f32b297bc10c0823992bfddbe240d2e5d38bb94e2c48fd110b909663"

View file

@ -109,6 +109,7 @@ bson = "^0.5.10"
lark = "^1.2.2"
jq = "^1.8.0"
clickhouse-connect = {version = "0.7.19", optional = true, extras = ["clickhouse-connect"]}
langchain-unstructured = "^0.1.2"
[tool.poetry.group.dev.dependencies]

View file

@ -1,5 +1,3 @@
import os
from typing import List
from langflow.custom import Component
@ -7,12 +5,12 @@ from langflow.inputs import FileInput, SecretStrInput
from langflow.template import Output
from langflow.schema import Data
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_unstructured import UnstructuredLoader
class UnstructuredComponent(Component):
display_name = "Unstructured"
description = "Unstructured data loader"
description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT"
documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
trace_type = "tool"
icon = "Unstructured"
@ -23,14 +21,14 @@ class UnstructuredComponent(Component):
name="file",
display_name="File",
required=True,
info="The path to the file with which you want to use Unstructured to parse",
info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT",
file_types=["pdf", "docx", "txt"], # TODO: Support all unstructured file types
),
SecretStrInput(
name="api_key",
display_name="API Key",
required=False,
info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
display_name="Unstructured.io Serverless API Key",
required=True,
info="Unstructured API Key. Create at: https://app.unstructured.io/",
),
]
@ -38,12 +36,14 @@ class UnstructuredComponent(Component):
Output(name="data", display_name="Data", method="load_documents"),
]
def build_unstructured(self) -> UnstructuredLoader:
    """Build the loader that sends the selected file to the Unstructured API.

    The API key is handed to the loader directly instead of being exported
    through ``os.environ``, so the secret is not written into process-wide
    state. Only one loader is constructed: the API-backed
    ``UnstructuredLoader`` with ``partition_via_api=True``.

    Returns:
        An ``UnstructuredLoader`` for ``[self.file]`` using ``self.api_key``.
    """
    file_paths = [self.file]
    return UnstructuredLoader(
        file_paths,
        api_key=self.api_key,
        partition_via_api=True,
    )

View file

@ -3,7 +3,6 @@ import json
import os
import shutil
import time
import nltk
from collections import defaultdict
from copy import deepcopy
from datetime import datetime, timezone
@ -613,19 +612,3 @@ def initialize_super_user_if_needed():
get_variable_service().initialize_user_variables(super_user.id, session)
create_default_folder_if_it_doesnt_exist(session, super_user.id)
logger.info("Super user initialized")
# Function to download NLTK packages if not already downloaded
def download_nltk_resources():
    """Make sure a fixed set of NLTK data packages is available locally.

    Each package is looked up under its category with ``nltk.data.find``;
    when that lookup raises ``LookupError`` the package is fetched with
    ``nltk.download``.
    """
    # (category, package) pairs, kept in the original lookup order.
    required = (
        ("corpora", "wordnet"),
        ("taggers", "averaged_perceptron_tagger"),
        ("tokenizers", "punkt"),
        ("tokenizers", "punkt_tab"),
    )
    for category, package in required:
        resource_path = f"{category}/{package}"
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

View file

@ -180,9 +180,6 @@ def create_app():
FastAPIInstrumentor.instrument_app(app)
# Get necessary NLTK packages
# download_nltk_resources()
return app