feat: Switch to API for Unstructured Component (#3671)
* FEAT: Switch to API for Unstructured Component * chore: new lock --------- Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
This commit is contained in:
parent
d93c90760a
commit
de1fdff742
5 changed files with 44 additions and 40 deletions
39
poetry.lock
generated
39
poetry.lock
generated
|
|
@ -1190,7 +1190,7 @@ testing = ["pytest (>=7.2.1)", "pytest-cov (>=4.0.0)", "tox (>=4.4.3)"]
|
|||
name = "clickhouse-connect"
|
||||
version = "0.7.19"
|
||||
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
|
||||
optional = true
|
||||
optional = false
|
||||
python-versions = "~=3.8"
|
||||
files = [
|
||||
{file = "clickhouse-connect-0.7.19.tar.gz", hash = "sha256:ce8f21f035781c5ef6ff57dc162e8150779c009b59f14030ba61f8c9c10c06d0"},
|
||||
|
|
@ -5025,6 +5025,24 @@ files = [
|
|||
[package.dependencies]
|
||||
langchain-core = ">=0.2.10,<0.3.0"
|
||||
|
||||
[[package]]
|
||||
name = "langchain-unstructured"
|
||||
version = "0.1.2"
|
||||
description = "An integration package connecting Unstructured and LangChain"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "langchain_unstructured-0.1.2-py3-none-any.whl", hash = "sha256:d8212adcbd9c2ca37ada10b65425042f7b609e9b0588279218bc54a8518ef3c3"},
|
||||
{file = "langchain_unstructured-0.1.2.tar.gz", hash = "sha256:55f0a0de7829df37f5bc42f83b495fb879cf55b551e49a4ee4704a40614ed071"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
langchain-core = ">=0.2.23,<0.3.0"
|
||||
unstructured-client = ">=0.24.1,<0.25.0"
|
||||
|
||||
[package.extras]
|
||||
local = ["unstructured[all-docs] (>=0.15.7,<0.16.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "langchainhub"
|
||||
version = "0.1.21"
|
||||
|
|
@ -5071,6 +5089,7 @@ asyncer = "^0.0.5"
|
|||
bcrypt = "4.0.1"
|
||||
cachetools = "^5.3.1"
|
||||
chardet = "^5.2.0"
|
||||
clickhouse-connect = "0.7.19"
|
||||
crewai = "^0.36.0"
|
||||
cryptography = "^42.0.5"
|
||||
diskcache = "^5.6.3"
|
||||
|
|
@ -5459,7 +5478,7 @@ source = ["Cython (>=3.0.11)"]
|
|||
name = "lz4"
|
||||
version = "4.3.3"
|
||||
description = "LZ4 Bindings for Python"
|
||||
optional = true
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
|
||||
|
|
@ -5733,7 +5752,6 @@ python-versions = ">=3.7"
|
|||
files = [
|
||||
{file = "milvus_lite-2.4.9-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d3e617b3d68c09ad656d54bc3d8cc4ef6ef56c54015e1563d4fe4bcec6b7c90a"},
|
||||
{file = "milvus_lite-2.4.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6e7029282d6829b277ebb92f64e2370be72b938e34770e1eb649346bda5d1d7f"},
|
||||
{file = "milvus_lite-2.4.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9b8e991e4e433596f6a399a165c1a506f823ec9133332e03d7f8a114bff4550d"},
|
||||
{file = "milvus_lite-2.4.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:7f53e674602101cfbcf0a4a59d19eaa139dfd5580639f3040ad73d901f24fc0b"},
|
||||
]
|
||||
|
||||
|
|
@ -10588,6 +10606,11 @@ files = [
|
|||
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
|
||||
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
|
||||
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
|
||||
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
|
||||
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
|
||||
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
|
||||
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
|
||||
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
|
@ -11040,13 +11063,13 @@ xlsx = ["networkx", "openpyxl", "pandas", "xlrd"]
|
|||
|
||||
[[package]]
|
||||
name = "unstructured-client"
|
||||
version = "0.25.5"
|
||||
version = "0.24.1"
|
||||
description = "Python Client SDK for Unstructured API"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "unstructured-client-0.25.5.tar.gz", hash = "sha256:adb97ea56ce65f8b277d5b05f093e9d13a3320ac8dea7265ffa71f5e13ed5f84"},
|
||||
{file = "unstructured_client-0.25.5-py3-none-any.whl", hash = "sha256:23537fee984e43d06a75f986a73e420a9659cc92010afb8324fbf67c85962eaf"},
|
||||
{file = "unstructured-client-0.24.1.tar.gz", hash = "sha256:1bd82a532497783dd77b30ed4e56837d6abfae8cc6d61442acac0bcacbd568c8"},
|
||||
{file = "unstructured_client-0.24.1-py3-none-any.whl", hash = "sha256:044dab0c3079f908f6adf7088ad44f0e17476b47e2b04e0de608134a482bd0e3"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
|
|
@ -11973,7 +11996,7 @@ type = ["pytest-mypy"]
|
|||
name = "zstandard"
|
||||
version = "0.23.0"
|
||||
description = "Zstandard bindings for Python"
|
||||
optional = true
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"},
|
||||
|
|
@ -12090,4 +12113,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"]
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.10,<3.13"
|
||||
content-hash = "314d59d48b52557fae6ec8918e6cc6f7f4d3af0ddcdfc5737448eee5d28f9a0a"
|
||||
content-hash = "5d520ff9f32b297bc10c0823992bfddbe240d2e5d38bb94e2c48fd110b909663"
|
||||
|
|
|
|||
|
|
@ -109,6 +109,7 @@ bson = "^0.5.10"
|
|||
lark = "^1.2.2"
|
||||
jq = "^1.8.0"
|
||||
clickhouse-connect = {version = "0.7.19", optional = true, extras = ["clickhouse-connect"]}
|
||||
langchain-unstructured = "^0.1.2"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,3 @@
|
|||
import os
|
||||
|
||||
from typing import List
|
||||
|
||||
from langflow.custom import Component
|
||||
|
|
@ -7,12 +5,12 @@ from langflow.inputs import FileInput, SecretStrInput
|
|||
from langflow.template import Output
|
||||
from langflow.schema import Data
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
|
||||
|
||||
class UnstructuredComponent(Component):
|
||||
display_name = "Unstructured"
|
||||
description = "Unstructured data loader"
|
||||
description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT"
|
||||
documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
|
||||
trace_type = "tool"
|
||||
icon = "Unstructured"
|
||||
|
|
@ -23,14 +21,14 @@ class UnstructuredComponent(Component):
|
|||
name="file",
|
||||
display_name="File",
|
||||
required=True,
|
||||
info="The path to the file with which you want to use Unstructured to parse",
|
||||
info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT",
|
||||
file_types=["pdf", "docx", "txt"], # TODO: Support all unstructured file types
|
||||
),
|
||||
SecretStrInput(
|
||||
name="api_key",
|
||||
display_name="API Key",
|
||||
required=False,
|
||||
info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used",
|
||||
display_name="Unstructured.io Serverless API Key",
|
||||
required=True,
|
||||
info="Unstructured API Key. Create at: https://app.unstructured.io/",
|
||||
),
|
||||
]
|
||||
|
||||
|
|
@ -38,12 +36,14 @@ class UnstructuredComponent(Component):
|
|||
Output(name="data", display_name="Data", method="load_documents"),
|
||||
]
|
||||
|
||||
def build_unstructured(self) -> UnstructuredFileLoader:
|
||||
os.environ["UNSTRUCTURED_API_KEY"] = self.api_key
|
||||
|
||||
def build_unstructured(self) -> UnstructuredLoader:
|
||||
file_paths = [self.file]
|
||||
|
||||
loader = UnstructuredFileLoader(file_paths)
|
||||
loader = UnstructuredLoader(
|
||||
file_paths,
|
||||
api_key=self.api_key,
|
||||
partition_via_api=True,
|
||||
)
|
||||
|
||||
return loader
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ import json
|
|||
import os
|
||||
import shutil
|
||||
import time
|
||||
import nltk
|
||||
from collections import defaultdict
|
||||
from copy import deepcopy
|
||||
from datetime import datetime, timezone
|
||||
|
|
@ -613,19 +612,3 @@ def initialize_super_user_if_needed():
|
|||
get_variable_service().initialize_user_variables(super_user.id, session)
|
||||
create_default_folder_if_it_doesnt_exist(session, super_user.id)
|
||||
logger.info("Super user initialized")
|
||||
|
||||
|
||||
# Function to download NLTK packages if not already downloaded
|
||||
def download_nltk_resources():
|
||||
nltk_resources = {
|
||||
"corpora": ["wordnet"],
|
||||
"taggers": ["averaged_perceptron_tagger"],
|
||||
"tokenizers": ["punkt", "punkt_tab"],
|
||||
}
|
||||
|
||||
for category, packages in nltk_resources.items():
|
||||
for package in packages:
|
||||
try:
|
||||
nltk.data.find(f"{category}/{package}")
|
||||
except LookupError:
|
||||
nltk.download(package)
|
||||
|
|
|
|||
|
|
@ -180,9 +180,6 @@ def create_app():
|
|||
|
||||
FastAPIInstrumentor.instrument_app(app)
|
||||
|
||||
# Get necessary NLTK packages
|
||||
# download_nltk_resources()
|
||||
|
||||
return app
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue