From de1fdff7421c5c27ffd12a5025cdafdaaec3e1d6 Mon Sep 17 00:00:00 2001 From: Eric Hare Date: Wed, 4 Sep 2024 05:51:52 -0700 Subject: [PATCH] feat: Switch to API for Unstructured Component (#3671) * FEAT: Switch to API for Unstructured Component * chore: new lock --------- Co-authored-by: Gabriel Luiz Freitas Almeida --- poetry.lock | 39 +++++++++++++++---- pyproject.toml | 1 + .../documentloaders/Unstructured.py | 24 ++++++------ .../base/langflow/initial_setup/setup.py | 17 -------- src/backend/base/langflow/main.py | 3 -- 5 files changed, 44 insertions(+), 40 deletions(-) diff --git a/poetry.lock b/poetry.lock index 41bd37842..c74bf6f57 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1190,7 +1190,7 @@ testing = ["pytest (>=7.2.1)", "pytest-cov (>=4.0.0)", "tox (>=4.4.3)"] name = "clickhouse-connect" version = "0.7.19" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" -optional = true +optional = false python-versions = "~=3.8" files = [ {file = "clickhouse-connect-0.7.19.tar.gz", hash = "sha256:ce8f21f035781c5ef6ff57dc162e8150779c009b59f14030ba61f8c9c10c06d0"}, @@ -5025,6 +5025,24 @@ files = [ [package.dependencies] langchain-core = ">=0.2.10,<0.3.0" +[[package]] +name = "langchain-unstructured" +version = "0.1.2" +description = "An integration package connecting Unstructured and LangChain" +optional = false +python-versions = "<4.0,>=3.9" +files = [ + {file = "langchain_unstructured-0.1.2-py3-none-any.whl", hash = "sha256:d8212adcbd9c2ca37ada10b65425042f7b609e9b0588279218bc54a8518ef3c3"}, + {file = "langchain_unstructured-0.1.2.tar.gz", hash = "sha256:55f0a0de7829df37f5bc42f83b495fb879cf55b551e49a4ee4704a40614ed071"}, +] + +[package.dependencies] +langchain-core = ">=0.2.23,<0.3.0" +unstructured-client = ">=0.24.1,<0.25.0" + +[package.extras] +local = ["unstructured[all-docs] (>=0.15.7,<0.16.0)"] + [[package]] name = "langchainhub" version = "0.1.21" @@ -5071,6 +5089,7 @@ asyncer = "^0.0.5" bcrypt = "4.0.1" cachetools = "^5.3.1" chardet = "^5.2.0" +clickhouse-connect = "0.7.19" crewai = "^0.36.0" cryptography = "^42.0.5" diskcache = "^5.6.3" @@ -5459,7 +5478,7 @@ source = ["Cython (>=3.0.11)"] name = "lz4" version = "4.3.3" description = "LZ4 Bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, @@ -5733,7 +5752,6 @@ python-versions = ">=3.7" files = [ {file = "milvus_lite-2.4.9-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d3e617b3d68c09ad656d54bc3d8cc4ef6ef56c54015e1563d4fe4bcec6b7c90a"}, {file = "milvus_lite-2.4.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6e7029282d6829b277ebb92f64e2370be72b938e34770e1eb649346bda5d1d7f"}, - {file = "milvus_lite-2.4.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9b8e991e4e433596f6a399a165c1a506f823ec9133332e03d7f8a114bff4550d"}, {file = "milvus_lite-2.4.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:7f53e674602101cfbcf0a4a59d19eaa139dfd5580639f3040ad73d901f24fc0b"}, ] @@ -10588,6 +10606,11 @@ files = [ {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, + {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, + {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, + {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, + {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, ] [package.dependencies] @@ -11040,13 +11063,13 @@ xlsx = ["networkx", "openpyxl", "pandas", "xlrd"] [[package]] name = "unstructured-client" -version = "0.25.5" +version = "0.24.1" description = "Python Client SDK for Unstructured API" optional = false python-versions = ">=3.8" files = [ - {file = "unstructured-client-0.25.5.tar.gz", hash = "sha256:adb97ea56ce65f8b277d5b05f093e9d13a3320ac8dea7265ffa71f5e13ed5f84"}, - {file = "unstructured_client-0.25.5-py3-none-any.whl", hash = "sha256:23537fee984e43d06a75f986a73e420a9659cc92010afb8324fbf67c85962eaf"}, + {file = "unstructured-client-0.24.1.tar.gz", hash = "sha256:1bd82a532497783dd77b30ed4e56837d6abfae8cc6d61442acac0bcacbd568c8"}, + {file = "unstructured_client-0.24.1-py3-none-any.whl", hash = "sha256:044dab0c3079f908f6adf7088ad44f0e17476b47e2b04e0de608134a482bd0e3"}, ] [package.dependencies] @@ -11973,7 +11996,7 @@ type = ["pytest-mypy"] name = "zstandard" version = "0.23.0" description = "Zstandard bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, @@ -12090,4 +12113,4 @@ local = ["ctransformers", "llama-cpp-python", "sentence-transformers"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.13" -content-hash = "314d59d48b52557fae6ec8918e6cc6f7f4d3af0ddcdfc5737448eee5d28f9a0a" +content-hash = "5d520ff9f32b297bc10c0823992bfddbe240d2e5d38bb94e2c48fd110b909663" diff --git a/pyproject.toml b/pyproject.toml index 7657048f3..7ebea2166 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,6 +109,7 @@ bson = "^0.5.10" lark = "^1.2.2" jq = "^1.8.0" clickhouse-connect = {version = "0.7.19", optional = true, extras = ["clickhouse-connect"]} +langchain-unstructured = "^0.1.2" [tool.poetry.group.dev.dependencies] diff --git a/src/backend/base/langflow/components/documentloaders/Unstructured.py b/src/backend/base/langflow/components/documentloaders/Unstructured.py index 5fdf05c3c..dea478a8c 100644 --- a/src/backend/base/langflow/components/documentloaders/Unstructured.py +++ b/src/backend/base/langflow/components/documentloaders/Unstructured.py @@ -1,5 +1,3 @@ -import os - from typing import List from langflow.custom import Component @@ -7,12 +5,12 @@ from langflow.inputs import FileInput, SecretStrInput from langflow.template import Output from langflow.schema import Data -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +from langchain_unstructured import UnstructuredLoader class UnstructuredComponent(Component): display_name = "Unstructured" - description = "Unstructured data loader" + description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT" documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/" trace_type = "tool" icon = "Unstructured" @@ -23,14 +21,14 @@ class UnstructuredComponent(Component): name="file", display_name="File", required=True, - info="The path to the file with which you want to use Unstructured to parse", + info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT", file_types=["pdf", "docx", "txt"], # TODO: Support all unstructured file types ), SecretStrInput( name="api_key", - display_name="API Key", - required=False, - info="Unstructured API Key. Create at: https://unstructured.io/ - If not provided, open source library will be used", + display_name="Unstructured.io Serverless API Key", + required=True, + info="Unstructured API Key. Create at: https://app.unstructured.io/", ), ] @@ -38,12 +36,14 @@ class UnstructuredComponent(Component): Output(name="data", display_name="Data", method="load_documents"), ] - def build_unstructured(self) -> UnstructuredFileLoader: - os.environ["UNSTRUCTURED_API_KEY"] = self.api_key - + def build_unstructured(self) -> UnstructuredLoader: file_paths = [self.file] - loader = UnstructuredFileLoader(file_paths) + loader = UnstructuredLoader( + file_paths, + api_key=self.api_key, + partition_via_api=True, + ) return loader diff --git a/src/backend/base/langflow/initial_setup/setup.py b/src/backend/base/langflow/initial_setup/setup.py index 47f922adf..a1dd431ec 100644 --- a/src/backend/base/langflow/initial_setup/setup.py +++ b/src/backend/base/langflow/initial_setup/setup.py @@ -3,7 +3,6 @@ import json import os import shutil import time -import nltk from collections import defaultdict from copy import deepcopy from datetime import datetime, timezone @@ -613,19 +612,3 @@ def initialize_super_user_if_needed(): get_variable_service().initialize_user_variables(super_user.id, session) create_default_folder_if_it_doesnt_exist(session, super_user.id) logger.info("Super user initialized") - - -# Function to download NLTK packages if not already downloaded -def download_nltk_resources(): - nltk_resources = { - "corpora": ["wordnet"], - "taggers": ["averaged_perceptron_tagger"], - "tokenizers": ["punkt", "punkt_tab"], - } - - for category, packages in nltk_resources.items(): - for package in packages: - try: - nltk.data.find(f"{category}/{package}") - except LookupError: - nltk.download(package) diff --git a/src/backend/base/langflow/main.py b/src/backend/base/langflow/main.py index 0a507dd31..ba80bbd22 100644 --- a/src/backend/base/langflow/main.py +++ b/src/backend/base/langflow/main.py @@ -180,9 +180,6 @@ def create_app(): FastAPIInstrumentor.instrument_app(app) - # Get necessary NLTK packages - # download_nltk_resources() - return app