fix: pdf parsing, pass as str instead of posix path (#4448)

* FIX: PyMuPDF for parsing, pass as str instead of posix path * Switch back to pypdf due to licensing * Allow specification of the number of threads * [autofix.ci] apply automated fixes * Update file.py * Switch StrInput to IntInput * ✨ (Document QA.json, Vector Store RAG.json): Update FileComponent class to include IntInput for concurrency_multithreading to control the number of workers for parallel processing. This enhancement allows for better control and optimization of processing tasks. * ⬆️ (pyproject.toml): upgrade langchain-core dependency from version 0.3.10 to version 0.3.12 to ensure compatibility and leverage new features * ⬆️ (pyproject.toml): update langchain-core dependency to allow versions greater than or equal to 0.3.12 for compatibility reasons --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: cristhianzl <cristhian.lousa@gmail.com> Co-authored-by: Gabriel Luiz Freitas Almeida <gabriel@langflow.org>
2024-11-08 08:55:06 -08:00 · 2024-11-08 08:55:06 -08:00 · a4863bbe57
commit a4863bbe57
parent 98274633db
6 changed files with 70 additions and 59 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -113,7 +113,6 @@ dependencies = [
    "langchain-elasticsearch>=0.2.0",
    "opensearch-py>=2.7.1",
    "langchain-ollama>=0.2.0",
-    "pymupdf~=1.24.13",
    "sqlalchemy[aiosqlite,postgresql_psycopg2binary,postgresql_psycopgbinary]>=2.0.36",
    "atlassian-python-api>=3.41.16",
 ]
--- a/src/backend/base/langflow/components/data/file.py
+++ b/src/backend/base/langflow/components/data/file.py
@ -1,13 +1,10 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from zipfile import ZipFile, is_zipfile

-import fitz
-
-from langflow.base.data.utils import TEXT_FILE_TYPES, parse_text_file_to_data
+from langflow.base.data.utils import TEXT_FILE_TYPES, parallel_load_data, parse_text_file_to_data
 from langflow.custom import Component
-from langflow.io import BoolInput, FileInput, Output
+from langflow.io import BoolInput, FileInput, IntInput, Output
 from langflow.schema import Data


@ -49,6 +46,13 @@ class FileComponent(Component):
            advanced=True,
            info="If true, parallel processing will be enabled for zip files.",
        ),
+        IntInput(
+            name="concurrency_multithreading",
+            display_name="Multithreading Concurrency",
+            advanced=True,
+            info="The maximum number of workers to use, if concurrency is enabled",
+            value=4,
+        ),
    ]

    outputs = [Output(display_name="Data", name="data", method="load_file")]
@ -74,6 +78,7 @@ class FileComponent(Component):
            # Check if the file is a zip archive
            if is_zipfile(resolved_path):
                self.log(f"Processing zip file: {resolved_path.name}.")
+
                return self._process_zip_file(
                    resolved_path,
                    silent_errors=self.silent_errors,
@ -81,9 +86,11 @@ class FileComponent(Component):
                )

            self.log(f"Processing single file: {resolved_path.name}.")
+
            return self._process_single_file(resolved_path, silent_errors=self.silent_errors)
        except FileNotFoundError:
            self.log(f"File not found: {resolved_path.name}.")
+
            raise

    def _process_zip_file(self, zip_path: Path, *, silent_errors: bool = False, parallel: bool = False) -> Data:
@ -126,7 +133,7 @@ class FileComponent(Component):
                raise ValueError(msg)

            # Define a function to process each file
-            def process_file(file_name):
+            def process_file(file_name, silent_errors=silent_errors):
                with NamedTemporaryFile(delete=False) as temp_file:
                    temp_path = Path(temp_file.name).with_name(file_name)
                    with zip_file.open(file_name) as file_content:
@ -138,19 +145,24 @@ class FileComponent(Component):

            # Process files in parallel if specified
            if parallel:
-                self.log("Initializing parallel Thread Pool Executor.")
-                with ThreadPoolExecutor() as executor:
-                    futures = {executor.submit(process_file, file): file for file in valid_files}
-                    for future in as_completed(futures):
-                        try:
-                            data.append(future.result())
-                        except Exception as e:
-                            self.log(f"Error processing file {futures[future]}: {e}")
-                            if not silent_errors:
-                                raise
+                self.log(
+                    f"Initializing parallel Thread Pool Executor with max workers: "
+                    f"{self.concurrency_multithreading}."
+                )
+
+                # Process files in parallel
+                initial_data = parallel_load_data(
+                    valid_files,
+                    silent_errors=silent_errors,
+                    load_function=process_file,
+                    max_concurrency=self.concurrency_multithreading,
+                )
+
+                # Filter out empty data
+                data = list(filter(None, initial_data))
            else:
                # Sequential processing
-                data.extend([process_file(file_name) for file_name in valid_files])
+                data = [process_file(file_name) for file_name in valid_files]

        self.log(f"Successfully processed zip file: {zip_path.name}.")

@ -169,20 +181,8 @@ class FileComponent(Component):
        Raises:
            ValueError: For unsupported file formats.
        """
-
-        # Define a function to extract text from a PDF file
-        def pdf_to_text(filepath):
-            text = ""
-
-            # Open the PDF file
-            with fitz.open(filepath) as pdf:
-                for page in pdf:
-                    text += page.get_text() + "\n"
-
-            return text
-
        # Check if the file type is supported
-        if not any(file_path.suffix == ext for ext in ["." + f for f in [*TEXT_FILE_TYPES, "pdf"]]):
+        if not any(file_path.suffix == ext for ext in ["." + f for f in TEXT_FILE_TYPES]):
            self.log(f"Unsupported file type: {file_path.suffix}")

            # Return empty data if silent_errors is True
@ -193,13 +193,10 @@ class FileComponent(Component):
            raise ValueError(msg)

        try:
-            # Parse the file based on the file type
-            if file_path.suffix == ".pdf":
-                data = Data(data={"file_path": file_path, "text": pdf_to_text(file_path)})
-            else:
-                data = parse_text_file_to_data(str(file_path), silent_errors=silent_errors)  # type: ignore[assignment]
-                if not data:
-                    data = Data()
+            # Parse the text file as appropriate
+            data = parse_text_file_to_data(str(file_path), silent_errors=silent_errors)  # type: ignore[assignment]
+            if not data:
+                data = Data()

            self.log(f"Successfully processed file: {file_path.name}.")
        except Exception as e:
--- a/src/backend/base/langflow/initial_setup/starter_projects/Document
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Document
--- a/src/backend/base/langflow/initial_setup/starter_projects/Vector
+++ b/src/backend/base/langflow/initial_setup/starter_projects/Vector
--- a/src/backend/base/pyproject.toml
+++ b/src/backend/base/pyproject.toml
@ -135,7 +135,6 @@ dependencies = [
    "duckdb>=1.0.0",
    "python-docx>=1.1.0",
    "jq>=1.7.0; sys_platform != 'win32'",
-    "pypdf>=4.2.0",
    "nest-asyncio>=1.6.0",
    "emoji>=2.12.0",
    "cryptography>=42.0.5,<44.0.0",
@ -162,6 +161,7 @@ dependencies = [
    "assemblyai>=0.33.0",
    "fastapi-pagination>=0.12.29",
    "defusedxml>=0.7.1",
+    "pypdf~=5.1.0",
 ]

 [project.urls]
--- a/uv.lock
+++ b/uv.lock
@ -3697,7 +3697,6 @@ dependencies = [
    { name = "pyautogen" },
    { name = "pydantic-settings" },
    { name = "pymongo" },
-    { name = "pymupdf" },
    { name = "pytube" },
    { name = "pywin32", marker = "sys_platform == 'win32'" },
    { name = "qdrant-client" },
@ -3847,7 +3846,6 @@ requires-dist = [
    { name = "pyautogen", specifier = ">=0.2.0" },
    { name = "pydantic-settings", specifier = "==2.4.0" },
    { name = "pymongo", specifier = ">=4.6.0" },
-    { name = "pymupdf", specifier = "~=1.24.13" },
    { name = "pytube", specifier = ">=15.0.0" },
    { name = "pywin32", marker = "sys_platform == 'win32'", specifier = ">=306" },
    { name = "qdrant-client", specifier = "~=1.9.2" },
@ -4088,7 +4086,7 @@ requires-dist = [
    { name = "prometheus-client", specifier = ">=0.20.0" },
    { name = "pydantic", specifier = ">=2.7.0" },
    { name = "pydantic-settings", specifier = ">=2.2.0" },
-    { name = "pypdf", specifier = ">=4.2.0" },
+    { name = "pypdf", specifier = "~=5.1.0" },
    { name = "pyperclip", specifier = ">=1.8.2" },
    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.2.0" },
    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" },
@ -6324,21 +6322,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/0d/2a/7c24a6144eaa06d18ed52822ea2b0f119fd9267cd1abbb75dae4d89a3803/pymongo-4.10.1-cp313-cp313-win_amd64.whl", hash = "sha256:45ee87a4e12337353242bc758accc7fb47a2f2d9ecc0382a61e64c8f01e86708", size = 976873 },
 ]

-[[package]]
-name = "pymupdf"
-version = "1.24.13"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/22/39/84efca63af4e5a014c1d4c21686469f99c3d1c160a3a0b902ac676f6ffd9/PyMuPDF-1.24.13.tar.gz", hash = "sha256:6ec3ab3c6d5cba60bfcf58daaa2d1a5b700b0366ce52be666445007351461fa4", size = 53655596 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ce/79/8d31a98ebeb329000406d6c36fb2ad42264d5a4a6915ebabbde332642204/PyMuPDF-1.24.13-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c11bb9198af69d490b4b346421db827d875a28fbc760d239e691d4b3ed12b5ad", size = 19147116 },
-    { url = "https://files.pythonhosted.org/packages/ea/fe/ff2bb633c0934ba43c36184b8ed025092e946994dc6b4c764a0079f0ab3c/PyMuPDF-1.24.13-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:240d5c43daa9278db50d609162b48f673ab256d7e5c73eea67af517c1fc2d47c", size = 18406545 },
-    { url = "https://files.pythonhosted.org/packages/5b/5f/916bb534fd498d069d68c7a52289ba78d27823c2d6f8c693889e288e31e4/PyMuPDF-1.24.13-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e4c8808e62afbbde0f7b9c4151c4b1a5735911c2d39c34332860df600dba76f8", size = 19284324 },
-    { url = "https://files.pythonhosted.org/packages/85/48/e4630eb58f4daed22a078e19db8a709d407d2e19316089675f6ed185f01a/PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c830610e4fde237fcf0532f1f8c1381453f48c164a5eadd0c6e5fd0bea1ca8e3", size = 19812221 },
-    { url = "https://files.pythonhosted.org/packages/6d/22/5aa9e01747518878a54866b4d925abdc663c64c75f5fbc6a9706957a7a30/PyMuPDF-1.24.13-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4520558580ac6b5a7164fda29fbc14e39d3114fd803420721500edbf47d04872", size = 20942140 },
-    { url = "https://files.pythonhosted.org/packages/07/a4/2e545217436e7717642809c7392bd7d7156ba102e7a47acb22659bfd41de/PyMuPDF-1.24.13-cp39-abi3-win32.whl", hash = "sha256:ab22828d4fc205791ef1332a64893cbfc38cd9c331c5f46ae4537372ffee6fc1", size = 14943060 },
-    { url = "https://files.pythonhosted.org/packages/38/80/f8d8ae555b237574005faef8a181a5c6a1d983e16a982b65ccc56a42faa2/PyMuPDF-1.24.13-cp39-abi3-win_amd64.whl", hash = "sha256:ec17914e4a560f4070212a2e84db5cc8b561d85d1ead193605a22f9561b03148", size = 16242035 },
-]
-
 [[package]]
 name = "pynacl"
 version = "1.5.0"