From 0118012a4326a46de2c190be1b1336fb2f0d2e7a Mon Sep 17 00:00:00 2001 From: Eric Hare Date: Tue, 26 Aug 2025 19:32:13 -0700 Subject: [PATCH] fix: Run docling processing in subprocess (#9541) * fix: Run docling processing in subprocess * [autofix.ci] apply automated fixes * Update file.py * [autofix.ci] apply automated fixes * Update file.py * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes (attempt 2/3) * Update file.py * [autofix.ci] apply automated fixes --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Coelho <80289056+carlosrcoelho@users.noreply.github.com> --- .../base/langflow/components/data/file.py | 690 ++++++++---------- .../starter_projects/Document Q&A.json | 2 +- .../Portfolio Website Code Generator.json | 2 +- .../Text Sentiment Analysis.json | 2 +- .../starter_projects/Vector Store RAG.json | 2 +- 5 files changed, 312 insertions(+), 386 deletions(-) diff --git a/src/backend/base/langflow/components/data/file.py b/src/backend/base/langflow/components/data/file.py index 0e877eb7e..f52581919 100644 --- a/src/backend/base/langflow/components/data/file.py +++ b/src/backend/base/langflow/components/data/file.py @@ -1,9 +1,21 @@ -"""Enhanced file component v2 with mypy and ruff compliance.""" +"""Enhanced file component with clearer structure and Docling isolation. + +Notes: +----- +- Functionality is preserved with minimal behavioral changes. +- ALL Docling parsing/export runs in a separate OS process to prevent memory + growth and native library state from impacting the main Langflow process. +- Standard text/structured parsing continues to use existing BaseFileComponent + utilities (and optional threading via `parallel_load_data`). +""" from __future__ import annotations +import json +import subprocess +import sys +import textwrap from copy import deepcopy -from enum import Enum from typing import TYPE_CHECKING, Any from langflow.base.data.base_file import BaseFileComponent @@ -24,51 +36,8 @@ if TYPE_CHECKING: from langflow.schema import DataFrame -class MockConversionStatus(Enum): - """Mock ConversionStatus for fallback compatibility.""" - - SUCCESS = "success" - FAILURE = "failure" - - -class MockInputFormat(Enum): - """Mock InputFormat for fallback compatibility.""" - - PDF = "pdf" - IMAGE = "image" - - -class MockImageRefMode(Enum): - """Mock ImageRefMode for fallback compatibility.""" - - PLACEHOLDER = "placeholder" - EMBEDDED = "embedded" - - -class DoclingImports: - """Container for docling imports with type information.""" - - def __init__( - self, - conversion_status: type[Enum], - input_format: type[Enum], - document_converter: type, - image_ref_mode: type[Enum], - strategy: str, - ) -> None: - self.conversion_status = conversion_status - self.input_format = input_format - self.document_converter = document_converter - self.image_ref_mode = image_ref_mode - self.strategy = strategy - - class FileComponent(BaseFileComponent): - """Enhanced file component v2 that combines standard file loading with optional Docling processing and export. - - This component supports all features of the standard File component, plus an advanced mode - that enables Docling document processing and export to various formats (Markdown, HTML, etc.). - """ + """File component with optional Docling processing (isolated in a subprocess).""" display_name = "File" description = "Loads content from files with optional advanced document processing and export using Docling." @@ -76,7 +45,7 @@ class FileComponent(BaseFileComponent): icon = "file-text" name = "File" - # Docling supported formats from original component + # Docling-supported/compatible extensions; TEXT_FILE_TYPES are supported by the base loader. VALID_EXTENSIONS = [ "adoc", "asciidoc", @@ -110,12 +79,12 @@ class FileComponent(BaseFileComponent): *TEXT_FILE_TYPES, ] - # Fixed export settings + # Fixed export settings used when markdown export is requested. EXPORT_FORMAT = "Markdown" IMAGE_MODE = "placeholder" + # ---- Inputs / Outputs (kept as close to original as possible) ------------------- _base_inputs = deepcopy(BaseFileComponent._base_inputs) - for input_item in _base_inputs: if isinstance(input_item, FileInput) and input_item.name == "path": input_item.real_time_refresh = True @@ -175,6 +144,7 @@ class FileComponent(BaseFileComponent): advanced=True, show=False, ), + # Deprecated input retained for backward-compatibility. BoolInput( name="use_multithreading", display_name="[Deprecated] Use Multithreading", @@ -202,8 +172,10 @@ class FileComponent(BaseFileComponent): Output(display_name="Raw Content", name="message", method="load_files_message"), ] - def _path_value(self, template) -> list[str]: - # Get current path value + # ------------------------------ UI helpers -------------------------------------- + + def _path_value(self, template: dict) -> list[str]: + """Return the list of currently selected file paths from the template.""" return template.get("path", {}).get("file_path", []) def update_build_config( @@ -212,65 +184,41 @@ class FileComponent(BaseFileComponent): field_value: Any, field_name: str | None = None, ) -> dict[str, Any]: - """Update build configuration to show/hide fields based on file count and advanced_mode.""" + """Show/hide Advanced Parser and related fields based on selection context.""" if field_name == "path": - # Get current path value - path_value = self._path_value(build_config) - file_path = path_value[0] if len(path_value) > 0 else "" - - # Show/hide Advanced Parser based on file count (only for single files) + paths = self._path_value(build_config) + file_path = paths[0] if paths else "" file_count = len(field_value) if field_value else 0 - if file_count == 1 and not file_path.endswith((".csv", ".xlsx", ".parquet")): - build_config["advanced_mode"]["show"] = True - else: - build_config["advanced_mode"]["show"] = False - build_config["advanced_mode"]["value"] = False # Reset to False when hidden - # Hide all advanced fields when Advanced Parser is not available - advanced_fields = [ - "pipeline", - "ocr_engine", - "doc_key", - "md_image_placeholder", - "md_page_break_placeholder", - ] - for field in advanced_fields: - if field in build_config: - build_config[field]["show"] = False + # Advanced mode only for single (non-tabular) file + allow_advanced = file_count == 1 and not file_path.endswith((".csv", ".xlsx", ".parquet")) + build_config["advanced_mode"]["show"] = allow_advanced + if not allow_advanced: + build_config["advanced_mode"]["value"] = False + for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"): + if f in build_config: + build_config[f]["show"] = False elif field_name == "advanced_mode": - # Show/hide advanced fields based on advanced_mode (only if single file) - advanced_fields = [ - "pipeline", - "ocr_engine", - "doc_key", - "md_image_placeholder", - "md_page_break_placeholder", - ] - - for field in advanced_fields: - if field in build_config: - build_config[field]["show"] = field_value + for f in ("pipeline", "ocr_engine", "doc_key", "md_image_placeholder", "md_page_break_placeholder"): + if f in build_config: + build_config[f]["show"] = bool(field_value) return build_config def update_outputs(self, frontend_node: dict[str, Any], field_name: str, field_value: Any) -> dict[str, Any]: # noqa: ARG002 - """Dynamically show outputs based on the number of files and their types.""" + """Dynamically show outputs based on file count/type and advanced mode.""" if field_name not in ["path", "advanced_mode"]: return frontend_node - # Add outputs based on the number of files in the path template = frontend_node.get("template", {}) - path_value = self._path_value(template) - if len(path_value) == 0: + paths = self._path_value(template) + if not paths: return frontend_node - # Clear existing outputs frontend_node["outputs"] = [] - - if len(path_value) == 1: - # We need to check if the file is structured content - file_path = path_value[0] if field_name == "path" else frontend_node["template"]["path"]["file_path"][0] + if len(paths) == 1: + file_path = paths[0] if field_name == "path" else frontend_node["template"]["path"]["file_path"][0] if file_path.endswith((".csv", ".xlsx", ".parquet")): frontend_node["outputs"].append( Output(display_name="Structured Content", name="dataframe", method="load_files_structured"), @@ -280,11 +228,8 @@ class FileComponent(BaseFileComponent): Output(display_name="Structured Content", name="json", method="load_files_json"), ) - # Add outputs based on advanced mode advanced_mode = frontend_node.get("template", {}).get("advanced_mode", {}).get("value", False) - if advanced_mode: - # Advanced mode: Structured Output, Markdown, and File Path frontend_node["outputs"].append( Output(display_name="Structured Output", name="advanced", method="load_files_advanced"), ) @@ -295,7 +240,6 @@ class FileComponent(BaseFileComponent): Output(display_name="File Path", name="path", method="load_files_path"), ) else: - # Normal mode: Raw Content and File Path frontend_node["outputs"].append( Output(display_name="Raw Content", name="message", method="load_files_message"), ) @@ -303,130 +247,16 @@ class FileComponent(BaseFileComponent): Output(display_name="File Path", name="path", method="load_files_path"), ) else: - # For multiple files, we show the files output (DataFrame format) - # Advanced Parser is not available for multiple files - frontend_node["outputs"].append( - Output(display_name="Files", name="dataframe", method="load_files"), - ) + # Multiple files => DataFrame output; advanced parser disabled + frontend_node["outputs"].append(Output(display_name="Files", name="dataframe", method="load_files")) return frontend_node - def _try_import_docling(self) -> DoclingImports | None: - """Try different import strategies for docling components.""" - # Try strategy 1: Latest docling structure - try: - from docling.datamodel.base_models import ConversionStatus, InputFormat # type: ignore[import-untyped] - from docling.document_converter import DocumentConverter # type: ignore[import-untyped] - from docling_core.types.doc import ImageRefMode # type: ignore[import-untyped] - - self.log("Using latest docling import structure") - return DoclingImports( - conversion_status=ConversionStatus, - input_format=InputFormat, - document_converter=DocumentConverter, - image_ref_mode=ImageRefMode, - strategy="latest", - ) - except ImportError as e: - self.log(f"Latest docling structure failed: {e}") - - # Try strategy 2: Alternative import paths - try: - from docling.document_converter import DocumentConverter # type: ignore[import-untyped] - from docling_core.types.doc import ImageRefMode # type: ignore[import-untyped] - - # Try to get ConversionStatus from different locations - conversion_status: type[Enum] = MockConversionStatus - input_format: type[Enum] = MockInputFormat - - try: - from docling_core.types import ConversionStatus, InputFormat # type: ignore[import-untyped] - - conversion_status = ConversionStatus - input_format = InputFormat - except ImportError: - try: - from docling.datamodel import ConversionStatus, InputFormat # type: ignore[import-untyped] - - conversion_status = ConversionStatus - input_format = InputFormat - except ImportError: - # Use mock enums if we can't find them - pass - - self.log("Using alternative docling import structure") - return DoclingImports( - conversion_status=conversion_status, - input_format=input_format, - document_converter=DocumentConverter, - image_ref_mode=ImageRefMode, - strategy="alternative", - ) - except ImportError as e: - self.log(f"Alternative docling structure failed: {e}") - - # Try strategy 3: Basic converter only - try: - from docling.document_converter import DocumentConverter # type: ignore[import-untyped] - - self.log("Using basic docling import structure with mocks") - return DoclingImports( - conversion_status=MockConversionStatus, - input_format=MockInputFormat, - document_converter=DocumentConverter, - image_ref_mode=MockImageRefMode, - strategy="basic", - ) - except ImportError as e: - self.log(f"Basic docling structure failed: {e}") - - # Strategy 4: Complete fallback - return None to indicate failure - return None - - def _create_advanced_converter(self, docling_imports: DoclingImports) -> Any: - """Create advanced converter with pipeline options if available.""" - try: - from docling.datamodel.pipeline_options import PdfPipelineOptions # type: ignore[import-untyped] - from docling.document_converter import PdfFormatOption # type: ignore[import-untyped] - - document_converter = docling_imports.document_converter - input_format = docling_imports.input_format - - # Create basic pipeline options - pipeline_options = PdfPipelineOptions() - - # Configure OCR if specified and available - if self.ocr_engine: - try: - from docling.models.factories import get_ocr_factory # type: ignore[import-untyped] - - pipeline_options.do_ocr = True - ocr_factory = get_ocr_factory(allow_external_plugins=False) - ocr_options = ocr_factory.create_options(kind=self.ocr_engine) - pipeline_options.ocr_options = ocr_options - self.log(f"Configured OCR with engine: {self.ocr_engine}") - except Exception as e: # noqa: BLE001 - self.log(f"Could not configure OCR: {e}, proceeding without OCR") - pipeline_options.do_ocr = False - - # Create format options - pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options) - format_options = {} - if hasattr(input_format, "PDF"): - format_options[input_format.PDF] = pdf_format_option - if hasattr(input_format, "IMAGE"): - format_options[input_format.IMAGE] = pdf_format_option - - return document_converter(format_options=format_options) - - except Exception as e: # noqa: BLE001 - self.log(f"Could not create advanced converter: {e}, using basic converter") - return docling_imports.document_converter() + # ------------------------------ Core processing ---------------------------------- def _is_docling_compatible(self, file_path: str) -> bool: - """Check if file is compatible with Docling processing.""" - # All VALID_EXTENSIONS are Docling compatible (except for TEXT_FILE_TYPES which may overlap) - docling_extensions = [ + """Lightweight extension gate for Docling-compatible types.""" + docling_exts = ( ".adoc", ".asciidoc", ".asc", @@ -456,102 +286,296 @@ class FileComponent(BaseFileComponent): ".xhtml", ".xml", ".webp", - ] - return any(file_path.lower().endswith(ext) for ext in docling_extensions) + ) + return file_path.lower().endswith(docling_exts) + + def _process_docling_in_subprocess(self, file_path: str) -> Data | None: + """Run Docling in a separate OS process and map the result to a Data object. + + We avoid multiprocessing pickling by launching `python -c "