build: add additional OCR dependencies for docling (#9156)
* ux cleanup * [autofix.ci] apply automated fixes * Update src/backend/base/langflow/components/docling/docling_inline.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * Update pyproject.toml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * ruff * fix pyproject * [autofix.ci] apply automated fixes * [autofix.ci] apply automated fixes * uvlock * Update uv.lock --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: Eric Hare <ericrhare@gmail.com> Co-authored-by: Carlos Coelho <80289056+carlosrcoelho@users.noreply.github.com>
This commit is contained in:
parent
4e92f75939
commit
234ff867b9
3 changed files with 235 additions and 7 deletions
|
|
@ -61,11 +61,11 @@ class DoclingInlineComponent(BaseFileComponent):
|
|||
),
|
||||
DropdownInput(
|
||||
name="ocr_engine",
|
||||
display_name="Ocr",
|
||||
info="OCR engine to use",
|
||||
options=["", "easyocr", "tesserocr", "rapidocr", "ocrmac"],
|
||||
display_name="OCR Engine",
|
||||
info="OCR engine to use. None will disable OCR.",
|
||||
options=["None", "easyocr", "tesserocr", "rapidocr", "ocrmac"],
|
||||
real_time_refresh=False,
|
||||
value="",
|
||||
value="None",
|
||||
),
|
||||
# TODO: expose more Docling options
|
||||
]
|
||||
|
|
@ -130,6 +130,58 @@ class DoclingInlineComponent(BaseFileComponent):
|
|||
self.log("Warning: Process still alive after SIGKILL")
|
||||
|
||||
def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
|
||||
try:
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.models.factories import get_ocr_factory
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
except ImportError as e:
|
||||
msg = (
|
||||
"Docling is an optional dependency. Install with `uv pip install 'langflow[docling]'` or refer to the "
|
||||
"documentation on how to install optional dependencies."
|
||||
)
|
||||
raise ImportError(msg) from e
|
||||
|
||||
# Configure the standard PDF pipeline
|
||||
def _get_standard_opts() -> PdfPipelineOptions:
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = self.ocr_engine != "None"
|
||||
if pipeline_options.do_ocr:
|
||||
ocr_factory = get_ocr_factory(
|
||||
allow_external_plugins=False,
|
||||
)
|
||||
|
||||
ocr_options: OcrOptions = ocr_factory.create_options(
|
||||
kind=self.ocr_engine,
|
||||
)
|
||||
pipeline_options.ocr_options = ocr_options
|
||||
return pipeline_options
|
||||
|
||||
# Configure the VLM pipeline
|
||||
def _get_vlm_opts() -> VlmPipelineOptions:
|
||||
return VlmPipelineOptions()
|
||||
|
||||
# Configure the main format options and create the DocumentConverter()
|
||||
def _get_converter() -> DocumentConverter:
|
||||
if self.pipeline == "standard":
|
||||
pdf_format_option = PdfFormatOption(
|
||||
pipeline_options=_get_standard_opts(),
|
||||
)
|
||||
elif self.pipeline == "vlm":
|
||||
pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
|
||||
|
||||
format_options: dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: pdf_format_option,
|
||||
InputFormat.IMAGE: pdf_format_option,
|
||||
}
|
||||
|
||||
return DocumentConverter(format_options=format_options)
|
||||
|
||||
file_paths = [file.path for file in file_list if file.path]
|
||||
|
||||
if not file_paths:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue