feat: NV-ingest image extraction and authentication support (#7795)

* nv-ingest auth workaround

* nv-ingest local dependency

* workaround for RestClient

* use forked nv-ingest-client with auth/url improvements

* update to nv-ingest-client 20250422

* image extraction

* fix splitter options

* improving defaults

* review feedback fixes

* [autofix.ci] apply automated fixes

* ruff fixes

* require base url

* [autofix.ci] apply automated fixes

* add check for empty segment

* Remove html from list of supported

* Use hardcoded list of supported types

* prayer to lint gods

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Jordan Frazier <jordan.frazier@datastax.com>
Co-authored-by: Jordan Frazier <122494242+jordanrfrazier@users.noreply.github.com>
This commit is contained in:
Jeffrey Carpenter 2025-04-28 21:38:57 -07:00 committed by GitHub
commit 4610bc3eb4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 4506 additions and 4379 deletions

View file

@@ -208,12 +208,8 @@ clickhouse-connect = [
]
nv-ingest = [
# nv-ingest-client 2025.2.7.dev0 does not correctly install its
# dependencies, so we need to install some manually.
"nv-ingest-client==2025.2.7.dev0",
"python-pptx==0.6.23",
"pymilvus[bulk_writer,model]==2.5.0",
"llama-index-embeddings-nvidia==0.1.5",
"nv-ingest-api==2025.4.22.dev20250422",
"nv-ingest-client==2025.4.22.dev20250422",
]
postgresql = [
@@ -320,4 +316,4 @@ ignore_missing_imports = true
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
build-backend = "hatchling.build"

View file

@@ -1,7 +1,14 @@
from urllib.parse import urlparse
from langflow.base.data import BaseFileComponent
from langflow.io import BoolInput, DropdownInput, IntInput, MessageTextInput
from langflow.io import (
BoolInput,
DropdownInput,
FloatInput,
IntInput,
MessageTextInput,
SecretStrInput,
)
from langflow.schema import Data
@@ -15,7 +22,8 @@ class NvidiaIngestComponent(BaseFileComponent):
try:
from nv_ingest_client.util.file_processing.extract import EXTENSION_TO_DOCUMENT_TYPE
VALID_EXTENSIONS = list(EXTENSION_TO_DOCUMENT_TYPE.keys())
# Supported file extensions from https://github.com/NVIDIA/nv-ingest/blob/main/README.md
VALID_EXTENSIONS = ["pdf", "docx", "pptx", "jpeg", "png", "svg", "tiff", "txt"]
except ImportError:
msg = (
"NVIDIA Retriever Extraction (nv-ingest) dependencies missing. "
@@ -29,6 +37,11 @@ class NvidiaIngestComponent(BaseFileComponent):
name="base_url",
display_name="Base URL",
info="The URL of the NVIDIA NeMo Retriever Extraction API.",
required=True,
),
SecretStrInput(
name="api_key",
display_name="NVIDIA API Key",
),
BoolInput(
name="extract_text",
@@ -46,6 +59,12 @@ class NvidiaIngestComponent(BaseFileComponent):
name="extract_tables",
display_name="Extract Tables",
info="Extract text from tables",
value=False,
),
BoolInput(
name="extract_images",
display_name="Extract Images",
info="Extract images from document",
value=True,
),
DropdownInput(
@@ -56,7 +75,7 @@ class NvidiaIngestComponent(BaseFileComponent):
"Support for 'block', 'line', 'span' varies by document type."
),
options=["document", "page", "block", "line", "span"],
value="document", # Default value
value="page", # Default value
advanced=True,
),
BoolInput(
@@ -64,43 +83,64 @@ class NvidiaIngestComponent(BaseFileComponent):
display_name="Split Text",
info="Split text into smaller chunks",
value=True,
),
DropdownInput(
name="split_by",
display_name="Split By",
info="How to split into chunks ('size' splits by number of characters)",
options=["page", "sentence", "word", "size"],
value="word", # Default value
advanced=True,
),
IntInput(
name="split_length",
display_name="Split Length",
info="The size of each chunk based on the 'split_by' method",
value=200,
name="chunk_size",
display_name="Chunk size",
info="The number of tokens per chunk",
value=500,
advanced=True,
),
IntInput(
name="split_overlap",
display_name="Split Overlap",
info="Number of segments (as determined by the 'split_by' method) to overlap from previous chunk",
value=20,
name="chunk_overlap",
display_name="Chunk Overlap",
info="Number of tokens to overlap from previous chunk",
value=150,
advanced=True,
),
IntInput(
name="max_character_length",
display_name="Max Character Length",
info="Maximum number of characters in each chunk",
value=1000,
BoolInput(
name="filter_images",
display_name="Filter Images",
info="Filter images (see advanced options for filtering criteria).",
advanced=True,
value=True,
),
IntInput(
name="sentence_window_size",
display_name="Sentence Window Size",
info="Number of sentences to include from previous and following chunk (when split_by='sentence')",
value=0,
name="min_image_size",
display_name="Minimum Image Size Filter",
info="Minimum image width/length in pixels",
value=128,
advanced=True,
),
FloatInput(
name="min_aspect_ratio",
display_name="Minimum Aspect Ratio Filter",
info="Minimum allowed aspect ratio (width / height). Images narrower than this will be filtered out.",
value=0.2,
advanced=True,
),
FloatInput(
name="max_aspect_ratio",
display_name="Maximum Aspect Ratio Filter",
info="Maximum allowed aspect ratio (width / height). Images taller than this will be filtered out.",
value=5.0,
advanced=True,
),
BoolInput(
name="dedup_images",
display_name="Deduplicate Images",
info="Filter duplicated images.",
advanced=True,
value=True,
),
BoolInput(
name="caption_images",
display_name="Caption Images",
info="Generate captions for images using the NVIDIA captioning model.",
advanced=True,
value=True,
),
]
outputs = [
@@ -117,8 +157,6 @@ class NvidiaIngestComponent(BaseFileComponent):
)
raise ImportError(msg) from e
self.base_url: str | None = self.base_url.strip() if self.base_url else None
if not file_list:
err_msg = "No files to process."
self.log(err_msg)
@@ -126,50 +164,70 @@ class NvidiaIngestComponent(BaseFileComponent):
file_paths = [str(file.path) for file in file_list]
try:
parsed_url = urlparse(self.base_url)
if not parsed_url.hostname or not parsed_url.port:
err_msg = "Invalid URL: Missing hostname or port."
self.log(err_msg)
raise ValueError(err_msg)
except Exception as e:
self.log(f"Error parsing URL: {e}")
raise
self.base_url: str | None = self.base_url.strip() if self.base_url else None
if self.base_url:
try:
urlparse(self.base_url)
except Exception as e:
error_msg = f"Invalid Base URL format: {e}"
self.log(error_msg)
raise ValueError(error_msg) from e
else:
base_url_error = "Base URL is required"
raise ValueError(base_url_error)
self.log(
f"Creating Ingestor for host: {parsed_url.hostname!r}, port: {parsed_url.port!r}",
f"Creating Ingestor for Base URL: {self.base_url!r}",
)
try:
from nv_ingest_client.client import Ingestor
try:
ingestor = (
Ingestor(message_client_hostname=parsed_url.hostname, message_client_port=parsed_url.port)
Ingestor(
message_client_kwargs={
"base_url": self.base_url,
"headers": {"Authorization": f"Bearer {self.api_key}"},
"max_retries": 3,
"timeout": 60,
}
)
.files(file_paths)
.extract(
extract_text=self.extract_text,
extract_tables=self.extract_tables,
extract_charts=self.extract_charts,
extract_images=False, # Currently not supported
extract_images=self.extract_images,
text_depth=self.text_depth,
)
)
except Exception as e:
self.log(f"Error creating Ingestor: {e}")
raise
if self.split_text:
ingestor = ingestor.split(
split_by=self.split_by,
split_length=self.split_length,
split_overlap=self.split_overlap,
max_character_length=self.max_character_length,
sentence_window_size=self.sentence_window_size,
)
if self.extract_images:
if self.dedup_images:
ingestor = ingestor.dedup(content_type="image", filter=True)
if self.filter_images:
ingestor = ingestor.filter(
content_type="image",
min_size=self.min_image_size,
min_aspect_ratio=self.min_aspect_ratio,
max_aspect_ratio=self.max_aspect_ratio,
filter=True,
)
if self.caption_images:
ingestor = ingestor.caption()
if self.extract_text and self.split_text:
ingestor = ingestor.split(
tokenizer="intfloat/e5-large-unsupervised",
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
params={"split_source_types": ["PDF"]},
)
try:
result = ingestor.ingest()
except Exception as e:
self.log(f"Error during ingestion: {e}")
ingest_error = f"Error during ingestion: {e}"
self.log(ingest_error)
raise
self.log(f"Results: {result}")
@@ -181,37 +239,55 @@ class NvidiaIngestComponent(BaseFileComponent):
# Result is a list of segments as determined by the text_depth option (if "document" then only one segment)
# each segment is a list of elements (text, structured, image)
for segment in result:
for element in segment:
document_type = element.get("document_type")
metadata = element.get("metadata", {})
source_metadata = metadata.get("source_metadata", {})
content_metadata = metadata.get("content_metadata", {})
if segment:
for element in segment:
document_type = element.get("document_type")
metadata = element.get("metadata", {})
source_metadata = metadata.get("source_metadata", {})
if document_type == document_type_text:
data.append(
Data(
text=metadata.get("content", ""),
file_path=source_metadata.get("source_name", ""),
document_type=document_type,
description=content_metadata.get("description", ""),
if document_type == document_type_text:
data.append(
Data(
text=metadata.get("content", ""),
file_path=source_metadata.get("source_name", ""),
document_type=document_type,
metadata=metadata,
)
)
)
# Both charts and tables are returned as "structured" document type,
# with extracted text in "table_content"
elif document_type == document_type_structured:
table_metadata = metadata.get("table_metadata", {})
data.append(
Data(
text=table_metadata.get("table_content", ""),
file_path=source_metadata.get("source_name", ""),
document_type=document_type,
description=content_metadata.get("description", ""),
)
)
else:
# image is not yet supported; skip if encountered
self.log(f"Unsupported document type: {document_type}")
# Both charts and tables are returned as "structured" document type,
# with extracted text in "table_content"
elif document_type == document_type_structured:
table_metadata = metadata.get("table_metadata", {})
# reformat chart/table images as binary data
if "content" in metadata:
metadata["content"] = {"$binary": metadata["content"]}
data.append(
Data(
text=table_metadata.get("table_content", ""),
file_path=source_metadata.get("source_name", ""),
document_type=document_type,
metadata=metadata,
)
)
elif document_type == "image":
image_metadata = metadata.get("image_metadata", {})
# reformat images as binary data
if "content" in metadata:
metadata["content"] = {"$binary": metadata["content"]}
data.append(
Data(
text=image_metadata.get("caption", "No caption available"),
file_path=source_metadata.get("source_name", ""),
document_type=document_type,
metadata=metadata,
)
)
else:
self.log(f"Unsupported document type {document_type}")
self.status = data or "No data"
# merge processed data with BaseFile objects

8631
uv.lock generated

File diff suppressed because it is too large Load diff