feat: NV-ingest image extraction and authentication support (#7795)
* nv-ingest auth workaround
* nv-ingest local dependency
* workaround for RestClient
* use forked nv-ingest-client with auth/url improvements
* update to nv-ingest-client 20250422
* image extraction
* fix splitter options
* improving defaults
* review feedback fixes
* [autofix.ci] apply automated fixes
* ruff fixes
* require base url
* [autofix.ci] apply automated fixes
* add check for empty segment
* Remove html from list of supported
* Use hardcoded list of supported types
* prayer to lint gods

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Jordan Frazier <jordan.frazier@datastax.com>
Co-authored-by: Jordan Frazier <122494242+jordanrfrazier@users.noreply.github.com>
This commit is contained in:
parent
0a628e1fb1
commit
4610bc3eb4
3 changed files with 4506 additions and 4379 deletions
|
|
@ -208,12 +208,8 @@ clickhouse-connect = [
|
|||
]
|
||||
|
||||
nv-ingest = [
|
||||
# nv-ingest-client 2025.2.7.dev0 does not correctly install its
|
||||
# dependencies, so we need to install some manually.
|
||||
"nv-ingest-client==2025.2.7.dev0",
|
||||
"python-pptx==0.6.23",
|
||||
"pymilvus[bulk_writer,model]==2.5.0",
|
||||
"llama-index-embeddings-nvidia==0.1.5",
|
||||
"nv-ingest-api==2025.4.22.dev20250422",
|
||||
"nv-ingest-client==2025.4.22.dev20250422",
|
||||
]
|
||||
|
||||
postgresql = [
|
||||
|
|
@ -320,4 +316,4 @@ ignore_missing_imports = true
|
|||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
build-backend = "hatchling.build"
|
||||
|
|
@ -1,7 +1,14 @@
|
|||
from urllib.parse import urlparse
|
||||
|
||||
from langflow.base.data import BaseFileComponent
|
||||
from langflow.io import BoolInput, DropdownInput, IntInput, MessageTextInput
|
||||
from langflow.io import (
|
||||
BoolInput,
|
||||
DropdownInput,
|
||||
FloatInput,
|
||||
IntInput,
|
||||
MessageTextInput,
|
||||
SecretStrInput,
|
||||
)
|
||||
from langflow.schema import Data
|
||||
|
||||
|
||||
|
|
@ -15,7 +22,8 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
try:
|
||||
from nv_ingest_client.util.file_processing.extract import EXTENSION_TO_DOCUMENT_TYPE
|
||||
|
||||
VALID_EXTENSIONS = list(EXTENSION_TO_DOCUMENT_TYPE.keys())
|
||||
# Supported file extensions from https://github.com/NVIDIA/nv-ingest/blob/main/README.md
|
||||
VALID_EXTENSIONS = ["pdf", "docx", "pptx", "jpeg", "png", "svg", "tiff", "txt"]
|
||||
except ImportError:
|
||||
msg = (
|
||||
"NVIDIA Retriever Extraction (nv-ingest) dependencies missing. "
|
||||
|
|
@ -29,6 +37,11 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
name="base_url",
|
||||
display_name="Base URL",
|
||||
info="The URL of the NVIDIA NeMo Retriever Extraction API.",
|
||||
required=True,
|
||||
),
|
||||
SecretStrInput(
|
||||
name="api_key",
|
||||
display_name="NVIDIA API Key",
|
||||
),
|
||||
BoolInput(
|
||||
name="extract_text",
|
||||
|
|
@ -46,6 +59,12 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
name="extract_tables",
|
||||
display_name="Extract Tables",
|
||||
info="Extract text from tables",
|
||||
value=False,
|
||||
),
|
||||
BoolInput(
|
||||
name="extract_images",
|
||||
display_name="Extract Images",
|
||||
info="Extract images from document",
|
||||
value=True,
|
||||
),
|
||||
DropdownInput(
|
||||
|
|
@ -56,7 +75,7 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
"Support for 'block', 'line', 'span' varies by document type."
|
||||
),
|
||||
options=["document", "page", "block", "line", "span"],
|
||||
value="document", # Default value
|
||||
value="page", # Default value
|
||||
advanced=True,
|
||||
),
|
||||
BoolInput(
|
||||
|
|
@ -64,43 +83,64 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
display_name="Split Text",
|
||||
info="Split text into smaller chunks",
|
||||
value=True,
|
||||
),
|
||||
DropdownInput(
|
||||
name="split_by",
|
||||
display_name="Split By",
|
||||
info="How to split into chunks ('size' splits by number of characters)",
|
||||
options=["page", "sentence", "word", "size"],
|
||||
value="word", # Default value
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="split_length",
|
||||
display_name="Split Length",
|
||||
info="The size of each chunk based on the 'split_by' method",
|
||||
value=200,
|
||||
name="chunk_size",
|
||||
display_name="Chunk size",
|
||||
info="The number of tokens per chunk",
|
||||
value=500,
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="split_overlap",
|
||||
display_name="Split Overlap",
|
||||
info="Number of segments (as determined by the 'split_by' method) to overlap from previous chunk",
|
||||
value=20,
|
||||
name="chunk_overlap",
|
||||
display_name="Chunk Overlap",
|
||||
info="Number of tokens to overlap from previous chunk",
|
||||
value=150,
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="max_character_length",
|
||||
display_name="Max Character Length",
|
||||
info="Maximum number of characters in each chunk",
|
||||
value=1000,
|
||||
BoolInput(
|
||||
name="filter_images",
|
||||
display_name="Filter Images",
|
||||
info="Filter images (see advanced options for filtering criteria).",
|
||||
advanced=True,
|
||||
value=True,
|
||||
),
|
||||
IntInput(
|
||||
name="sentence_window_size",
|
||||
display_name="Sentence Window Size",
|
||||
info="Number of sentences to include from previous and following chunk (when split_by='sentence')",
|
||||
value=0,
|
||||
name="min_image_size",
|
||||
display_name="Minimum Image Size Filter",
|
||||
info="Minimum image width/length in pixels",
|
||||
value=128,
|
||||
advanced=True,
|
||||
),
|
||||
FloatInput(
|
||||
name="min_aspect_ratio",
|
||||
display_name="Minimum Aspect Ratio Filter",
|
||||
info="Minimum allowed aspect ratio (width / height). Images narrower than this will be filtered out.",
|
||||
value=0.2,
|
||||
advanced=True,
|
||||
),
|
||||
FloatInput(
|
||||
name="max_aspect_ratio",
|
||||
display_name="Maximum Aspect Ratio Filter",
|
||||
info="Maximum allowed aspect ratio (width / height). Images taller than this will be filtered out.",
|
||||
value=5.0,
|
||||
advanced=True,
|
||||
),
|
||||
BoolInput(
|
||||
name="dedup_images",
|
||||
display_name="Deduplicate Images",
|
||||
info="Filter duplicated images.",
|
||||
advanced=True,
|
||||
value=True,
|
||||
),
|
||||
BoolInput(
|
||||
name="caption_images",
|
||||
display_name="Caption Images",
|
||||
info="Generate captions for images using the NVIDIA captioning model.",
|
||||
advanced=True,
|
||||
value=True,
|
||||
),
|
||||
]
|
||||
|
||||
outputs = [
|
||||
|
|
@ -117,8 +157,6 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
)
|
||||
raise ImportError(msg) from e
|
||||
|
||||
self.base_url: str | None = self.base_url.strip() if self.base_url else None
|
||||
|
||||
if not file_list:
|
||||
err_msg = "No files to process."
|
||||
self.log(err_msg)
|
||||
|
|
@ -126,50 +164,70 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
|
||||
file_paths = [str(file.path) for file in file_list]
|
||||
|
||||
try:
|
||||
parsed_url = urlparse(self.base_url)
|
||||
if not parsed_url.hostname or not parsed_url.port:
|
||||
err_msg = "Invalid URL: Missing hostname or port."
|
||||
self.log(err_msg)
|
||||
raise ValueError(err_msg)
|
||||
except Exception as e:
|
||||
self.log(f"Error parsing URL: {e}")
|
||||
raise
|
||||
self.base_url: str | None = self.base_url.strip() if self.base_url else None
|
||||
if self.base_url:
|
||||
try:
|
||||
urlparse(self.base_url)
|
||||
except Exception as e:
|
||||
error_msg = f"Invalid Base URL format: {e}"
|
||||
self.log(error_msg)
|
||||
raise ValueError(error_msg) from e
|
||||
else:
|
||||
base_url_error = "Base URL is required"
|
||||
raise ValueError(base_url_error)
|
||||
|
||||
self.log(
|
||||
f"Creating Ingestor for host: {parsed_url.hostname!r}, port: {parsed_url.port!r}",
|
||||
f"Creating Ingestor for Base URL: {self.base_url!r}",
|
||||
)
|
||||
try:
|
||||
from nv_ingest_client.client import Ingestor
|
||||
|
||||
try:
|
||||
ingestor = (
|
||||
Ingestor(message_client_hostname=parsed_url.hostname, message_client_port=parsed_url.port)
|
||||
Ingestor(
|
||||
message_client_kwargs={
|
||||
"base_url": self.base_url,
|
||||
"headers": {"Authorization": f"Bearer {self.api_key}"},
|
||||
"max_retries": 3,
|
||||
"timeout": 60,
|
||||
}
|
||||
)
|
||||
.files(file_paths)
|
||||
.extract(
|
||||
extract_text=self.extract_text,
|
||||
extract_tables=self.extract_tables,
|
||||
extract_charts=self.extract_charts,
|
||||
extract_images=False, # Currently not supported
|
||||
extract_images=self.extract_images,
|
||||
text_depth=self.text_depth,
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
self.log(f"Error creating Ingestor: {e}")
|
||||
raise
|
||||
|
||||
if self.split_text:
|
||||
ingestor = ingestor.split(
|
||||
split_by=self.split_by,
|
||||
split_length=self.split_length,
|
||||
split_overlap=self.split_overlap,
|
||||
max_character_length=self.max_character_length,
|
||||
sentence_window_size=self.sentence_window_size,
|
||||
)
|
||||
if self.extract_images:
|
||||
if self.dedup_images:
|
||||
ingestor = ingestor.dedup(content_type="image", filter=True)
|
||||
|
||||
if self.filter_images:
|
||||
ingestor = ingestor.filter(
|
||||
content_type="image",
|
||||
min_size=self.min_image_size,
|
||||
min_aspect_ratio=self.min_aspect_ratio,
|
||||
max_aspect_ratio=self.max_aspect_ratio,
|
||||
filter=True,
|
||||
)
|
||||
|
||||
if self.caption_images:
|
||||
ingestor = ingestor.caption()
|
||||
|
||||
if self.extract_text and self.split_text:
|
||||
ingestor = ingestor.split(
|
||||
tokenizer="intfloat/e5-large-unsupervised",
|
||||
chunk_size=self.chunk_size,
|
||||
chunk_overlap=self.chunk_overlap,
|
||||
params={"split_source_types": ["PDF"]},
|
||||
)
|
||||
|
||||
try:
|
||||
result = ingestor.ingest()
|
||||
except Exception as e:
|
||||
self.log(f"Error during ingestion: {e}")
|
||||
ingest_error = f"Error during ingestion: {e}"
|
||||
self.log(ingest_error)
|
||||
raise
|
||||
|
||||
self.log(f"Results: {result}")
|
||||
|
|
@ -181,37 +239,55 @@ class NvidiaIngestComponent(BaseFileComponent):
|
|||
# Result is a list of segments as determined by the text_depth option (if "document" then only one segment)
|
||||
# each segment is a list of elements (text, structured, image)
|
||||
for segment in result:
|
||||
for element in segment:
|
||||
document_type = element.get("document_type")
|
||||
metadata = element.get("metadata", {})
|
||||
source_metadata = metadata.get("source_metadata", {})
|
||||
content_metadata = metadata.get("content_metadata", {})
|
||||
if segment:
|
||||
for element in segment:
|
||||
document_type = element.get("document_type")
|
||||
metadata = element.get("metadata", {})
|
||||
source_metadata = metadata.get("source_metadata", {})
|
||||
|
||||
if document_type == document_type_text:
|
||||
data.append(
|
||||
Data(
|
||||
text=metadata.get("content", ""),
|
||||
file_path=source_metadata.get("source_name", ""),
|
||||
document_type=document_type,
|
||||
description=content_metadata.get("description", ""),
|
||||
if document_type == document_type_text:
|
||||
data.append(
|
||||
Data(
|
||||
text=metadata.get("content", ""),
|
||||
file_path=source_metadata.get("source_name", ""),
|
||||
document_type=document_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
)
|
||||
# Both charts and tables are returned as "structured" document type,
|
||||
# with extracted text in "table_content"
|
||||
elif document_type == document_type_structured:
|
||||
table_metadata = metadata.get("table_metadata", {})
|
||||
data.append(
|
||||
Data(
|
||||
text=table_metadata.get("table_content", ""),
|
||||
file_path=source_metadata.get("source_name", ""),
|
||||
document_type=document_type,
|
||||
description=content_metadata.get("description", ""),
|
||||
)
|
||||
)
|
||||
else:
|
||||
# image is not yet supported; skip if encountered
|
||||
self.log(f"Unsupported document type: {document_type}")
|
||||
# Both charts and tables are returned as "structured" document type,
|
||||
# with extracted text in "table_content"
|
||||
elif document_type == document_type_structured:
|
||||
table_metadata = metadata.get("table_metadata", {})
|
||||
|
||||
# reformat chart/table images as binary data
|
||||
if "content" in metadata:
|
||||
metadata["content"] = {"$binary": metadata["content"]}
|
||||
|
||||
data.append(
|
||||
Data(
|
||||
text=table_metadata.get("table_content", ""),
|
||||
file_path=source_metadata.get("source_name", ""),
|
||||
document_type=document_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
elif document_type == "image":
|
||||
image_metadata = metadata.get("image_metadata", {})
|
||||
|
||||
# reformat images as binary data
|
||||
if "content" in metadata:
|
||||
metadata["content"] = {"$binary": metadata["content"]}
|
||||
|
||||
data.append(
|
||||
Data(
|
||||
text=image_metadata.get("caption", "No caption available"),
|
||||
file_path=source_metadata.get("source_name", ""),
|
||||
document_type=document_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.log(f"Unsupported document type {document_type}")
|
||||
self.status = data or "No data"
|
||||
|
||||
# merge processed data with BaseFile objects
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue