From fabf8641d27f9fa3aa06c6a20555a06fee112109 Mon Sep 17 00:00:00 2001
From: Lucas Oliveira <62335616+lucaseduoli@users.noreply.github.com>
Date: Mon, 19 May 2025 11:56:05 -0300
Subject: [PATCH] fix: add error handling in url component (#8073)

* Fixed URL component not loading PDF

* Implemented generating error when cant load document

* removed pdf fetching form url component

* Update src/backend/base/langflow/components/data/url.py

Co-authored-by: Gabriel Luiz Freitas Almeida

* Fixed indentation

* Refactor error handling in URLComponent to improve clarity of error messages

---------

Co-authored-by: Gabriel Luiz Freitas Almeida
---
 .../base/langflow/components/data/url.py      | 35 ++++++++++++++++---
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/src/backend/base/langflow/components/data/url.py b/src/backend/base/langflow/components/data/url.py
index cd590522d..702edb5c1 100644
--- a/src/backend/base/langflow/components/data/url.py
+++ b/src/backend/base/langflow/components/data/url.py
@@ -1,6 +1,6 @@
 import re
 
-import requests
+import httpx
 from bs4 import BeautifulSoup
 from langchain_community.document_loaders import RecursiveUrlLoader
 from loguru import logger
@@ -144,17 +144,28 @@ class URLComponent(Component):
             if not urls:
                 raise ValueError(no_urls_msg)
 
+            # If there's only one URL, we'll make sure to propagate any errors
+            single_url = len(urls) == 1
+
             for processed_url in urls:
                 msg = f"Loading documents from {processed_url}"
                 logger.info(msg)
 
-                extractor = (lambda x: x) if self.format == "HTML" else (lambda x: BeautifulSoup(x, "lxml").get_text())
+                # Create headers dictionary
                 headers_dict = {header["key"]: header["value"] for header in self.headers}
+
+                # Configure RecursiveUrlLoader with httpx-compatible settings
+                extractor = (lambda x: x) if self.format == "HTML" else (lambda x: BeautifulSoup(x, "lxml").get_text())
+
+                # Modified settings for RecursiveUrlLoader
+                # Note: We need to pass a compatible client or settings to RecursiveUrlLoader
+                # This will depend on how RecursiveUrlLoader is implemented
                 loader = RecursiveUrlLoader(
                     url=processed_url,
                     max_depth=self.max_depth,
                     prevent_outside=self.prevent_outside,
                     use_async=self.use_async,
+                    continue_on_failure=not single_url,
                     extractor=extractor,
                     timeout=self.timeout,
                     headers=headers_dict,
@@ -165,19 +176,35 @@ class URLComponent(Component):
                     if not docs:
                         msg = f"No documents found for {processed_url}"
                         logger.warning(msg)
+                        if single_url:
+                            message = f"No documents found for {processed_url}"
+                            raise ValueError(message)
                     else:
                         msg = f"Found {len(docs)} documents from {processed_url}"
                         logger.info(msg)
 
                     all_docs.extend(docs)
-                except requests.exceptions.RequestException as e:
+                except (httpx.HTTPError, httpx.RequestError) as e:
                     msg = f"Error loading documents from {processed_url}: {e}"
                     logger.exception(msg)
+                    if single_url:
+                        raise  # Re-raise the exception if it's the only URL
+                except UnicodeDecodeError as e:
+                    msg = f"Error decoding content from {processed_url}: {e}"
+                    logger.error(msg)
+                    if single_url:
+                        raise  # Re-raise the exception if it's the only URL
+                except Exception as e:
+                    msg = f"Unexpected error loading documents from {processed_url}: {e}"
+                    logger.exception(msg)
+                    if single_url:
+                        raise  # Re-raise the exception if it's the only URL
 
             data = [Data(text=doc.page_content, **doc.metadata) for doc in all_docs]
             self.status = data
         except Exception as e:
-            msg = f"Error loading documents: {e!s}"
+            error_msg = e.message if hasattr(e, "message") else e
+            msg = f"Error loading documents: {error_msg!s}"
             logger.exception(msg)
             raise ValueError(msg) from e
 