From 65c3aa478e4ec7d274fc9c004e4126f776d781bc Mon Sep 17 00:00:00 2001 From: Edwin Jose Date: Mon, 12 May 2025 14:55:16 -0400 Subject: [PATCH] feat: add user agents to URL Component header (#7590) * Update url.py * Update url.py * format fix * added dataframe table input for header. * Update url.py * [autofix.ci] apply automated fixes * Update url.py * Update url.py * remove unused variables --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> --- .../base/langflow/components/data/url.py | 52 +++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/backend/base/langflow/components/data/url.py b/src/backend/base/langflow/components/data/url.py index 172edf919..cd590522d 100644 --- a/src/backend/base/langflow/components/data/url.py +++ b/src/backend/base/langflow/components/data/url.py @@ -1,15 +1,18 @@ import re +import requests from bs4 import BeautifulSoup from langchain_community.document_loaders import RecursiveUrlLoader from loguru import logger from langflow.custom.custom_component.component import Component from langflow.helpers.data import data_to_text +from langflow.inputs.inputs import TableInput from langflow.io import BoolInput, DropdownInput, IntInput, MessageTextInput, Output from langflow.schema import Data from langflow.schema.dataframe import DataFrame from langflow.schema.message import Message +from langflow.services.deps import get_settings_service class URLComponent(Component): @@ -73,6 +76,36 @@ class URLComponent(Component): value="Text", advanced=True, ), + IntInput( + name="timeout", + display_name="Timeout", + info="Timeout for the request in seconds.", + value=30, + required=False, + advanced=True, + ), + TableInput( + name="headers", + display_name="Headers", + info="The headers to send with the request", + table_schema=[ + { + "name": "key", + "display_name": "Header", + "type": "str", + "description": "Header name", + }, + { + "name": "value", + "display_name": "Value", + "type": "str", + "description": "Header value", + }, + ], + value=[{"key": "User-Agent", "value": get_settings_service().settings.user_agent}], + advanced=True, + input_types=["DataFrame"], + ), ] outputs = [ @@ -116,18 +149,29 @@ class URLComponent(Component): logger.info(msg) extractor = (lambda x: x) if self.format == "HTML" else (lambda x: BeautifulSoup(x, "lxml").get_text()) + headers_dict = {header["key"]: header["value"] for header in self.headers} loader = RecursiveUrlLoader( url=processed_url, max_depth=self.max_depth, prevent_outside=self.prevent_outside, use_async=self.use_async, extractor=extractor, + timeout=self.timeout, + headers=headers_dict, ) - docs = loader.load() - msg = f"Found {len(docs)} documents from {processed_url}" - logger.info(msg) - all_docs.extend(docs) + try: + docs = loader.load() + if not docs: + msg = f"No documents found for {processed_url}" + logger.warning(msg) + else: + msg = f"Found {len(docs)} documents from {processed_url}" + logger.info(msg) + all_docs.extend(docs) + except requests.exceptions.RequestException as e: + msg = f"Error loading documents from {processed_url}: {e}" + logger.exception(msg) data = [Data(text=doc.page_content, **doc.metadata) for doc in all_docs] self.status = data