feat: add user agents to URL Component header (#7590)

* Update url.py

* Update url.py

* format fix

* Added DataFrame table input for headers.

* Update url.py

* [autofix.ci] apply automated fixes

* Update url.py

* Update url.py

* remove unused variables

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Author: Edwin Jose
Date: 2025-05-12 14:55:16 -04:00 (committed via GitHub)
Commit: 65c3aa478e

@@ -1,15 +1,18 @@
 import re
+import requests
 from bs4 import BeautifulSoup
 from langchain_community.document_loaders import RecursiveUrlLoader
 from loguru import logger
 from langflow.custom.custom_component.component import Component
 from langflow.helpers.data import data_to_text
+from langflow.inputs.inputs import TableInput
 from langflow.io import BoolInput, DropdownInput, IntInput, MessageTextInput, Output
 from langflow.schema import Data
 from langflow.schema.dataframe import DataFrame
 from langflow.schema.message import Message
+from langflow.services.deps import get_settings_service
 class URLComponent(Component):
@@ -73,6 +76,36 @@ class URLComponent(Component):
             value="Text",
             advanced=True,
         ),
+        IntInput(
+            name="timeout",
+            display_name="Timeout",
+            info="Timeout for the request in seconds.",
+            value=30,
+            required=False,
+            advanced=True,
+        ),
+        TableInput(
+            name="headers",
+            display_name="Headers",
+            info="The headers to send with the request",
+            table_schema=[
+                {
+                    "name": "key",
+                    "display_name": "Header",
+                    "type": "str",
+                    "description": "Header name",
+                },
+                {
+                    "name": "value",
+                    "display_name": "Value",
+                    "type": "str",
+                    "description": "Header value",
+                },
+            ],
+            value=[{"key": "User-Agent", "value": get_settings_service().settings.user_agent}],
+            advanced=True,
+            input_types=["DataFrame"],
+        ),
     ]
     outputs = [
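
Note: the new Headers table above is consumed in the next hunk as a plain dict via a key/value comprehension. A minimal standalone sketch of that conversion, assuming each row carries the "key" and "value" fields defined in table_schema; the example rows and user agent string are illustrative, not values from this commit:

# Minimal sketch (not part of the diff): TableInput-style rows -> header dict.
# The rows below are hypothetical examples.
rows = [
    {"key": "User-Agent", "value": "Mozilla/5.0 (compatible; langflow)"},
    {"key": "Accept", "value": "text/html"},
]
headers_dict = {row["key"]: row["value"] for row in rows}
print(headers_dict)
# {'User-Agent': 'Mozilla/5.0 (compatible; langflow)', 'Accept': 'text/html'}
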
@@ -116,18 +149,29 @@ class URLComponent(Component):
             logger.info(msg)
             extractor = (lambda x: x) if self.format == "HTML" else (lambda x: BeautifulSoup(x, "lxml").get_text())
+            headers_dict = {header["key"]: header["value"] for header in self.headers}
             loader = RecursiveUrlLoader(
                 url=processed_url,
                 max_depth=self.max_depth,
                 prevent_outside=self.prevent_outside,
                 use_async=self.use_async,
                 extractor=extractor,
+                timeout=self.timeout,
+                headers=headers_dict,
             )
-            docs = loader.load()
-            msg = f"Found {len(docs)} documents from {processed_url}"
-            logger.info(msg)
-            all_docs.extend(docs)
+            try:
+                docs = loader.load()
+                if not docs:
+                    msg = f"No documents found for {processed_url}"
+                    logger.warning(msg)
+                else:
+                    msg = f"Found {len(docs)} documents from {processed_url}"
+                    logger.info(msg)
+                    all_docs.extend(docs)
+            except requests.exceptions.RequestException as e:
+                msg = f"Error loading documents from {processed_url}: {e}"
+                logger.exception(msg)
         data = [Data(text=doc.page_content, **doc.metadata) for doc in all_docs]
         self.status = data
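
The try/except above changes the failure semantics: a URL whose load raises requests.exceptions.RequestException is logged and skipped instead of aborting the whole run, and an empty result now produces a warning. A minimal sketch of the same per-URL pattern in isolation, with a hypothetical FakeLoader standing in for RecursiveUrlLoader and print standing in for the logger:

import requests

class FakeLoader:
    """Hypothetical stand-in for RecursiveUrlLoader, for illustration only."""

    def __init__(self, url):
        self.url = url

    def load(self):
        if "bad" in self.url:
            # ConnectionError is a subclass of requests.exceptions.RequestException.
            raise requests.exceptions.ConnectionError("unreachable")
        return [f"doc from {self.url}"]

all_docs = []
for url in ["https://example.com", "https://bad.example"]:
    try:
        docs = FakeLoader(url).load()
        if not docs:
            print(f"No documents found for {url}")
        else:
            all_docs.extend(docs)
    except requests.exceptions.RequestException as e:
        # A failed URL is reported and skipped; the loop continues.
        print(f"Error loading documents from {url}: {e}")

print(all_docs)  # only the documents from the reachable URL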