feat: add user agents to URL Component header (#7590)
* Update url.py * Update url.py * format fix * added dataframe table input for header. * Update url.py * [autofix.ci] apply automated fixes * Update url.py * Update url.py * remove unused variables --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
116bee825c
commit
65c3aa478e
1 changed files with 48 additions and 4 deletions
|
|
@ -1,15 +1,18 @@
|
|||
import re
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from langchain_community.document_loaders import RecursiveUrlLoader
|
||||
from loguru import logger
|
||||
|
||||
from langflow.custom.custom_component.component import Component
|
||||
from langflow.helpers.data import data_to_text
|
||||
from langflow.inputs.inputs import TableInput
|
||||
from langflow.io import BoolInput, DropdownInput, IntInput, MessageTextInput, Output
|
||||
from langflow.schema import Data
|
||||
from langflow.schema.dataframe import DataFrame
|
||||
from langflow.schema.message import Message
|
||||
from langflow.services.deps import get_settings_service
|
||||
|
||||
|
||||
class URLComponent(Component):
|
||||
|
|
@ -73,6 +76,36 @@ class URLComponent(Component):
|
|||
value="Text",
|
||||
advanced=True,
|
||||
),
|
||||
IntInput(
|
||||
name="timeout",
|
||||
display_name="Timeout",
|
||||
info="Timeout for the request in seconds.",
|
||||
value=30,
|
||||
required=False,
|
||||
advanced=True,
|
||||
),
|
||||
TableInput(
|
||||
name="headers",
|
||||
display_name="Headers",
|
||||
info="The headers to send with the request",
|
||||
table_schema=[
|
||||
{
|
||||
"name": "key",
|
||||
"display_name": "Header",
|
||||
"type": "str",
|
||||
"description": "Header name",
|
||||
},
|
||||
{
|
||||
"name": "value",
|
||||
"display_name": "Value",
|
||||
"type": "str",
|
||||
"description": "Header value",
|
||||
},
|
||||
],
|
||||
value=[{"key": "User-Agent", "value": get_settings_service().settings.user_agent}],
|
||||
advanced=True,
|
||||
input_types=["DataFrame"],
|
||||
),
|
||||
]
|
||||
|
||||
outputs = [
|
||||
|
|
@ -116,18 +149,29 @@ class URLComponent(Component):
|
|||
logger.info(msg)
|
||||
|
||||
extractor = (lambda x: x) if self.format == "HTML" else (lambda x: BeautifulSoup(x, "lxml").get_text())
|
||||
headers_dict = {header["key"]: header["value"] for header in self.headers}
|
||||
loader = RecursiveUrlLoader(
|
||||
url=processed_url,
|
||||
max_depth=self.max_depth,
|
||||
prevent_outside=self.prevent_outside,
|
||||
use_async=self.use_async,
|
||||
extractor=extractor,
|
||||
timeout=self.timeout,
|
||||
headers=headers_dict,
|
||||
)
|
||||
|
||||
docs = loader.load()
|
||||
msg = f"Found {len(docs)} documents from {processed_url}"
|
||||
logger.info(msg)
|
||||
all_docs.extend(docs)
|
||||
try:
|
||||
docs = loader.load()
|
||||
if not docs:
|
||||
msg = f"No documents found for {processed_url}"
|
||||
logger.warning(msg)
|
||||
else:
|
||||
msg = f"Found {len(docs)} documents from {processed_url}"
|
||||
logger.info(msg)
|
||||
all_docs.extend(docs)
|
||||
except requests.exceptions.RequestException as e:
|
||||
msg = f"Error loading documents from {processed_url}: {e}"
|
||||
logger.exception(msg)
|
||||
|
||||
data = [Data(text=doc.page_content, **doc.metadata) for doc in all_docs]
|
||||
self.status = data
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue