feat: New Web search component (#8135)

* web search component

* update tests

* Update test_web_search.py

* [autofix.ci] apply automated fixes

* add timeout input, sanitize query, type check

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Yuqi Tang <yuqi.tang@datastax.com>
This commit is contained in:
Edwin Jose 2025-05-20 15:08:37 -04:00 committed by GitHub
commit dec42bf447
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 159 additions and 0 deletions

View file

@@ -6,6 +6,7 @@ from .json_to_data import JSONToDataComponent
from .rss import RSSReaderComponent
from .sql_executor import SQLComponent
from .url import URLComponent
from .web_search import WebSearchComponent
from .webhook import WebhookComponent
__all__ = [
@@ -17,5 +18,6 @@ __all__ = [
"RSSReaderComponent",
"SQLComponent",
"URLComponent",
"WebSearchComponent",
"WebhookComponent",
]

View file

@@ -0,0 +1,115 @@
import re
from urllib.parse import parse_qs, unquote, urlparse
import pandas as pd
import requests
from bs4 import BeautifulSoup
from langflow.custom import Component
from langflow.io import IntInput, MessageTextInput, Output
from langflow.schema import DataFrame
from langflow.services.deps import get_settings_service
class WebSearchComponent(Component):
    """Scrape DuckDuckGo's HTML search endpoint and return results as a DataFrame.

    Each row holds a result's title, resolved link, snippet, and the plain-text
    content fetched from the linked page (or a failure note when the fetch fails).
    No API key is required; results come from parsing the public HTML page.
    """

    display_name = "Web Search"
    description = (
        "Performs a basic DuckDuckGo search (HTML scraping) "
        "and returns a DataFrame of titles, links, snippets, and fetched content."
    )
    icon = "search"
    name = "WebSearchNoAPI"

    inputs = [
        MessageTextInput(
            name="query",
            display_name="Search Query",
            info="Keywords to search for.",
            tool_mode=True,
            input_types=[],
            required=True,
        ),
        IntInput(
            name="timeout",
            display_name="Timeout",
            info="Timeout for the web search request.",
            value=5,
            advanced=True,
        ),
    ]

    outputs = [Output(name="results", display_name="Search Results", method="perform_search")]

    # Compiled once at class level so validate_url() does not recompile per call.
    # Intentionally loose: scheme, www, TLD, port, and path are all optional pieces.
    _URL_REGEX = re.compile(
        r"^(https?:\/\/)?" r"(www\.)?" r"([a-zA-Z0-9.-]+)" r"(\.[a-zA-Z]{2,})?" r"(:\d+)?" r"(\/[^\s]*)?$",
        re.IGNORECASE,
    )

    def validate_url(self, string: str) -> bool:
        """Return True if *string* loosely matches a URL shape."""
        return bool(self._URL_REGEX.match(string))

    def ensure_url(self, url: str) -> str:
        """Prefix *url* with ``https://`` when no scheme is present, then validate it.

        Returns:
            The normalized URL.

        Raises:
            ValueError: if the resulting string does not look like a valid URL.
        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        if not self.validate_url(url):
            msg = f"Invalid URL: {url}"
            raise ValueError(msg)
        return url

    def _sanitize_query(self, query: str) -> str:
        """Strip whitespace and drop characters usable for HTML/quote injection."""
        return re.sub(r'[<>"\']', "", query.strip())

    def perform_search(self) -> DataFrame:
        """Search DuckDuckGo and return a DataFrame of results.

        On a failed request or an unusable response, returns a single-row
        error DataFrame instead of raising, so downstream flow steps can
        still run.

        Raises:
            ValueError: if the sanitized query is empty.
        """
        query = self._sanitize_query(self.query)
        if not query:
            msg = "Empty search query"
            raise ValueError(msg)

        headers = {"User-Agent": get_settings_service().settings.user_agent}
        params = {"q": query, "kl": "us-en"}
        url = "https://html.duckduckgo.com/html/"

        try:
            response = requests.get(url, params=params, headers=headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            self.status = f"Failed request: {e!s}"
            return DataFrame(pd.DataFrame([{"title": "Error", "link": "", "snippet": str(e), "content": ""}]))

        if not response.text or "text/html" not in response.headers.get("content-type", "").lower():
            self.status = "No results found"
            return DataFrame(
                pd.DataFrame([{"title": "Error", "link": "", "snippet": "No results found", "content": ""}])
            )

        soup = BeautifulSoup(response.text, "html.parser")
        results = []
        for result in soup.select("div.result"):
            title_tag = result.select_one("a.result__a")
            snippet_tag = result.select_one("a.result__snippet")
            if not title_tag:
                continue
            # DuckDuckGo wraps the real target in a redirect URL; the actual
            # destination is percent-encoded in the "uddg" query parameter.
            raw_link = title_tag.get("href", "")
            parsed = urlparse(raw_link)
            uddg = parse_qs(parsed.query).get("uddg", [""])[0]
            decoded_link = unquote(uddg) if uddg else raw_link
            try:
                final_url = self.ensure_url(decoded_link)
                page = requests.get(final_url, headers=headers, timeout=self.timeout)
                page.raise_for_status()
                content = BeautifulSoup(page.text, "lxml").get_text(separator=" ", strip=True)
            except (requests.RequestException, ValueError) as e:
                # ValueError included so one malformed link skips its page fetch
                # instead of aborting the whole search; keep the row with the
                # link we tried and the failure reason.
                final_url = decoded_link
                content = f"(Failed to fetch: {e!s})"  # fixed: closing paren was missing
            results.append(
                {
                    "title": title_tag.get_text(strip=True),
                    "link": final_url,
                    "snippet": snippet_tag.get_text(strip=True) if snippet_tag else "",
                    "content": content,
                }
            )

        return DataFrame(pd.DataFrame(results))

View file

@@ -0,0 +1,42 @@
import pytest
from langflow.components.data.web_search import WebSearchComponent
from langflow.schema import DataFrame
from tests.base import ComponentTestBaseWithoutClient
class TestWebSearchComponent(ComponentTestBaseWithoutClient):
    """Unit tests for WebSearchComponent — deterministic, no real network I/O."""

    @pytest.fixture
    def component_class(self):
        """Return the component class to test."""
        return WebSearchComponent

    @pytest.fixture
    def default_kwargs(self):
        """Return the default kwargs for the component."""
        return {
            "query": "OpenAI GPT-4",
        }

    @pytest.fixture
    def file_names_mapping(self):
        """Return the file names mapping for the component."""
        return []

    async def test_invalid_url_handling(self):
        """ensure_url must reject strings that do not look like valid URLs."""
        component = WebSearchComponent()
        with pytest.raises(ValueError, match="Invalid URL"):
            component.ensure_url("htp://invalid-url")

    def test_successful_web_search(self):
        """perform_search parses DuckDuckGo-style HTML into a non-empty DataFrame.

        requests.get is patched so the test does not depend on network
        availability or on DuckDuckGo's live markup (the original version
        issued a real HTTP request, making the test slow and flaky).
        """
        from unittest.mock import MagicMock, patch

        component = WebSearchComponent()
        component.query = "OpenAI GPT-4"

        # Minimal HTML matching the selectors perform_search uses
        # (div.result, a.result__a, a.result__snippet).
        search_response = MagicMock()
        search_response.text = (
            '<div class="result">'
            '<a class="result__a" href="https://example.com/article">Example Title</a>'
            '<a class="result__snippet">Example snippet.</a>'
            "</div>"
        )
        search_response.headers = {"content-type": "text/html"}

        page_response = MagicMock()
        page_response.text = "<html><body>Example page content</body></html>"

        # First call: the search request; second call: the result-page fetch.
        with patch("requests.get", side_effect=[search_response, page_response]):
            result = component.perform_search()

        assert isinstance(result, DataFrame)
        assert not result.empty
        assert result.iloc[0]["title"] == "Example Title"
        assert result.iloc[0]["link"] == "https://example.com/article"