feat: New Web search component (#8135)
* web search component * update tests * Update test_web_search.py * [autofix.ci] apply automated fixes * add timeout input, sanitize query, type check --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Yuqi Tang <yuqi.tang@datastax.com>
This commit is contained in:
parent
6c58c6efc3
commit
dec42bf447
3 changed files with 159 additions and 0 deletions
|
|
@ -6,6 +6,7 @@ from .json_to_data import JSONToDataComponent
|
|||
from .rss import RSSReaderComponent
|
||||
from .sql_executor import SQLComponent
|
||||
from .url import URLComponent
|
||||
from .web_search import WebSearchComponent
|
||||
from .webhook import WebhookComponent
|
||||
|
||||
__all__ = [
|
||||
|
|
@ -17,5 +18,6 @@ __all__ = [
|
|||
"RSSReaderComponent",
|
||||
"SQLComponent",
|
||||
"URLComponent",
|
||||
"WebSearchComponent",
|
||||
"WebhookComponent",
|
||||
]
|
||||
|
|
|
|||
115
src/backend/base/langflow/components/data/web_search.py
Normal file
115
src/backend/base/langflow/components/data/web_search.py
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
import re
|
||||
from urllib.parse import parse_qs, unquote, urlparse
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from langflow.custom import Component
|
||||
from langflow.io import IntInput, MessageTextInput, Output
|
||||
from langflow.schema import DataFrame
|
||||
from langflow.services.deps import get_settings_service
|
||||
|
||||
|
||||
class WebSearchComponent(Component):
    """Basic DuckDuckGo web search via HTML scraping.

    Queries the html.duckduckgo.com HTML endpoint, follows each result
    link, and returns a DataFrame with one row per result: title, link,
    snippet, and the fetched page content.
    """

    display_name = "Web Search"
    description = (
        "Performs a basic DuckDuckGo search (HTML scraping) "
        "and returns a DataFrame of titles, links, snippets, and fetched content."
    )
    icon = "search"
    name = "WebSearchNoAPI"

    inputs = [
        MessageTextInput(
            name="query",
            display_name="Search Query",
            info="Keywords to search for.",
            tool_mode=True,
            input_types=[],
            required=True,
        ),
        IntInput(
            name="timeout",
            display_name="Timeout",
            info="Timeout for the web search request.",
            value=5,
            advanced=True,
        ),
    ]

    outputs = [Output(name="results", display_name="Search Results", method="perform_search")]

    # Compiled once at class definition time instead of on every
    # validate_url() call. Deliberately permissive: scheme, www, port,
    # and path are all optional.
    _URL_REGEX = re.compile(
        r"^(https?:\/\/)?" r"(www\.)?" r"([a-zA-Z0-9.-]+)" r"(\.[a-zA-Z]{2,})?" r"(:\d+)?" r"(\/[^\s]*)?$",
        re.IGNORECASE,
    )

    def validate_url(self, string: str) -> bool:
        """Return True if *string* looks like a plausible HTTP(S) URL."""
        return bool(self._URL_REGEX.match(string))

    def ensure_url(self, url: str) -> str:
        """Normalize *url* to an explicit-scheme form and validate it.

        Prepends "https://" when no scheme is present.

        Raises:
            ValueError: If the normalized URL fails validation.
        """
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        if not self.validate_url(url):
            msg = f"Invalid URL: {url}"
            raise ValueError(msg)
        return url

    def _sanitize_query(self, query: str) -> str:
        """Sanitize search query."""
        # Remove potentially dangerous characters (angle brackets, quotes).
        return re.sub(r'[<>"\']', "", query.strip())

    def perform_search(self) -> DataFrame:
        """Run the search and return a DataFrame of results.

        On a failed search request, or a response that is not HTML, a
        single-row "Error" DataFrame is returned instead of raising.
        Per-result fetch failures produce an error placeholder in that
        row's "content" column.

        Raises:
            ValueError: If the sanitized query is empty.
        """
        query = self._sanitize_query(self.query)
        if not query:
            msg = "Empty search query"
            raise ValueError(msg)
        headers = {"User-Agent": get_settings_service().settings.user_agent}
        params = {"q": query, "kl": "us-en"}
        url = "https://html.duckduckgo.com/html/"

        try:
            response = requests.get(url, params=params, headers=headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            self.status = f"Failed request: {e!s}"
            return DataFrame(pd.DataFrame([{"title": "Error", "link": "", "snippet": str(e), "content": ""}]))

        if not response.text or "text/html" not in response.headers.get("content-type", "").lower():
            self.status = "No results found"
            return DataFrame(
                pd.DataFrame([{"title": "Error", "link": "", "snippet": "No results found", "content": ""}])
            )
        soup = BeautifulSoup(response.text, "html.parser")
        results = []

        for result in soup.select("div.result"):
            title_tag = result.select_one("a.result__a")
            snippet_tag = result.select_one("a.result__snippet")
            if title_tag:
                raw_link = title_tag.get("href", "")
                # DuckDuckGo wraps outbound links in a redirect; the real
                # target is URL-encoded in the "uddg" query parameter.
                parsed = urlparse(raw_link)
                uddg = parse_qs(parsed.query).get("uddg", [""])[0]
                decoded_link = unquote(uddg) if uddg else raw_link

                # Also catch ValueError (raised by ensure_url for a
                # malformed link) so one bad result does not abort the
                # whole search; it degrades to an error row like any
                # other per-result fetch failure.
                try:
                    final_url = self.ensure_url(decoded_link)
                    page = requests.get(final_url, headers=headers, timeout=self.timeout)
                    page.raise_for_status()
                    content = BeautifulSoup(page.text, "lxml").get_text(separator=" ", strip=True)
                except (requests.RequestException, ValueError) as e:
                    final_url = decoded_link
                    # Fixed: the original f-string was missing the closing ")".
                    content = f"(Failed to fetch: {e!s})"

                results.append(
                    {
                        "title": title_tag.get_text(strip=True),
                        "link": final_url,
                        "snippet": snippet_tag.get_text(strip=True) if snippet_tag else "",
                        "content": content,
                    }
                )

        df_results = pd.DataFrame(results)
        return DataFrame(df_results)
|
||||
42
src/backend/tests/unit/components/data/test_web_search.py
Normal file
42
src/backend/tests/unit/components/data/test_web_search.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import pytest
|
||||
from langflow.components.data.web_search import WebSearchComponent
|
||||
from langflow.schema import DataFrame
|
||||
|
||||
from tests.base import ComponentTestBaseWithoutClient
|
||||
|
||||
|
||||
class TestWebSearchComponent(ComponentTestBaseWithoutClient):
    """Tests for the WebSearchComponent."""

    @pytest.fixture
    def component_class(self):
        """The component class under test."""
        return WebSearchComponent

    @pytest.fixture
    def default_kwargs(self):
        """Default construction kwargs for the component."""
        return {"query": "OpenAI GPT-4"}

    @pytest.fixture
    def file_names_mapping(self):
        """No version-mapped files exist for this component."""
        return []

    async def test_invalid_url_handling(self):
        """ensure_url rejects a URL with a malformed scheme."""
        component = WebSearchComponent()
        bad_url = "htp://invalid-url"
        with pytest.raises(ValueError, match="Invalid URL"):
            component.ensure_url(bad_url)

    def test_successful_web_search(self):
        """A live search returns a non-empty DataFrame."""
        component = WebSearchComponent()
        component.query = "OpenAI GPT-4"
        search_result = component.perform_search()
        assert isinstance(search_result, DataFrame)
        assert not search_result.empty
||||
Loading…
Add table
Add a link
Reference in a new issue