diff --git a/src/backend/base/langflow/components/data/__init__.py b/src/backend/base/langflow/components/data/__init__.py
index 4e2b86499..cd5acd3fe 100644
--- a/src/backend/base/langflow/components/data/__init__.py
+++ b/src/backend/base/langflow/components/data/__init__.py
@@ -6,6 +6,7 @@ from .json_to_data import JSONToDataComponent
 from .rss import RSSReaderComponent
 from .sql_executor import SQLComponent
 from .url import URLComponent
+from .web_search import WebSearchComponent
 from .webhook import WebhookComponent
 
 __all__ = [
@@ -17,5 +18,6 @@ __all__ = [
     "RSSReaderComponent",
     "SQLComponent",
     "URLComponent",
+    "WebSearchComponent",
     "WebhookComponent",
 ]
diff --git a/src/backend/base/langflow/components/data/web_search.py b/src/backend/base/langflow/components/data/web_search.py
new file mode 100644
index 000000000..9f94f071b
--- /dev/null
+++ b/src/backend/base/langflow/components/data/web_search.py
@@ -0,0 +1,115 @@
+import re
+from urllib.parse import parse_qs, unquote, urlparse
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+from langflow.custom import Component
+from langflow.io import IntInput, MessageTextInput, Output
+from langflow.schema import DataFrame
+from langflow.services.deps import get_settings_service
+
+
+class WebSearchComponent(Component):
+    display_name = "Web Search"
+    description = (
+        "Performs a basic DuckDuckGo search (HTML scraping) "
+        "and returns a DataFrame of titles, links, snippets, and fetched content."
+    )
+    icon = "search"
+    name = "WebSearchNoAPI"
+
+    inputs = [
+        MessageTextInput(
+            name="query",
+            display_name="Search Query",
+            info="Keywords to search for.",
+            tool_mode=True,
+            input_types=[],
+            required=True,
+        ),
+        IntInput(
+            name="timeout",
+            display_name="Timeout",
+            info="Timeout for the web search request.",
+            value=5,
+            advanced=True,
+        ),
+    ]
+
+    outputs = [Output(name="results", display_name="Search Results", method="perform_search")]
+
+    def validate_url(self, string: str) -> bool:
+        url_regex = re.compile(
+            r"^(https?:\/\/)?"  # optional scheme
+            r"(www\.)?"  # optional www prefix
+            r"([a-zA-Z0-9.-]+)"  # host
+            r"(\.[a-zA-Z]{2,})?"  # optional top-level domain
+            r"(:\d+)?"  # optional port
+            r"(\/[^\s]*)?$",  # optional path
+            re.IGNORECASE,
+        )
+        return bool(url_regex.match(string))
+
+    def ensure_url(self, url: str) -> str:
+        if not url.startswith(("http://", "https://")):
+            url = "https://" + url
+        if not self.validate_url(url):
+            msg = f"Invalid URL: {url}"
+            raise ValueError(msg)
+        return url
+
+    def _sanitize_query(self, query: str) -> str:
+        """Sanitize the search query."""
+        # Remove potentially dangerous characters
+        return re.sub(r'[<>"\']', "", query.strip())
+
+    def perform_search(self) -> DataFrame:
+        query = self._sanitize_query(self.query)
+        if not query:
+            msg = "Empty search query"
+            raise ValueError(msg)
+        headers = {"User-Agent": get_settings_service().settings.user_agent}
+        params = {"q": query, "kl": "us-en"}
+        url = "https://html.duckduckgo.com/html/"
+
+        try:
+            response = requests.get(url, params=params, headers=headers, timeout=self.timeout)
+            response.raise_for_status()
+        except requests.RequestException as e:
+            self.status = f"Failed request: {e!s}"
+            return DataFrame(pd.DataFrame([{"title": "Error", "link": "", "snippet": str(e), "content": ""}]))
+
+        if not response.text or "text/html" not in response.headers.get("content-type", "").lower():
+            self.status = "No results found"
+            return DataFrame(
+                pd.DataFrame([{"title": "Error", "link": "", "snippet": "No results found", "content": ""}])
+            )
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        results = []
+
+        for result in soup.select("div.result"):
+            title_tag = result.select_one("a.result__a")
+            snippet_tag = result.select_one("a.result__snippet")
+            if title_tag:
+                # DuckDuckGo's HTML endpoint wraps result links in a redirect;
+                # the real target URL is carried in the "uddg" query parameter.
+                raw_link = title_tag.get("href", "")
+                parsed = urlparse(raw_link)
+                uddg = parse_qs(parsed.query).get("uddg", [""])[0]
+                decoded_link = unquote(uddg) if uddg else raw_link
+
+                try:
+                    final_url = self.ensure_url(decoded_link)
+                    page = requests.get(final_url, headers=headers, timeout=self.timeout)
+                    page.raise_for_status()
+                    content = BeautifulSoup(page.text, "lxml").get_text(separator=" ", strip=True)
+                except (requests.RequestException, ValueError) as e:
+                    # Keep the row and record the failure rather than aborting the whole search.
+                    final_url = decoded_link
+                    content = f"(Failed to fetch: {e!s})"
+
+                results.append(
+                    {
+                        "title": title_tag.get_text(strip=True),
+                        "link": final_url,
+                        "snippet": snippet_tag.get_text(strip=True) if snippet_tag else "",
+                        "content": content,
+                    }
+                )
+
+        df_results = pd.DataFrame(results)
+        return DataFrame(df_results)
diff --git a/src/backend/tests/unit/components/data/test_web_search.py b/src/backend/tests/unit/components/data/test_web_search.py
new file mode 100644
index 000000000..6b6e73f0a
--- /dev/null
+++ b/src/backend/tests/unit/components/data/test_web_search.py
@@ -0,0 +1,42 @@
+import pytest
+from langflow.components.data.web_search import WebSearchComponent
+from langflow.schema import DataFrame
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestWebSearchComponent(ComponentTestBaseWithoutClient):
+    @pytest.fixture
+    def component_class(self):
+        """Return the component class to test."""
+        return WebSearchComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        """Return the default kwargs for the component."""
+        return {
+            "query": "OpenAI GPT-4",
+        }
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        """Return the file names mapping for the component."""
+        return []
+
+    async def test_invalid_url_handling(self):
+        # Create a test instance of the component
+        component = WebSearchComponent()
+
+        # Set an invalid URL
+        invalid_url = "htp://invalid-url"
+
+        # Ensure the invalid URL is rejected
+        with pytest.raises(ValueError, match="Invalid URL"):
+            component.ensure_url(invalid_url)
+
+    def test_successful_web_search(self):
+        component = WebSearchComponent()
+        component.query = "OpenAI GPT-4"
+        result = component.perform_search()
+        assert isinstance(result, DataFrame)
+        assert not result.empty
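For reviewers, a minimal sketch of how the new component can be driven outside a flow, mirroring test_successful_web_search above. It assumes query and timeout can be set as plain attributes (the unit test sets query this way; setting timeout the same way is an assumption) and that the returned DataFrame supports pandas-style access, which the test's use of result.empty suggests. Note that this performs live HTTP requests to DuckDuckGo and to each result page.

    from langflow.components.data.web_search import WebSearchComponent

    component = WebSearchComponent()
    component.query = "langflow web search"  # any keyword string; cleaned by _sanitize_query()
    component.timeout = 10  # assumption: overrides the IntInput default of 5 seconds per request

    df = component.perform_search()
    # langflow's DataFrame wraps pandas, so the usual column access applies
    print(df[["title", "link", "snippet"]].head())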