From 9b5cfa58b57cfb5bd98d323cfa71ccca7c60de5b Mon Sep 17 00:00:00 2001
From: Edwin Jose
Date: Fri, 23 May 2025 15:36:31 -0400
Subject: [PATCH] feat: News Search Component (#8190)

* Create news_search.py

* add tests

* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
 .../base/langflow/components/data/__init__.py |   2 +
 .../langflow/components/data/news_search.py   | 190 ++++++++++++++++++
 .../unit/components/data/test_news_search.py  |  88 ++++++++
 3 files changed, 280 insertions(+)
 create mode 100644 src/backend/base/langflow/components/data/news_search.py
 create mode 100644 src/backend/tests/unit/components/data/test_news_search.py

diff --git a/src/backend/base/langflow/components/data/__init__.py b/src/backend/base/langflow/components/data/__init__.py
index cd5acd3fe..6e90f0426 100644
--- a/src/backend/base/langflow/components/data/__init__.py
+++ b/src/backend/base/langflow/components/data/__init__.py
@@ -3,6 +3,7 @@ from .csv_to_data import CSVToDataComponent
 from .directory import DirectoryComponent
 from .file import FileComponent
 from .json_to_data import JSONToDataComponent
+from .news_search import NewsSearchComponent
 from .rss import RSSReaderComponent
 from .sql_executor import SQLComponent
 from .url import URLComponent
@@ -15,6 +16,7 @@ __all__ = [
     "DirectoryComponent",
     "FileComponent",
     "JSONToDataComponent",
+    "NewsSearchComponent",
     "RSSReaderComponent",
     "SQLComponent",
     "URLComponent",
diff --git a/src/backend/base/langflow/components/data/news_search.py b/src/backend/base/langflow/components/data/news_search.py
new file mode 100644
index 000000000..df3100d56
--- /dev/null
+++ b/src/backend/base/langflow/components/data/news_search.py
@@ -0,0 +1,190 @@
+from urllib.parse import quote, urlencode
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+from langflow.custom import Component
+from langflow.io import IntInput, MessageTextInput, Output
+from langflow.schema import DataFrame
+
+GOOGLE_NEWS_RSS = "https://news.google.com/rss"
+ARTICLE_COLUMNS = ["title", "link", "published", "summary"]
+DEFAULT_TIMEOUT = 5
+
+
+class NewsSearchComponent(Component):
+    """Fetch a Google News RSS feed and expose its articles as a DataFrame.
+
+    One feed is requested per run, chosen in this order of precedence: ``topic``
+    (headline section), then ``location`` (geo section), then the keyword ``query``.
+    Failures are reported as a single placeholder row rather than raised, so the
+    output always carries the same four columns.
+    """
+
+    display_name = "News Search"
+    description = "Searches Google News via RSS. Returns clean article data."
+    icon = "newspaper"
+    name = "NewsSearch"
+
+    inputs = [
+        MessageTextInput(
+            name="query",
+            display_name="Search Query",
+            info="Search keywords for news articles.",
+            tool_mode=True,
+            input_types=[],
+            required=True,
+        ),
+        MessageTextInput(
+            name="hl",
+            display_name="Language (hl)",
+            info="Language code, e.g. en-US, fr, de. Default: en-US.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="gl",
+            display_name="Country (gl)",
+            info="Country code, e.g. US, FR, DE. Default: US.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="ceid",
+            display_name="Country:Language (ceid)",
+            info="e.g. US:en, FR:fr. Default: US:en.",
+            tool_mode=False,
+            value="US:en",
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="topic",
+            display_name="Topic",
+            info="One of: WORLD, NATION, BUSINESS, TECHNOLOGY, ENTERTAINMENT, SCIENCE, SPORTS, HEALTH.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="location",
+            display_name="Location (Geo)",
+            info="City, state, or country for location-based news. Leave blank for keyword search.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        IntInput(
+            name="timeout",
+            display_name="Timeout",
+            info="Timeout for the request in seconds.",
+            value=5,
+            required=False,
+            advanced=True,
+        ),
+    ]
+
+    outputs = [Output(name="articles", display_name="News Articles", method="search_news")]
+
+    def search_news(self) -> DataFrame:
+        """Fetch the selected Google News RSS feed and return its articles.
+
+        Returns:
+            DataFrame: One row per article with the columns ``title``, ``link``,
+            ``published`` and ``summary``. If no feed can be selected, the request
+            fails, or the feed has no items, a single placeholder row describing
+            the outcome is returned instead of raising.
+        """
+        rss_url = self._build_rss_url()
+        if not rss_url:
+            message = "No search query, topic, or location provided."
+            return self._placeholder_frame("Error", message, message)
+
+        # Fall back to the default when the optional timeout input is unset or 0.
+        timeout = getattr(self, "timeout", None) or DEFAULT_TIMEOUT
+        try:
+            response = requests.get(rss_url, timeout=timeout)
+            response.raise_for_status()
+            items = BeautifulSoup(response.content, "xml").find_all("item")
+        except requests.RequestException as e:
+            return self._placeholder_frame("Error", str(e), f"Failed to fetch news: {e}")
+        except (AttributeError, ValueError, TypeError) as e:
+            return self._placeholder_frame("Error", str(e), f"Unexpected error: {e!s}")
+
+        if not items:
+            return self._placeholder_frame("No articles found", "", "No news articles found.")
+
+        articles = []
+        for item in items:
+            try:
+                articles.append(
+                    {
+                        "title": self.clean_html(item.title.text if item.title else ""),
+                        "link": item.link.text if item.link else "",
+                        "published": item.pubDate.text if item.pubDate else "",
+                        "summary": self.clean_html(item.description.text if item.description else ""),
+                    }
+                )
+            except (AttributeError, ValueError, TypeError) as e:
+                # One malformed item must not discard the rest of the feed.
+                self.log(f"Error parsing article: {e!s}")
+
+        # Explicit columns keep the schema stable even if every item failed to parse.
+        df_articles = pd.DataFrame(articles, columns=ARTICLE_COLUMNS)
+        self.log(f"Found {len(df_articles)} articles.")
+        return DataFrame(df_articles)
+
+    def clean_html(self, html_string: str) -> str:
+        """Return the visible text of ``html_string`` with all markup removed."""
+        if not html_string:
+            return ""
+        return BeautifulSoup(html_string, "html.parser").get_text(separator=" ", strip=True)
+
+    def _read_text(self, name: str) -> str:
+        """Return the input called ``name`` as stripped text, or "" when it is unset."""
+        value = getattr(self, name, None)
+        return str(value).strip() if value else ""
+
+    def _build_rss_url(self) -> str:
+        """Return the feed URL for the current inputs, or "" when none is usable.
+
+        ``topic`` takes precedence over ``location``, which takes precedence over
+        ``query``. The locale parameters fall back to ``en-US`` / ``US`` and a
+        ``ceid`` derived from them when left blank.
+        """
+        hl = self._read_text("hl") or "en-US"
+        gl = self._read_text("gl") or "US"
+        ceid = self._read_text("ceid") or f"{gl}:{hl.split('-')[0]}"
+        params = {"hl": hl, "gl": gl, "ceid": ceid}
+
+        topic = self._read_text("topic")
+        location = self._read_text("location")
+        query = self._read_text("query")
+        if topic:
+            # Path segments are percent-encoded: "+" only means a space in a query string.
+            section = f"headlines/section/topic/{quote(topic.upper(), safe='')}"
+        elif location:
+            section = f"headlines/section/geo/{quote(location, safe='')}"
+        elif query:
+            # Keyword search carries the query as a parameter rather than in the path.
+            section = "search"
+            params = {"q": query, **params}
+        else:
+            return ""
+        # urlencode escapes the values so stray spaces or "&" cannot corrupt the URL.
+        return f"{GOOGLE_NEWS_RSS}/{section}?{urlencode(params)}"
+
+    def _placeholder_frame(self, title: str, summary: str, status: str) -> DataFrame:
+        """Record ``status`` on the component and return a one-row frame describing it."""
+        self.status = status
+        self.log(self.status)
+        row = {"title": title, "link": "", "published": "", "summary": summary}
+        return DataFrame(pd.DataFrame([row], columns=ARTICLE_COLUMNS))
diff --git a/src/backend/tests/unit/components/data/test_news_search.py b/src/backend/tests/unit/components/data/test_news_search.py
new file mode 100644
index 000000000..bb58dcf55
--- /dev/null
+++ 
b/src/backend/tests/unit/components/data/test_news_search.py
@@ -0,0 +1,88 @@
+from unittest.mock import Mock, patch
+
+import pytest
+import requests
+from langflow.components.data.news_search import NewsSearchComponent
+from langflow.schema import DataFrame
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestNewsSearchComponent(ComponentTestBaseWithoutClient):
+    """Unit tests for NewsSearchComponent with the HTTP request mocked out."""
+
+    @pytest.fixture
+    def component_class(self):
+        return NewsSearchComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        return {"query": "OpenAI"}
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        return []
+
+    def test_successful_news_search(self):
+        """A feed with two items produces two rows with the expected columns."""
+        # Mock Google News RSS feed content
+        mock_rss_content = """<?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0">
+            <channel>
+                <item>
+                    <title>Test News 1</title>
+                    <link>https://example.com/1</link>
+                    <pubDate>2024-03-20</pubDate>
+                    <description>Summary 1</description>
+                </item>
+                <item>
+                    <title>Test News 2</title>
+                    <link>https://example.com/2</link>
+                    <pubDate>2024-03-21</pubDate>
+                    <description>Summary 2</description>
+                </item>
+            </channel>
+        </rss>
+        """
+        mock_response = Mock()
+        mock_response.content = mock_rss_content.encode("utf-8")
+        mock_response.raise_for_status = Mock()
+
+        with patch("requests.get", return_value=mock_response):
+            component = NewsSearchComponent(query="OpenAI")
+            result = component.search_news()
+            assert isinstance(result, DataFrame)
+            assert len(result) == 2
+            assert list(result.columns) == ["title", "link", "published", "summary"]
+            assert result.iloc[0]["title"] == "Test News 1"
+            assert result.iloc[1]["title"] == "Test News 2"
+
+    def test_news_search_error(self):
+        """A failed request is reported as a single "Error" row, not raised."""
+        with patch("requests.get", side_effect=requests.RequestException("Network error")):
+            component = NewsSearchComponent(query="OpenAI")
+            result = component.search_news()
+            assert isinstance(result, DataFrame)
+            assert len(result) == 1
+            assert result.iloc[0]["title"] == "Error"
+            assert "Network error" in result.iloc[0]["summary"]
+
+    def test_empty_news_results(self):
+        """A feed without items produces the "No articles found" placeholder row."""
+        # Mock empty RSS feed
+        mock_rss_content = """<?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0">
+            <channel>
+            </channel>
+        </rss>
+        """
+        mock_response = Mock()
+        mock_response.content = mock_rss_content.encode("utf-8")
+        mock_response.raise_for_status = Mock()
+
+        with patch("requests.get", return_value=mock_response):
+            component = NewsSearchComponent(query="OpenAI")
+            result = component.search_news()
+            assert isinstance(result, DataFrame)
+            assert len(result) == 1
+            assert result.iloc[0]["title"] == "No articles found"