feat: News Search Component (#8190)

* Create news_search.py
* add tests
* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
2025-05-23 15:36:31 -04:00 · 2025-05-23 15:36:31 -04:00 · 9b5cfa58b5
commit 9b5cfa58b5
parent 570cdcd6f1
3 changed files with 254 additions and 0 deletions
--- a/src/backend/base/langflow/components/data/__init__.py
+++ b/src/backend/base/langflow/components/data/__init__.py
@@ -3,6 +3,7 @@ from .csv_to_data import CSVToDataComponent
 from .directory import DirectoryComponent
 from .file import FileComponent
 from .json_to_data import JSONToDataComponent
+from .news_search import NewsSearchComponent
 from .rss import RSSReaderComponent
 from .sql_executor import SQLComponent
 from .url import URLComponent
@@ -15,6 +16,7 @@ __all__ = [
    "DirectoryComponent",
    "FileComponent",
    "JSONToDataComponent",
+    "NewsSearchComponent",
    "RSSReaderComponent",
    "SQLComponent",
    "URLComponent",
--- a/src/backend/base/langflow/components/data/news_search.py
+++ b/src/backend/base/langflow/components/data/news_search.py
@@ -0,0 +1,164 @@
+from urllib.parse import quote, quote_plus
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+from langflow.custom import Component
+from langflow.io import IntInput, MessageTextInput, Output
+from langflow.schema import DataFrame
+
+
+class NewsSearchComponent(Component):
+    """Search Google News through its RSS endpoints and return the articles as a DataFrame.
+
+    One feed is requested per run. ``topic`` takes precedence over ``location``,
+    which takes precedence over ``query``; the ``hl``/``gl``/``ceid`` locale
+    parameters are appended to whichever feed is selected.
+    """
+
+    display_name = "News Search"
+    description = "Searches Google News via RSS. Returns clean article data."
+    icon = "newspaper"
+    name = "NewsSearch"
+
+    inputs = [
+        MessageTextInput(
+            name="query",
+            display_name="Search Query",
+            info="Search keywords for news articles.",
+            tool_mode=True,
+            input_types=[],
+            required=True,
+        ),
+        MessageTextInput(
+            name="hl",
+            display_name="Language (hl)",
+            info="Language code, e.g. en-US, fr, de. Default: en-US.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="gl",
+            display_name="Country (gl)",
+            info="Country code, e.g. US, FR, DE. Default: US.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="ceid",
+            display_name="Country:Language (ceid)",
+            info="e.g. US:en, FR:fr. Default: US:en.",
+            tool_mode=False,
+            value="US:en",
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="topic",
+            display_name="Topic",
+            info="One of: WORLD, NATION, BUSINESS, TECHNOLOGY, ENTERTAINMENT, SCIENCE, SPORTS, HEALTH.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        MessageTextInput(
+            name="location",
+            display_name="Location (Geo)",
+            info="City, state, or country for location-based news. Leave blank for keyword search.",
+            tool_mode=False,
+            input_types=[],
+            required=False,
+            advanced=True,
+        ),
+        IntInput(
+            name="timeout",
+            display_name="Timeout",
+            info="Timeout for the request in seconds.",
+            value=5,
+            required=False,
+            advanced=True,
+        ),
+    ]
+
+    outputs = [Output(name="articles", display_name="News Articles", method="search_news")]
+
+    def search_news(self) -> DataFrame:
+        """Fetch the selected feed and return one row per ``<item>`` element.
+
+        Returns:
+            A DataFrame with the columns ``title``, ``link``, ``published`` and
+            ``summary``. When no feed can be selected, the request or the
+            parsing fails, or the feed contains no items, a single placeholder
+            row is returned instead (``title`` is "Error" or "No articles
+            found") and the reason is stored in ``self.status`` and logged.
+        """
+        rss_url = self._build_rss_url()
+        if not rss_url:
+            self.status = "No search query, topic, or location provided."
+            self.log(self.status)
+            return self._placeholder_frame("Error", "No search query, topic, or location provided.")
+
+        try:
+            response = requests.get(rss_url, timeout=self.timeout)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, "xml")
+            items = soup.find_all("item")
+        except requests.RequestException as e:
+            self.status = f"Failed to fetch news: {e}"
+            self.log(self.status)
+            return self._placeholder_frame("Error", str(e))
+        except (AttributeError, ValueError, TypeError) as e:
+            self.status = f"Unexpected error: {e!s}"
+            self.log(self.status)
+            return self._placeholder_frame("Error", str(e))
+
+        if not items:
+            self.status = "No news articles found."
+            self.log(self.status)
+            return self._placeholder_frame("No articles found", "")
+
+        articles = []
+        for item in items:
+            # Best effort: one malformed item is logged and skipped, not fatal.
+            try:
+                articles.append(self._parse_item(item))
+            except (AttributeError, ValueError, TypeError) as e:
+                self.log(f"Error parsing article: {e!s}")
+
+        # Naming the columns keeps the schema stable even if every item was skipped.
+        df_articles = pd.DataFrame(articles, columns=["title", "link", "published", "summary"])
+        self.log(f"Found {len(df_articles)} articles.")
+        return DataFrame(df_articles)
+
+    def clean_html(self, html_string: str) -> str:
+        """Return the visible text of ``html_string``, with text nodes stripped and joined by single spaces."""
+        return BeautifulSoup(html_string, "html.parser").get_text(separator=" ", strip=True)
+
+    def _text_input(self, name: str) -> str:
+        """Return input ``name`` as text without surrounding whitespace ("" when unset)."""
+        value = getattr(self, name, None)
+        return "" if value is None else str(value).strip()
+
+    def _build_rss_url(self) -> str:
+        """Return the feed URL for the current inputs, or "" when topic, location and query are all empty."""
+        hl = self._text_input("hl") or "en-US"
+        gl = self._text_input("gl") or "US"
+        # NOTE(review): the ``ceid`` input is declared with value="US:en", so this
+        # derived fallback is only reached when that field has been cleared.
+        ceid = self._text_input("ceid") or f"{gl}:{hl.split('-')[0]}"
+        # Escape the locale values so characters such as "&" or "#" cannot alter
+        # the query string; ":" stays literal so the usual "US:en" form is unchanged.
+        locale = f"hl={quote_plus(hl)}&gl={quote_plus(gl)}&ceid={quote_plus(ceid, safe=':')}"
+        root = "https://news.google.com/rss"
+
+        topic = self._text_input("topic")
+        if topic:
+            # Path segments are percent-encoded with quote(); quote_plus() would turn
+            # spaces into "+", which is a literal plus sign inside a URL path.
+            return f"{root}/headlines/section/topic/{quote(topic.upper(), safe='')}?{locale}"
+
+        location = self._text_input("location")
+        if location:
+            return f"{root}/headlines/section/geo/{quote(location, safe='')}?{locale}"
+
+        query = self._text_input("query")
+        if query:
+            return f"{root}/search?q={quote_plus(query)}&{locale}"
+
+        return ""
+
+    def _parse_item(self, item) -> dict:
+        """Convert one RSS ``<item>`` element into an article row; missing children become ""."""
+        return {
+            "title": self.clean_html(item.title.text if item.title else ""),
+            "link": item.link.text if item.link else "",
+            "published": item.pubDate.text if item.pubDate else "",
+            "summary": self.clean_html(item.description.text if item.description else ""),
+        }
+
+    def _placeholder_frame(self, title: str, summary: str) -> DataFrame:
+        """Build the single-row frame returned in place of articles for errors and empty feeds."""
+        return DataFrame(pd.DataFrame([{"title": title, "link": "", "published": "", "summary": summary}]))
--- a/src/backend/tests/unit/components/data/test_news_search.py
+++ b/src/backend/tests/unit/components/data/test_news_search.py
@@ -0,0 +1,88 @@
+from unittest.mock import Mock, patch
+
+import pytest
+import requests
+from langflow.components.data.news_search import NewsSearchComponent
+from langflow.schema import DataFrame
+
+from tests.base import ComponentTestBaseWithoutClient
+
+
+class TestNewsSearchComponent(ComponentTestBaseWithoutClient):
+    """Unit tests for ``NewsSearchComponent.search_news`` with ``requests.get`` patched out."""
+
+    @pytest.fixture
+    def component_class(self):
+        """Return the component class exercised by this suite."""
+        return NewsSearchComponent
+
+    @pytest.fixture
+    def default_kwargs(self):
+        """Return constructor arguments for a plain keyword search."""
+        return {"query": "OpenAI"}
+
+    @pytest.fixture
+    def file_names_mapping(self):
+        """Return an empty file-name mapping."""
+        return []
+
+    @staticmethod
+    def _search_with_feed(feed_xml):
+        """Run a keyword search while ``requests.get`` returns ``feed_xml`` as the response body."""
+        fake_response = Mock()
+        fake_response.content = feed_xml.encode("utf-8")
+        fake_response.raise_for_status = Mock()
+        with patch("requests.get", return_value=fake_response):
+            return NewsSearchComponent(query="OpenAI").search_news()
+
+    def test_successful_news_search(self):
+        """A feed with two items yields two rows with the expected columns and titles."""
+        # Mock Google News RSS feed content
+        feed_xml = """
+        <?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0">
+            <channel>
+                <item>
+                    <title>Test News 1</title>
+                    <link>https://example.com/1</link>
+                    <pubDate>2024-03-20</pubDate>
+                    <description>Summary 1</description>
+                </item>
+                <item>
+                    <title>Test News 2</title>
+                    <link>https://example.com/2</link>
+                    <pubDate>2024-03-21</pubDate>
+                    <description>Summary 2</description>
+                </item>
+            </channel>
+        </rss>
+        """
+        articles = self._search_with_feed(feed_xml)
+
+        assert isinstance(articles, DataFrame)
+        assert len(articles) == 2
+        assert list(articles.columns) == ["title", "link", "published", "summary"]
+        first_row, second_row = articles.iloc[0], articles.iloc[1]
+        assert first_row["title"] == "Test News 1"
+        assert second_row["title"] == "Test News 2"
+
+    def test_news_search_error(self):
+        """A failing request yields a single "Error" row carrying the exception text."""
+        failure = requests.RequestException("Network error")
+        with patch("requests.get", side_effect=failure):
+            outcome = NewsSearchComponent(query="OpenAI").search_news()
+
+        assert isinstance(outcome, DataFrame)
+        assert len(outcome) == 1
+        only_row = outcome.iloc[0]
+        assert only_row["title"] == "Error"
+        assert "Network error" in only_row["summary"]
+
+    def test_empty_news_results(self):
+        """A feed without items yields a single "No articles found" row."""
+        # Mock empty RSS feed
+        feed_xml = """
+        <?xml version="1.0" encoding="UTF-8"?>
+        <rss version="2.0">
+            <channel>
+            </channel>
+        </rss>
+        """
+        outcome = self._search_with_feed(feed_xml)
+
+        assert isinstance(outcome, DataFrame)
+        assert len(outcome) == 1
+        assert outcome.iloc[0]["title"] == "No articles found"