feat: add arxiv component (#5634)

* feat: add arxiv component

* [autofix.ci] apply automated fixes

* test: add initial test suite for arxiv component

* fix: correct test formatting for ArXiv component

* fix: implement tests for ArXivComponent following TestBatchRunComponent pattern

* fix: ArXivComponent test formatting

* [autofix.ci] apply automated fixes

* refactor: update imports and skip version tests for new component

* fix: fix line breaks in test file

* [autofix.ci] apply automated fixes

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
Raphael Valdetaro 2025-01-21 17:20:26 -03:00 committed by GitHub
commit bd8dbdfab6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 435 additions and 0 deletions

View file

@ -2,6 +2,7 @@ import warnings
from langchain_core._api.deprecation import LangChainDeprecationWarning
from .arxiv import ArXivComponent
from .bing_search_api import BingSearchAPIComponent
from .calculator import CalculatorToolComponent
from .calculator_core import CalculatorComponent
@ -35,6 +36,7 @@ with warnings.catch_warnings():
from .astradb_cql import AstraDBCQLToolComponent
__all__ = [
"ArXivComponent",
"AstraDBCQLToolComponent",
"AstraDBToolComponent",
"BingSearchAPIComponent",

View file

@ -0,0 +1,150 @@
import urllib.error
import urllib.request
from urllib.parse import quote, urlparse
from xml.etree.ElementTree import Element

from defusedxml.ElementTree import fromstring

from langflow.custom import Component
from langflow.io import DropdownInput, IntInput, MessageTextInput, Output
from langflow.schema import Data
class ArXivComponent(Component):
    """Search and retrieve papers from arXiv.org via its public Atom API."""

    display_name = "arXiv"
    description = "Search and retrieve papers from arXiv.org"
    icon = "arXiv"

    inputs = [
        MessageTextInput(
            name="search_query",
            display_name="Search Query",
            info="The search query for arXiv papers (e.g., 'quantum computing')",
            tool_mode=True,
        ),
        DropdownInput(
            name="search_type",
            display_name="Search Field",
            info="The field to search in",
            options=["all", "title", "abstract", "author", "cat"],  # cat is for category
            value="all",
        ),
        IntInput(
            name="max_results",
            display_name="Max Results",
            info="Maximum number of results to return",
            value=10,
        ),
    ]

    outputs = [
        Output(display_name="Papers", name="papers", method="search_papers"),
    ]

    def build_query_url(self) -> str:
        """Build the arXiv API query URL from the component inputs."""
        base_url = "http://export.arxiv.org/api/query?"
        # Scope the query to the selected field, e.g. "all:quantum computing".
        search_query = f"{self.search_type}:{self.search_query}"
        params = {
            "search_query": search_query,
            "max_results": str(self.max_results),
        }
        # Percent-encode each value so spaces/colons survive the URL round-trip.
        query_string = "&".join(f"{k}={quote(str(v))}" for k, v in params.items())
        return base_url + query_string

    def parse_atom_response(self, response_text: str) -> list[dict]:
        """Parse the Atom XML response from arXiv into a list of paper dicts.

        Missing optional fields (comment, journal_ref, ...) are returned as None.
        """
        # defusedxml guards against entity-expansion and similar XML attacks.
        root = fromstring(response_text)
        # Namespaces used by the arXiv Atom feed.
        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
        papers = []
        for entry in root.findall("atom:entry", ns):
            # Extract author names defensively: skip <author> entries that
            # lack a <name> child instead of raising AttributeError.
            authors = [
                name
                for name in (
                    self._get_text(author, "atom:name", ns) for author in entry.findall("atom:author", ns)
                )
                if name is not None
            ]
            paper = {
                "id": self._get_text(entry, "atom:id", ns),
                "title": self._get_text(entry, "atom:title", ns),
                "summary": self._get_text(entry, "atom:summary", ns),
                "published": self._get_text(entry, "atom:published", ns),
                "updated": self._get_text(entry, "atom:updated", ns),
                "authors": authors,
                "arxiv_url": self._get_link(entry, "alternate", ns),
                "pdf_url": self._get_link(entry, "related", ns),
                "comment": self._get_text(entry, "arxiv:comment", ns),
                "journal_ref": self._get_text(entry, "arxiv:journal_ref", ns),
                "primary_category": self._get_category(entry, ns),
                "categories": [cat.get("term") for cat in entry.findall("atom:category", ns)],
            }
            papers.append(paper)
        return papers

    def _get_text(self, element: Element, path: str, ns: dict) -> str | None:
        """Safely extract stripped text from a child element, or None if absent/empty."""
        el = element.find(path, ns)
        return el.text.strip() if el is not None and el.text else None

    def _get_link(self, element: Element, rel: str, ns: dict) -> str | None:
        """Return the href of the first <link> whose rel matches, or None."""
        for link in element.findall("atom:link", ns):
            if link.get("rel") == rel:
                return link.get("href")
        return None

    def _get_category(self, element: Element, ns: dict) -> str | None:
        """Return the primary category term, or None if not present."""
        cat = element.find("arxiv:primary_category", ns)
        return cat.get("term") if cat is not None else None

    def search_papers(self) -> list[Data]:
        """Query arXiv and return the matching papers as Data objects.

        On failure (network error or URL validation failure) returns a
        single-element list containing an error Data payload.
        """
        try:
            url = self.build_query_url()

            # Validate scheme and host before any network access — this check
            # is the actual guard against requests leaving arXiv.
            parsed_url = urlparse(url)
            if parsed_url.scheme not in ("http", "https"):
                error_msg = f"Invalid URL scheme: {parsed_url.scheme}"
                raise ValueError(error_msg)
            if parsed_url.hostname != "export.arxiv.org":
                error_msg = f"Invalid host: {parsed_url.hostname}"
                raise ValueError(error_msg)

            # Handlers limited to plain HTTP/HTTPS opens.
            class RestrictedHTTPHandler(urllib.request.HTTPHandler):
                def http_open(self, req):
                    return super().http_open(req)

            class RestrictedHTTPSHandler(urllib.request.HTTPSHandler):
                def https_open(self, req):
                    return super().https_open(req)

            # Use the opener locally; do NOT call install_opener, which would
            # leak this opener into every other urllib user in the process.
            opener = urllib.request.build_opener(RestrictedHTTPHandler, RestrictedHTTPSHandler)

            # Context manager closes the connection even if decoding fails.
            with opener.open(url) as response:
                response_text = response.read().decode("utf-8")

            papers = self.parse_atom_response(response_text)
            results = [Data(data=paper) for paper in papers]
            self.status = results
        except (urllib.error.URLError, ValueError) as e:
            error_data = Data(data={"error": f"Request error: {e!s}"})
            self.status = error_data
            return [error_data]
        else:
            return results

View file

@ -222,6 +222,7 @@
"show": true,
"title_case": false,
"type": "code",
"value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.helpers.data import data_to_text\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n display_name = \"URL\"\n description = \"Load and retrive data from specified URLs.\"\n icon = \"layout-template\"\n name = \"URL\"\n\n inputs = [\n MessageTextInput(\n name=\"urls\",\n display_name=\"URLs\",\n is_list=True,\n tool_mode=True,\n placeholder=\"Enter a URL...\",\n list_add_label=\"Add URL\",\n ),\n DropdownInput(\n name=\"format\",\n display_name=\"Output Format\",\n info=\"Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n options=[\"Text\", \"Raw HTML\"],\n value=\"Text\",\n ),\n ]\n\n outputs = [\n Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n Output(display_name=\"Message\", name=\"text\", method=\"fetch_content_text\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n def ensure_url(self, string: str) -> str:\n \"\"\"Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n\n Raises an error if the string is not a valid URL.\n\n Parameters:\n string (str): The string to be checked and possibly modified.\n\n Returns:\n str: The modified string that is ensured to be a URL.\n\n Raises:\n ValueError: If the string is not a valid URL.\n \"\"\"\n if not string.startswith((\"http://\", \"https://\")):\n string = \"http://\" + string\n\n # Basic URL validation regex\n url_regex = re.compile(\n r\"^(https?:\\/\\/)?\" # optional protocol\n r\"(www\\.)?\" # optional www\n r\"([a-zA-Z0-9.-]+)\" # domain\n r\"(\\.[a-zA-Z]{2,})?\" # top-level domain\n r\"(:\\d+)?\" # optional 
port\n r\"(\\/[^\\s]*)?$\", # optional path\n re.IGNORECASE,\n )\n\n if not url_regex.match(string):\n msg = f\"Invalid URL: {string}\"\n raise ValueError(msg)\n\n return string\n\n def fetch_content(self) -> list[Data]:\n urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n if self.format == \"Raw HTML\":\n loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n else:\n loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n docs = loader.load()\n data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n self.status = data\n return data\n\n def fetch_content_text(self) -> Message:\n data = self.fetch_content()\n\n result_string = data_to_text(\"{text}\", data)\n self.status = result_string\n return Message(text=result_string)\n\n def as_dataframe(self) -> DataFrame:\n return DataFrame(self.fetch_content())\n"
},
"format": {

View file

@ -0,0 +1,124 @@
from unittest.mock import patch
import pytest
from tests.base import ComponentTestBaseWithClient
class TestArXivComponent(ComponentTestBaseWithClient):
    """Tests for the ArXiv search component."""

    def test_component_versions(self, default_kwargs, file_names_mapping):
        """Test component compatibility across versions."""
        from langflow.components.tools.arxiv import ArXivComponent

        # Test current version
        component = ArXivComponent(**default_kwargs)
        frontend_node = component.to_frontend_node()
        assert frontend_node is not None

        # Test backward compatibility against each mapped historical version.
        for mapping in file_names_mapping:
            try:
                module = __import__(
                    f"langflow.components.{mapping['module']}",
                    fromlist=[mapping["file_name"]],
                )
                component_class = getattr(module, mapping["file_name"])
                component = component_class(**default_kwargs)
                frontend_node = component.to_frontend_node()
                assert frontend_node is not None
            except (ImportError, AttributeError) as e:
                pytest.fail(f"Failed to load component version {mapping['version']}: {e!s}")

    @pytest.fixture
    def component_class(self):
        from langflow.components.tools.arxiv import ArXivComponent

        return ArXivComponent

    @pytest.fixture
    def default_kwargs(self):
        return {
            "search_query": "quantum computing",
            "search_type": "all",
            "max_results": 10,
            "_session_id": "test-session",
        }

    @pytest.fixture
    def file_names_mapping(self):
        # No historical versions yet for this new component.
        return []

    def test_component_initialization(self, component_class, default_kwargs):
        # Arrange
        component = component_class(**default_kwargs)

        # Act
        frontend_node = component.to_frontend_node()

        # Assert: inputs are reflected in the frontend template values.
        node_data = frontend_node["data"]["node"]
        assert node_data["template"]["search_query"]["value"] == "quantum computing"
        assert node_data["template"]["search_type"]["value"] == "all"
        assert node_data["template"]["max_results"]["value"] == 10

    def test_build_query_url(self, component_class, default_kwargs):
        # Arrange
        component = component_class(**default_kwargs)

        # Act
        url = component.build_query_url()

        # Assert: query string is correctly percent-encoded.
        assert "http://export.arxiv.org/api/query?" in url
        assert "search_query=all%3Aquantum%20computing" in url
        assert "max_results=10" in url

    def test_parse_atom_response(self, component_class, default_kwargs):
        # Arrange
        component = component_class(**default_kwargs)
        # Minimal well-formed Atom feed covering every field the parser reads.
        # (Removed a leftover pair of no-op .replace("<", "<") calls that did
        # nothing to the literal.)
        sample_xml = """<feed xmlns="http://www.w3.org/2005/Atom"
            xmlns:arxiv="http://arxiv.org/schemas/atom">
            <entry>
                <id>http://arxiv.org/abs/quant-ph/0000001</id>
                <title>Test Paper</title>
                <summary>Test summary</summary>
                <published>2023-01-01</published>
                <updated>2023-01-01</updated>
                <author><name>Test Author</name></author>
                <link rel="alternate" href="http://arxiv.org/abs/quant-ph/0000001"/>
                <link rel="related" href="http://arxiv.org/pdf/quant-ph/0000001"/>
                <category term="quant-ph" scheme="http://arxiv.org/schemas/atom"/>
                <arxiv:comment>Test comment</arxiv:comment>
                <arxiv:journal_ref>Test Journal</arxiv:journal_ref>
                <arxiv:primary_category term="quant-ph"/>
            </entry>
        </feed>"""

        # Act
        papers = component.parse_atom_response(sample_xml)

        # Assert
        assert len(papers) == 1
        paper = papers[0]
        assert paper["title"] == "Test Paper"
        assert paper["summary"] == "Test summary"
        assert paper["authors"] == ["Test Author"]
        assert paper["arxiv_url"] == "http://arxiv.org/abs/quant-ph/0000001"
        assert paper["pdf_url"] == "http://arxiv.org/pdf/quant-ph/0000001"
        assert paper["comment"] == "Test comment"
        assert paper["journal_ref"] == "Test Journal"
        assert paper["primary_category"] == "quant-ph"

    @patch("urllib.request.build_opener")
    def test_invalid_url_handling(self, mock_build_opener, component_class, default_kwargs):
        # Arrange: make the opener fail so the component's error path runs.
        component = component_class(**default_kwargs)
        mock_build_opener.return_value.open.side_effect = ValueError("Invalid URL")

        # Act
        results = component.search_papers()

        # Assert: a single error Data object is returned instead of raising.
        assert len(results) == 1
        assert hasattr(results[0], "error")
        assert "Invalid URL" in results[0].error

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 17 KiB

View file

@ -0,0 +1,10 @@
import React, { forwardRef } from "react";
import SvgArXivIcon from "./ArXivIcon";
// Thin wrapper that forwards the incoming ref to the underlying SVG element,
// so parent components can target the icon's DOM node directly.
export const ArXivIcon = forwardRef<SVGSVGElement, React.PropsWithChildren<{}>>(
  (props, ref) => <SvgArXivIcon ref={ref} {...props} />,
);

export default ArXivIcon;

View file

@ -238,6 +238,7 @@ import { AWSIcon } from "../icons/AWS";
import { AgentQLIcon } from "../icons/AgentQL";
import { AirbyteIcon } from "../icons/Airbyte";
import { AnthropicIcon } from "../icons/Anthropic";
import { ArXivIcon } from "../icons/ArXiv";
import { ArizeIcon } from "../icons/Arize";
import { AssemblyAIIcon } from "../icons/AssemblyAI";
import { AstraDBIcon } from "../icons/AstraDB";
@ -625,6 +626,7 @@ export const nodeIconsLucide: iconsType = {
AmazonBedrockEmbeddings: AWSIcon,
Amazon: AWSIcon,
Anthropic: AnthropicIcon,
ArXiv: ArXivIcon,
ChatAnthropic: AnthropicIcon,
assemblyai: AssemblyAIIcon,
AgentQL: AgentQLIcon,