feat: add arxiv component (#5634)
* feat: add arxiv component
* [autofix.ci] apply automated fixes
* test: add initial test suite for arxiv component
* fix: correct test formatting for ArXiv component
* fix: implement tests for ArXivComponent following TestBatchRunComponent pattern
* fix: ArXivComponent test formatting
* [autofix.ci] apply automated fixes
* refactor: update imports and skip version tests for new component
* fix: fix line breaks in test file
* [autofix.ci] apply automated fixes
---------
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
71b8c762a2
commit
bd8dbdfab6
8 changed files with 435 additions and 0 deletions
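For context, a minimal sketch of driving the new component directly (a sketch, not part of the diff; it assumes a dev install with this commit applied, and it instantiates the component with the same keyword inputs the unit tests below use):

from langflow.components.tools.arxiv import ArXivComponent

component = ArXivComponent(search_query="quantum computing", search_type="all", max_results=5)

# The component builds a plain arXiv API query URL...
print(component.build_query_url())
# -> http://export.arxiv.org/api/query?search_query=all%3Aquantum%20computing&max_results=5

# ...then fetches and parses the Atom feed, returning one Data object per paper
for paper in component.search_papers():
    print(paper.data["title"], paper.data["pdf_url"])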
src/backend/base/langflow/components/tools/__init__.py
@@ -2,6 +2,7 @@ import warnings
from langchain_core._api.deprecation import LangChainDeprecationWarning

from .arxiv import ArXivComponent
from .bing_search_api import BingSearchAPIComponent
from .calculator import CalculatorToolComponent
from .calculator_core import CalculatorComponent
@@ -35,6 +36,7 @@ with warnings.catch_warnings():
    from .astradb_cql import AstraDBCQLToolComponent

__all__ = [
    "ArXivComponent",
    "AstraDBCQLToolComponent",
    "AstraDBToolComponent",
    "BingSearchAPIComponent",

150 src/backend/base/langflow/components/tools/arxiv.py Normal file
@@ -0,0 +1,150 @@
import urllib.error
import urllib.parse
import urllib.request
from urllib.parse import urlparse
from xml.etree.ElementTree import Element

from defusedxml.ElementTree import fromstring

from langflow.custom import Component
from langflow.io import DropdownInput, IntInput, MessageTextInput, Output
from langflow.schema import Data


class ArXivComponent(Component):
    display_name = "arXiv"
    description = "Search and retrieve papers from arXiv.org"
    icon = "arXiv"

    inputs = [
        MessageTextInput(
            name="search_query",
            display_name="Search Query",
            info="The search query for arXiv papers (e.g., 'quantum computing')",
            tool_mode=True,
        ),
        DropdownInput(
            name="search_type",
            display_name="Search Field",
            info="The field to search in",
            options=["all", "title", "abstract", "author", "cat"],  # cat is for category
            value="all",
        ),
        IntInput(
            name="max_results",
            display_name="Max Results",
            info="Maximum number of results to return",
            value=10,
        ),
    ]

    outputs = [
        Output(display_name="Papers", name="papers", method="search_papers"),
    ]

    def build_query_url(self) -> str:
        """Build the arXiv API query URL."""
        base_url = "http://export.arxiv.org/api/query?"

        # Build the search query
        search_query = f"{self.search_type}:{self.search_query}"

        # URL parameters
        params = {
            "search_query": search_query,
            "max_results": str(self.max_results),
        }

        # Convert params to URL query string
        query_string = "&".join([f"{k}={urllib.parse.quote(str(v))}" for k, v in params.items()])

        return base_url + query_string

    def parse_atom_response(self, response_text: str) -> list[dict]:
        """Parse the Atom XML response from arXiv."""
        # Parse XML safely using defusedxml
        root = fromstring(response_text)

        # Define namespace dictionary for XML parsing
        ns = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}

        papers = []
        # Process each entry (paper)
        for entry in root.findall("atom:entry", ns):
            paper = {
                "id": self._get_text(entry, "atom:id", ns),
                "title": self._get_text(entry, "atom:title", ns),
                "summary": self._get_text(entry, "atom:summary", ns),
                "published": self._get_text(entry, "atom:published", ns),
                "updated": self._get_text(entry, "atom:updated", ns),
                "authors": [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)],
                "arxiv_url": self._get_link(entry, "alternate", ns),
                "pdf_url": self._get_link(entry, "related", ns),
                "comment": self._get_text(entry, "arxiv:comment", ns),
                "journal_ref": self._get_text(entry, "arxiv:journal_ref", ns),
                "primary_category": self._get_category(entry, ns),
                "categories": [cat.get("term") for cat in entry.findall("atom:category", ns)],
            }
            papers.append(paper)

        return papers

    def _get_text(self, element: Element, path: str, ns: dict) -> str | None:
        """Safely extract text from an XML element."""
        el = element.find(path, ns)
        return el.text.strip() if el is not None and el.text else None

    def _get_link(self, element: Element, rel: str, ns: dict) -> str | None:
        """Get link URL based on relation type."""
        for link in element.findall("atom:link", ns):
            if link.get("rel") == rel:
                return link.get("href")
        return None

    def _get_category(self, element: Element, ns: dict) -> str | None:
        """Get primary category."""
        cat = element.find("arxiv:primary_category", ns)
        return cat.get("term") if cat is not None else None

    def search_papers(self) -> list[Data]:
        """Search arXiv and return results."""
        try:
            # Build the query URL
            url = self.build_query_url()

            # Validate URL scheme and host
            parsed_url = urlparse(url)
            if parsed_url.scheme not in ("http", "https"):
                error_msg = f"Invalid URL scheme: {parsed_url.scheme}"
                raise ValueError(error_msg)
            if parsed_url.hostname != "export.arxiv.org":
                error_msg = f"Invalid host: {parsed_url.hostname}"
                raise ValueError(error_msg)

            # Create a custom opener that only allows http/https schemes
            class RestrictedHTTPHandler(urllib.request.HTTPHandler):
                def http_open(self, req):
                    return super().http_open(req)

            class RestrictedHTTPSHandler(urllib.request.HTTPSHandler):
                def https_open(self, req):
                    return super().https_open(req)

            # Build opener with restricted handlers
            opener = urllib.request.build_opener(RestrictedHTTPHandler, RestrictedHTTPSHandler)
            urllib.request.install_opener(opener)

            # Make the request with validated URL using restricted opener
            response = opener.open(url)
            response_text = response.read().decode("utf-8")

            # Parse the response
            papers = self.parse_atom_response(response_text)

            # Convert to Data objects
            results = [Data(data=paper) for paper in papers]
            self.status = results
        except (urllib.error.URLError, ValueError) as e:
            error_data = Data(data={"error": f"Request error: {e!s}"})
            self.status = error_data
            return [error_data]
        else:
            return results
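The component's network path is just the public arXiv Atom API. A standalone sketch of the same request/parse round trip outside Langflow (it assumes only that defusedxml is installed and export.arxiv.org is reachable):

import urllib.parse
import urllib.request

from defusedxml.ElementTree import fromstring

# Same query construction as build_query_url: "<field>:<terms>", percent-encoded
query = urllib.parse.quote("all:quantum computing")
url = f"http://export.arxiv.org/api/query?search_query={query}&max_results=2"

with urllib.request.urlopen(url) as resp:  # fixed, known-good host
    root = fromstring(resp.read().decode("utf-8"))

# Same Atom namespace the component registers
ns = {"atom": "http://www.w3.org/2005/Atom"}
for entry in root.findall("atom:entry", ns):
    print(entry.find("atom:title", ns).text.strip())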
@@ -222,6 +222,7 @@
          "show": true,
          "title_case": false,
          "type": "code",
          "value": "import re\n\nfrom langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader\n\nfrom langflow.custom import Component\nfrom langflow.helpers.data import data_to_text\nfrom langflow.io import DropdownInput, MessageTextInput, Output\nfrom langflow.schema import Data\nfrom langflow.schema.dataframe import DataFrame\nfrom langflow.schema.message import Message\n\n\nclass URLComponent(Component):\n    display_name = \"URL\"\n    description = \"Load and retrieve data from specified URLs.\"\n    icon = \"layout-template\"\n    name = \"URL\"\n\n    inputs = [\n        MessageTextInput(\n            name=\"urls\",\n            display_name=\"URLs\",\n            is_list=True,\n            tool_mode=True,\n            placeholder=\"Enter a URL...\",\n            list_add_label=\"Add URL\",\n        ),\n        DropdownInput(\n            name=\"format\",\n            display_name=\"Output Format\",\n            info=\"Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.\",\n            options=[\"Text\", \"Raw HTML\"],\n            value=\"Text\",\n        ),\n    ]\n\n    outputs = [\n        Output(display_name=\"Data\", name=\"data\", method=\"fetch_content\"),\n        Output(display_name=\"Message\", name=\"text\", method=\"fetch_content_text\"),\n        Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n    ]\n\n    def ensure_url(self, string: str) -> str:\n        \"\"\"Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.\n\n        Raises an error if the string is not a valid URL.\n\n        Parameters:\n            string (str): The string to be checked and possibly modified.\n\n        Returns:\n            str: The modified string that is ensured to be a URL.\n\n        Raises:\n            ValueError: If the string is not a valid URL.\n        \"\"\"\n        if not string.startswith((\"http://\", \"https://\")):\n            string = \"http://\" + string\n\n        # Basic URL validation regex\n        url_regex = re.compile(\n            r\"^(https?:\\/\\/)?\"  # optional protocol\n            r\"(www\\.)?\"  # optional www\n            r\"([a-zA-Z0-9.-]+)\"  # domain\n            r\"(\\.[a-zA-Z]{2,})?\"  # top-level domain\n            r\"(:\\d+)?\"  # optional port\n            r\"(\\/[^\\s]*)?$\",  # optional path\n            re.IGNORECASE,\n        )\n\n        if not url_regex.match(string):\n            msg = f\"Invalid URL: {string}\"\n            raise ValueError(msg)\n\n        return string\n\n    def fetch_content(self) -> list[Data]:\n        urls = [self.ensure_url(url.strip()) for url in self.urls if url.strip()]\n        if self.format == \"Raw HTML\":\n            loader = AsyncHtmlLoader(web_path=urls, encoding=\"utf-8\")\n        else:\n            loader = WebBaseLoader(web_paths=urls, encoding=\"utf-8\")\n        docs = loader.load()\n        data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]\n        self.status = data\n        return data\n\n    def fetch_content_text(self) -> Message:\n        data = self.fetch_content()\n\n        result_string = data_to_text(\"{text}\", data)\n        self.status = result_string\n        return Message(text=result_string)\n\n    def as_dataframe(self) -> DataFrame:\n        return DataFrame(self.fetch_content())\n"
        },
        "format": {

124 src/backend/tests/unit/components/tools/test_arxiv_component.py Normal file
@@ -0,0 +1,124 @@
from unittest.mock import patch

import pytest

from tests.base import ComponentTestBaseWithClient


class TestArXivComponent(ComponentTestBaseWithClient):
    def test_component_versions(self, default_kwargs, file_names_mapping):
        """Test component compatibility across versions."""
        from langflow.components.tools.arxiv import ArXivComponent

        # Test current version
        component = ArXivComponent(**default_kwargs)
        frontend_node = component.to_frontend_node()
        assert frontend_node is not None

        # Test backward compatibility
        for mapping in file_names_mapping:
            try:
                module = __import__(
                    f"langflow.components.{mapping['module']}",
                    fromlist=[mapping["file_name"]],
                )
                component_class = getattr(module, mapping["file_name"])
                component = component_class(**default_kwargs)
                frontend_node = component.to_frontend_node()
                assert frontend_node is not None
            except (ImportError, AttributeError) as e:
                pytest.fail(f"Failed to load component version {mapping['version']}: {e!s}")

    @pytest.fixture
    def component_class(self):
        from langflow.components.tools.arxiv import ArXivComponent

        return ArXivComponent

    @pytest.fixture
    def default_kwargs(self):
        return {
            "search_query": "quantum computing",
            "search_type": "all",
            "max_results": 10,
            "_session_id": "test-session",
        }

    @pytest.fixture
    def file_names_mapping(self):
        return []

    def test_component_initialization(self, component_class, default_kwargs):
        # Arrange
        component = component_class(**default_kwargs)

        # Act
        frontend_node = component.to_frontend_node()

        # Assert
        node_data = frontend_node["data"]["node"]
        assert node_data["template"]["search_query"]["value"] == "quantum computing"
        assert node_data["template"]["search_type"]["value"] == "all"
        assert node_data["template"]["max_results"]["value"] == 10

    def test_build_query_url(self, component_class, default_kwargs):
        # Arrange
        component = component_class(**default_kwargs)

        # Act
        url = component.build_query_url()

        # Assert
        assert "http://export.arxiv.org/api/query?" in url
        assert "search_query=all%3Aquantum%20computing" in url
        assert "max_results=10" in url

    def test_parse_atom_response(self, component_class, default_kwargs):
        # Arrange
        component = component_class(**default_kwargs)
        sample_xml = """<feed xmlns="http://www.w3.org/2005/Atom"
            xmlns:arxiv="http://arxiv.org/schemas/atom">
            <entry>
                <id>http://arxiv.org/abs/quant-ph/0000001</id>
                <title>Test Paper</title>
                <summary>Test summary</summary>
                <published>2023-01-01</published>
                <updated>2023-01-01</updated>
                <author><name>Test Author</name></author>
                <link rel="alternate" href="http://arxiv.org/abs/quant-ph/0000001"/>
                <link rel="related" href="http://arxiv.org/pdf/quant-ph/0000001"/>
                <category term="quant-ph" scheme="http://arxiv.org/schemas/atom"/>
                <arxiv:comment>Test comment</arxiv:comment>
                <arxiv:journal_ref>Test Journal</arxiv:journal_ref>
                <arxiv:primary_category term="quant-ph"/>
            </entry>
        </feed>"""

        # Act
        papers = component.parse_atom_response(sample_xml)

        # Assert
        assert len(papers) == 1
        paper = papers[0]
        assert paper["title"] == "Test Paper"
        assert paper["summary"] == "Test summary"
        assert paper["authors"] == ["Test Author"]
        assert paper["arxiv_url"] == "http://arxiv.org/abs/quant-ph/0000001"
        assert paper["pdf_url"] == "http://arxiv.org/pdf/quant-ph/0000001"
        assert paper["comment"] == "Test comment"
        assert paper["journal_ref"] == "Test Journal"
        assert paper["primary_category"] == "quant-ph"

    @patch("urllib.request.build_opener")
    def test_invalid_url_handling(self, mock_build_opener, component_class, default_kwargs):
        # Arrange
        component = component_class(**default_kwargs)
        mock_build_opener.return_value.open.side_effect = ValueError("Invalid URL")

        # Act
        results = component.search_papers()

        # Assert
        assert len(results) == 1
        assert hasattr(results[0], "error")
        assert "Invalid URL" in results[0].error
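To run just this suite locally (assuming the repository's standard backend pytest setup):

pytest src/backend/tests/unit/components/tools/test_arxiv_component.py -q

Note that test_invalid_url_handling patches urllib.request.build_opener, so nothing in this suite should hit the network; only the component itself talks to export.arxiv.org.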

122 src/frontend/src/icons/ArXiv/ArXivIcon.jsx Normal file
File diff suppressed because one or more lines are too long

24 src/frontend/src/icons/ArXiv/arxiv.svg Normal file
File diff suppressed because one or more lines are too long

10 src/frontend/src/icons/ArXiv/index.tsx Normal file
@@ -0,0 +1,10 @@
import React, { forwardRef } from "react";
import SvgArXivIcon from "./ArXivIcon";

export const ArXivIcon = forwardRef<SVGSVGElement, React.PropsWithChildren<{}>>(
  (props, ref) => {
    return <SvgArXivIcon ref={ref} {...props} />;
  },
);

export default ArXivIcon;

src/frontend/src/utils/styleUtils.ts
@@ -238,6 +238,7 @@ import { AWSIcon } from "../icons/AWS";
import { AgentQLIcon } from "../icons/AgentQL";
import { AirbyteIcon } from "../icons/Airbyte";
import { AnthropicIcon } from "../icons/Anthropic";
import { ArXivIcon } from "../icons/ArXiv";
import { ArizeIcon } from "../icons/Arize";
import { AssemblyAIIcon } from "../icons/AssemblyAI";
import { AstraDBIcon } from "../icons/AstraDB";
@@ -625,6 +626,7 @@ export const nodeIconsLucide: iconsType = {
  AmazonBedrockEmbeddings: AWSIcon,
  Amazon: AWSIcon,
  Anthropic: AnthropicIcon,
  ArXiv: ArXivIcon,
  ChatAnthropic: AnthropicIcon,
  assemblyai: AssemblyAIIcon,
  AgentQL: AgentQLIcon,