Add arxiv_search service function (#148)
Showing 4 changed files with 324 additions and 1 deletion.
@@ -29,6 +29,7 @@
    "pymongo",
    "pymysql",
    "beautifulsoup4",
    "feedparser",
]

doc_requires = [
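The new feedparser dependency is what the service module below uses to parse the Atom feed returned by the arXiv API. A minimal sketch of that parsing step, assuming a raw Atom response body in raw_xml (the field names are taken from the module below):

import feedparser

# raw_xml holds an Atom response from http://export.arxiv.org/api/query
result = feedparser.parse(raw_xml)
print(result.feed.opensearch_totalresults)  # total number of matching papers
for entry in result.entries:
    print(entry.id, entry.title)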
@@ -0,0 +1,293 @@
# -*- coding: utf-8 -*-
"""Search papers via the arXiv API. This implementation refers to the
repository https://github.com/lukasschwab/arxiv.py, which is MIT licensed.
"""
import json
import re
import time
import urllib.request
from calendar import timegm
from datetime import datetime, timezone
from typing import List, Optional, Union

try:
    import feedparser
except ImportError:
    feedparser = None
from loguru import logger

from agentscope.service.service_response import (
    ServiceResponse,
    ServiceExecStatus,
)

ARXIV_SEARCH_URL = "http://export.arxiv.org/api/query?{parameters_str}"

LOGIC_OPERATORS = ["ANDNOT", "AND", "OR"]

SYMBOLS = ["(", ")"]

QUERY_PREFIX = ["all:", "ti:", "au:", "abs:", "co:", "jr:", "cat:", "rn:"]


class _Result(dict):
    """The class for arXiv search results."""

    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

    id: str
    """A url of the form `https://arxiv.org/abs/{id}`."""

    title: str
    """The title of the result."""

    updated: str
    """When the result was last updated."""

    published: str
    """When the result was published."""

    summary: str
    """The summary of the search result."""

    authors: List[str]
    """The authors of the search result."""

    comment: Optional[str]
    """The authors' comment if present."""

    primary_category: Optional[str]
    """The result's primary arXiv category. See [arXiv: Category
    Taxonomy](https://arxiv.org/category_taxonomy)."""

    tags: List[str]
    """All of the result's tags. See [arXiv: Category
    Taxonomy](https://arxiv.org/category_taxonomy)."""

    journal_ref: Optional[str]
    """A journal reference if present."""

    doi: Optional[str]
    """A URL for the resolved DOI to an external resource if present."""

    def __init__(
        self,
        entry_id: str,
        title: str,
        updated: str,
        published: str,
        summary: str,
        authors: List[str],
        pdf_url: Optional[str] = None,
        comment: Optional[str] = None,
        primary_category: Optional[str] = None,
        tags: List[str] = None,
        journal_ref: Optional[str] = None,
        doi: Optional[str] = None,
    ) -> None:
        """The class for arXiv search results."""
        self.entry_id = entry_id
        self.title = title
        self.updated = updated
        self.published = published
        self.summary = summary
        self.authors = authors
        self.pdf_url = pdf_url
        self.comment = comment
        self.primary_category = primary_category
        self.tags = tags
        self.journal_ref = journal_ref
        self.doi = doi

    def __str__(self) -> str:
        cleaned_dict = {}
        for key in self:
            if self[key] is not None:
                cleaned_dict[key] = self[key]
        return json.dumps(cleaned_dict, ensure_ascii=False)

    def __repr__(self) -> str:
        return self.__str__()


def _parse_pdf_url(links: List) -> Union[str, None]:
    """Parse the pdf url from the links."""
    for link in links:
        if link.get("title") == "pdf":
            return link.get("href")
    return None


def _parse_timestamp(timestamp: time.struct_time) -> str:
    """Parse the timestamp to a string."""
    timestamp = datetime.fromtimestamp(timegm(timestamp), tz=timezone.utc)
    return timestamp.strftime("%Y-%m-%d %H:%M:%S")


def _clean_arxiv_search_results(result: dict) -> dict:
    """Clean the arXiv search results, and remove unnecessary information."""
    feed = result.feed

    # Basic information
    cleaned_dict = {
        "updated": _parse_timestamp(feed.updated_parsed),
        "opensearch_total_results": int(feed.opensearch_totalresults),
        "opensearch_start_index": int(feed.opensearch_startindex),
        "opensearch_itemsperpage": int(feed.opensearch_itemsperpage),
    }

    # Entries
    entries = []
    for entry in result.entries:
        title = "0"
        if hasattr(entry, "title"):
            title = entry.title
        else:
            logger.warning(
                f"Result {entry.id} is missing title attribute; "
                "defaulting to '0'",
            )

        tags = [tag.get("term") for tag in entry.tags]
        if len(tags) == 0:
            tags = None

        entry_dict = _Result(
            # Basic properties
            entry_id=entry.id,
            title=title,
            updated=_parse_timestamp(entry.updated_parsed),
            published=_parse_timestamp(entry.published_parsed),
            summary=entry.summary,
            authors=[author.name for author in entry.authors],
            # Optional properties
            pdf_url=_parse_pdf_url(entry.links),
            comment=entry.get("arxiv_comment"),
            primary_category=entry.arxiv_primary_category.get("term"),
            tags=tags,
            journal_ref=entry.get("arxiv_journal_ref"),
            doi=entry.get("arxiv_doi"),
        )

        entries.append(entry_dict)

    cleaned_dict["entries"] = entries

    return cleaned_dict


def _reformat_query(query: str) -> str:
    """Reformat the query string for arXiv search; refer to
    https://info.arxiv.org/help/api/user-manual.html."""
    delimiter_regex = (
        "("
        + "|".join(
            map(re.escape, LOGIC_OPERATORS + QUERY_PREFIX + SYMBOLS),
        )
        + ")"
    )

    parts = re.split(delimiter_regex, query)

    parts = [part.strip() for part in parts if part.strip()]

    for i, part in enumerate(parts):
        if part not in LOGIC_OPERATORS + QUERY_PREFIX + SYMBOLS:
            # URL-encode quotes and spaces, then wrap the term in
            # encoded double quotes if it is not already quoted
            part = part.replace('"', "%22").replace(" ", "+")

            if not part.startswith("%22"):
                part = f"%22{part}"
            if not part.endswith("%22"):
                part = f"{part}%22"
            parts[i] = part
        elif part in SYMBOLS:
            parts[i] = part.replace("(", "%28").replace(")", "%29")
        elif part in LOGIC_OPERATORS:
            parts[i] = f"+{part}+"

    refined_query = "".join(parts)

    return refined_query
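# Illustrative example (traced from the logic above, not from the arXiv
# docs): a query such as
#     ti:"Deep Learning" AND au:"LeCun"
# is reformatted into
#     ti:%22Deep+Learning%22+AND+au:%22LeCun%22
# before being substituted into the request URL.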

def arxiv_search(
    search_query: str,
    id_list: List[str] = None,
    start: int = 0,
    max_results: Optional[int] = None,
) -> ServiceResponse:
    """Search arXiv papers by a given query string.

    Args:
        search_query (`str`):
            The query string, supporting the prefixes "all:", "ti:", "au:",
            "abs:", "co:", "jr:", "cat:", and "rn:", and the boolean
            operators "AND", "OR" and "ANDNOT". For example, to search for
            papers with title "Deep Learning" and author "LeCun", use the
            search_query: ti:"Deep Learning" AND au:"LeCun"
        id_list (`List[str]`, defaults to `None`):
            A list of arXiv IDs to search.
        start (`int`, defaults to `0`):
            The index of the first search result to return.
        max_results (`Optional[int]`, defaults to `None`):
            The maximum number of search results to return.

    Returns:
        `ServiceResponse`: A dictionary with two variables: `status` and
        `content`. The `status` variable is from the ServiceExecStatus enum,
        and `content` is a list of search results or error information,
        which depends on the `status` variable.
    """

    if feedparser is None:
        raise ImportError(
            "The `feedparser` module is not installed. Please install it by "
            "running `pip install feedparser`.",
        )

    # Construct the request URL
    search_query = _reformat_query(search_query)

    parameters = {"search_query": search_query}

    if id_list:
        parameters["id_list"] = ",".join(id_list)

    if start > 0:
        parameters["start"] = str(start)

    if max_results:
        parameters["max_results"] = str(max_results)

    parameters_str = "&".join([f"{k}={v}" for k, v in parameters.items()])

    url = ARXIV_SEARCH_URL.format(parameters_str=parameters_str)

    try:
        logger.debug(f"Searching arXiv by url: {url}")

        with urllib.request.urlopen(url) as data:
            # Parse the results with feedparser
            feedparser_dict = feedparser.parse(data.read().decode("utf-8"))

            # Remove unnecessary information
            results = _clean_arxiv_search_results(feedparser_dict)

            if data.code == 200:
                # Return the search results
                return ServiceResponse(
                    status=ServiceExecStatus.SUCCESS,
                    content=results,
                )
            else:
                return ServiceResponse(
                    status=ServiceExecStatus.ERROR,
                    content=f"Error: {data.code}, {data}",
                )
    except Exception as e:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=f"Error: {e}",
        )
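For reference, a minimal usage sketch of the new service function. The query is the example from its docstring; the top-level import path agentscope.service is an assumption about how the function is exported, while ServiceExecStatus comes from the path used in the module itself.

# Hypothetical usage sketch; the arxiv_search import path is assumed.
from agentscope.service import arxiv_search
from agentscope.service.service_response import ServiceExecStatus

response = arxiv_search(
    search_query='ti:"Deep Learning" AND au:"LeCun"',
    max_results=5,
)
if response.status == ServiceExecStatus.SUCCESS:
    # response.content holds the cleaned feed; "entries" is the paper list
    for paper in response.content["entries"]:
        print(paper.title, paper.pdf_url)
else:
    print(response.content)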