-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_arxiv.py
118 lines (96 loc) · 4.18 KB
/
retrieve_arxiv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import requests
import json
import xml.etree.ElementTree as ET
import time
import logging
import random
from tenacity import retry, stop_after_attempt, wait_random_exponential
# arXiv subject categories used to build the search query:
# cs.AI: Artificial Intelligence
# cs.CL: Computation and Language
# cs.CV: Computer Vision
# cs.LG: Machine Learning
# cs.MA: Multiagent Systems
# cs.NE: Neural and Evolutionary Computing
# CATEGORY = ["cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.MA", "cs.NE"]
CATEGORY = ["cs.CL"]
BASE_URL = "https://export.arxiv.org/api/query?"
MAX_RESULTS_PER_BATCH = 200  # page size for each API request
TOTAL_RESULTS_TO_RETRIEVE = 25000  # stop once this many papers are collected
# XML namespaces of the Atom feed returned by the arXiv API,
# passed to ElementTree find/findall calls in parse_xml().
NS = {'atom': 'http://www.w3.org/2005/Atom',
      'arxiv': 'http://arxiv.org/schemas/atom'}
# JSON_FILE = "data/arxiv.cs.AI.CL.CV.LG.MA.NE.mini.json"
# PARQUET_FILE = "data/arxiv.cs.AI.CL.CV.LG.MA.NE.mini.parquet"
JSON_FILE = "data/2024/arxiv.cs.CL.25k.desc.json"
# NOTE(review): PARQUET_FILE is defined but not written anywhere in this
# script — presumably consumed/produced by a sibling script; confirm.
PARQUET_FILE = "data/2024/arxiv.cs.CL.25k.desc.parquet"
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s [%(levelname)s] %(message)s")
@retry(wait=wait_random_exponential(min=5, max=15), stop=stop_after_attempt(3))
def retrieve_batch_metadata(start, max_results):
    """
    Retrieve one batch of paper metadata from the arXiv API.

    Parameters
    ----------
    start : int
        Zero-based offset of the first result to fetch.
    max_results : int
        Number of results requested for this batch.

    Returns
    -------
    str
        Raw Atom XML response text, to be parsed by parse_xml().

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or an HTTP error status; tenacity
        retries up to 3 times with random exponential backoff.
    """
    search_query = '+'.join(CATEGORY)
    # Most recent submissions first; flip sortOrder to "ascending" for oldest-first.
    query_url = (f"{BASE_URL}search_query=cat:{search_query}"
                 f"&sortBy=submittedDate&sortOrder=descending"
                 f"&start={start}&max_results={max_results}")
    # timeout: without it a stalled connection hangs the whole crawl forever.
    response = requests.get(query_url, timeout=30)
    # Raise on HTTP errors (arXiv throttles with 503s) so the @retry decorator
    # actually retries instead of handing an HTML error page to the XML parser.
    response.raise_for_status()
    return response.text
def parse_xml(xml_data):
    """
    Parse an Atom XML response from the arXiv API into metadata records.

    Parameters
    ----------
    xml_data : str
        Raw XML text as returned by the arXiv query API.

    Returns
    -------
    list[dict]
        One dict per paper with keys: id, doc_url, title, publication_date,
        update_date, authors, category_primary, category_all, abstract.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If xml_data is not well-formed XML (e.g. an HTML error page).
    """
    root = ET.fromstring(xml_data)
    entries = []
    for entry in root.findall('.//atom:entry', namespaces=NS):
        categories = [category.attrib.get("term")
                      for category in entry.findall(".//atom:category", namespaces=NS)]
        authors = [author.find('.//atom:name', namespaces=NS).text
                   for author in entry.findall('.//atom:author', namespaces=NS)]
        # Some entries carry no PDF link; entry.find() then returns None and
        # the previous unconditional `.attrib` access crashed the whole batch.
        pdf_link = entry.find('.//atom:link[@title="pdf"]', namespaces=NS)
        entry_dict = {
            'id': entry.find('.//atom:id', namespaces=NS).text,
            'doc_url': pdf_link.attrib.get("href") if pdf_link is not None else None,
            # Titles may contain hard line breaks; strip them (note: this can
            # leave double spaces where the feed wrapped lines — kept as-is to
            # preserve the existing output format).
            'title': entry.find('.//atom:title', namespaces=NS).text.replace('\n', '').strip(),
            'publication_date': entry.find('.//atom:published', namespaces=NS).text,
            'update_date': entry.find('.//atom:updated', namespaces=NS).text,
            'authors': ', '.join(authors),
            'category_primary': entry.find('.//arxiv:primary_category', namespaces=NS).attrib.get("term"),
            'category_all': ', '.join(categories),
            'abstract': entry.find('.//atom:summary', namespaces=NS).text.strip(),
        }
        entries.append(entry_dict)
    return entries
def save_to_json(data, file_path):
    """
    Save a list of papers' metadata to a JSON file.

    Errors are logged rather than raised (best-effort, consistent with the
    rest of the script).

    Parameters
    ----------
    data : list[dict]
        Paper metadata records as produced by parse_xml().
    file_path : str
        Destination path; the parent directory must already exist.
    """
    try:
        with open(file_path, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)
        logging.info(f"JSON file: {file_path}")
    except (OSError, TypeError, ValueError) as e:
        # OSError: unwritable/missing path; TypeError/ValueError: data that
        # json cannot serialize. Narrowed from a blanket `except Exception`.
        logging.error(f"An error occurred: {e}")
def main():
    """
    Page through the arXiv API in batches of MAX_RESULTS_PER_BATCH until
    TOTAL_RESULTS_TO_RETRIEVE papers are collected (or the feed is
    exhausted), then save everything to JSON_FILE.
    """
    start_index = 0
    total_retrieved = 0
    all_paper_metadata = []
    while total_retrieved < TOTAL_RESULTS_TO_RETRIEVE:
        try:
            batch_metadata = retrieve_batch_metadata(
                start_index, MAX_RESULTS_PER_BATCH)
            entries = parse_xml(batch_metadata)
        except Exception as e:
            # Best effort at the top-level boundary: log, keep what we have
            # so far, and fall through to the save below.
            logging.error(f"An error occurred: {e}")
            break
        if not entries:
            # Empty batch means the feed is exhausted (or the API returned
            # nothing for this offset). Without this check the loop would
            # spin forever, since neither counter below would advance.
            logging.info("Received an empty batch; stopping.")
            break
        all_paper_metadata.extend(entries)
        batch_retrieved = len(entries)
        total_retrieved += batch_retrieved
        logging.info(
            f"Retrieved {batch_retrieved} papers. Total: {total_retrieved}")
        start_index += batch_retrieved
        # Be polite to the arXiv API between requests.
        time.sleep(random.randint(5, 25))
    # Save before announcing success so the log message is accurate.
    save_to_json(all_paper_metadata, JSON_FILE)
    logging.info(
        f"Total {total_retrieved} papers in the '{CATEGORY}' category retrieved and saved to '{JSON_FILE}'.")


if __name__ == "__main__":
    main()