openreview_scrapper.py
import argparse
from pathlib import Path
from random import uniform
from time import sleep
from typing import Any

import pandas as pd
from openreview import Client
from openreview.api import OpenReviewClient
from tqdm import tqdm

# got from `client.get_group(id='venues').members`
VENUES_NAMES = {
    'aaai': 'AAAI.org',
    'acl': 'aclweb.org/ACL',
    'coling': 'COLING.org',
    # 'cvpr': 'thecvf.com/CVPR',
    'eacl': 'eacl.org/EACL',
    'eccv': 'thecvf.com/ECCV',
    'emnlp': 'EMNLP',
    # 'iccv': 'thecvf.com/ICCV',
    'iclr': 'ICLR.cc',
    'icml': 'ICML.cc',
    'ijcai': 'ijcai.org/IJCAI',
    'ijcnlp': 'aclweb.org/AACL-IJCNLP',
    'kdd': 'KDD.org',
    'naacl': 'aclweb.org/NAACL',
    'neurips': 'NeurIPS.cc',
    'sigchi': 'acm.org/CHI',
    'sigdial': 'SIGDIAL.org',
    # 'wacv': 'thecvf.com/WACV',
}
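# Venue group ids follow the pattern '<prefix>/<year>/Conference' (or '<prefix>/<year>/Workshop/...'),
# e.g. 'ICLR.cc/2024/Conference'; the substring checks in _download_conference_info rely on this layout.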
DECISION_KEYS = (
    ('Decision', 'decision'),
    ('Meta_Review', 'recommendation'),
)
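# Each pair is (invitation suffix, content field holding the decision): dedicated
# 'Decision' notes carry a 'decision' field, while some venues publish the decision
# in the meta review under 'recommendation' (inferred from how the keys are used below).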


def _save_and_download_papers(
        papers_infos: list[dict[str, Any]],
        conference: str,
        year: str,
        out_dir: str = './',
        get_pdfs: bool = False,
        client: None | Client | OpenReviewClient = None,
) -> None:
    # save papers to csv files, and download pdfs if requested
    paper_info_df = pd.DataFrame(columns=['title', 'abstract_url', 'pdf_url', 'source_url'])
    abstracts_df = pd.DataFrame(columns=['title', 'abstract'])
    authors_df = pd.DataFrame(columns=['title', 'authors'])
    for paper_info in papers_infos:
        abstract = paper_info['abstract']
        authors = paper_info['authors']
        paper_id = paper_info['paper_id']
        title = paper_info['title']
        # only the paper id is stored in the url columns; the full urls are
        # https://openreview.net/forum?id=<id> and https://openreview.net/pdf?id=<id>
        paper_info_df = pd.concat([paper_info_df,
                                   pd.Series({'title': title,
                                              'abstract_url': f'{paper_id}',
                                              'pdf_url': f'{paper_id}',
                                              'source_url': 0}).to_frame().T],
                                  ignore_index=True)
        abstracts_df = pd.concat([abstracts_df,
                                  pd.Series({'title': title,
                                             'abstract': repr(abstract)}).to_frame().T],
                                 ignore_index=True)
        authors_df = pd.concat([authors_df,
                                pd.Series({'title': title,
                                           'authors': authors}).to_frame().T],
                               ignore_index=True)
    print('\tWriting tables to files')
    save_dir = Path(out_dir) / f'{conference}' / f'{year}'
    if not save_dir.exists():
        print(f'\tCreating folder {save_dir}')
        save_dir.mkdir(parents=True)
    # if there are papers already, append to them
    if (save_dir / 'paper_info.csv').exists():
        paper_info_df = pd.concat([pd.read_csv(save_dir / 'paper_info.csv', sep=';'), paper_info_df], ignore_index=True)
        abstracts_df = pd.concat([pd.read_csv(save_dir / 'abstracts.csv', sep='|'), abstracts_df], ignore_index=True)
        authors_df = pd.concat([pd.read_csv(save_dir / 'authors.csv', sep=';'), authors_df], ignore_index=True)
    # remove duplicates
    previous_len = len(paper_info_df)
    paper_info_df.drop_duplicates(subset=['title'], inplace=True)
    abstracts_df.drop_duplicates(subset=['title'], inplace=True)
    authors_df.drop_duplicates(subset=['title'], inplace=True)
    if len(paper_info_df) != previous_len:
        print(f'\tFound {previous_len - len(paper_info_df)} duplicates')
    if len(paper_info_df) != len(abstracts_df) or len(paper_info_df) != len(authors_df):
        print(f'\tError: different number of papers in tables: {len(paper_info_df)}, {len(abstracts_df)}, {len(authors_df)}')
        print(f'\tNo values were written for {conference} {year}')
        return
    paper_info_df.to_csv(save_dir / 'paper_info.csv', sep=';', index=False)
    abstracts_df.to_csv(save_dir / 'abstracts.csv', sep='|', index=False)
    authors_df.to_csv(save_dir / 'authors.csv', sep=';', index=False)
    # if requested, download pdfs to a subdirectory
    if get_pdfs:
        if client is None:
            print('Cannot download pdfs without a client')
            return
        pdf_out_dir = save_dir / 'papers'
        if not pdf_out_dir.exists():
            print(f'\tCreating folder {pdf_out_dir}')
            pdf_out_dir.mkdir(parents=True)
        print('Downloading pdf files')
        with tqdm(papers_infos, unit='pdf') as pbar:
            for paper_info in pbar:
                paper_id = paper_info['paper_id']
                filename = f'{paper_id}.pdf'
                pbar.set_description(filename)
                pdf_outfile = pdf_out_dir / filename
                if not pdf_outfile.exists():
                    try:
                        pdf_binary = client.get_pdf(paper_id)
                        pdf_outfile.write_bytes(pdf_binary)
                    except Exception:  # skip pdfs that fail to download instead of aborting the run
                        tqdm.write(f'Error while trying to get pdf for {paper_id}: {paper_info["title"]}\n'
                                   f'at https://openreview.net/pdf?id={paper_id}')
                    # add random sleep between api calls
                    sleep(uniform(1., 2.))


def _download_conference_info(
        client: Client | OpenReviewClient,
        conference: str,
        year: str,
        main_conference: bool = True,
) -> list[dict[str, Any]]:
    '''
    Main function for downloading conference metadata.

    Note: "forum" here means the paper id.
    '''
    venues = _get_all_venues(client)
    if main_conference:
        submissions_urls = [f'{v}/-/Submission' for v in venues if f'{VENUES_NAMES[conference.lower()]}/{year}/Conference' in v]
        blind_submissions_urls = [f'{v}/-/Blind_Submission' for v in venues if f'{VENUES_NAMES[conference.lower()]}/{year}/Conference' in v]
    else:
        submissions_urls = [f'{v}/-/Submission' for v in venues if f'{VENUES_NAMES[conference.lower()]}/{year}/Workshop' in v]
        blind_submissions_urls = [f'{v}/-/Blind_Submission' for v in venues if f'{VENUES_NAMES[conference.lower()]}/{year}/Workshop' in v]
    submissions = []
    for url, blind_url in zip(submissions_urls, blind_submissions_urls):
        # test which invitation string returns the correct submissions
        try:
            new_submissions = client.get_all_notes(invitation=url, details='directReplies')
            sleep(uniform(1., 2.))
        except Exception:
            print(f'Error while trying to get papers submissions for {conference} {year}')
            continue
        if len(new_submissions) > 0:
            submissions += new_submissions
            continue
        new_submissions = client.get_all_notes(invitation=blind_url, details='directReplies')
        if len(new_submissions) > 0:
            submissions += new_submissions
    if len(submissions) == 0:
        # print(f'\tNo submissions found for {conference} {year}')
        return []
    for review_end, decision_key in DECISION_KEYS:
        if len(submissions[0].details["directReplies"]) > 0:
            if 'invitation' in submissions[0].details["directReplies"][0]:
                # api v1: each reply carries a single 'invitation' string
                accepted_papers = {submission.forum: submission.content for submission in submissions
                                   for reply in submission.details["directReplies"]
                                   if reply["invitation"].endswith(review_end) and reply["content"][decision_key] != 'Reject'}
            elif 'invitations' in submissions[0].details["directReplies"][0]:
                # api v2: each reply carries a list of 'invitations', and content fields are wrapped in {'value': ...}
                accepted_papers = {submission.forum: submission.content for submission in submissions
                                   for reply in submission.details["directReplies"]
                                   for invitation in reply["invitations"]
                                   if invitation.endswith(review_end) and reply["content"][decision_key]["value"] != 'Reject'}
            else:
                accepted_papers = {}
        else:
            accepted_papers = {submission.forum: submission.content for submission in submissions}
        if len(accepted_papers) > 0:
            # we don't need to check the other decision key
            break
    if len(accepted_papers) == 0:
        # print(f'\tNo accepted papers found for {conference} {year}')
        return []
    print(f'\t{len(accepted_papers)} papers found for {conference} {year}')
    papers_infos = []
    for paper_id, paper_info in accepted_papers.items():
        if 'authors' not in paper_info:
            continue
        # api v2 wraps every content field in {'value': ...};
        # the abstract's type is used as a proxy for all fields
        if isinstance(paper_info['abstract'], dict):
            abstract = paper_info['abstract']['value'].strip()
        else:
            abstract = paper_info['abstract'].strip()
        abstract = ' '.join(abstract.split())
        if isinstance(paper_info['abstract'], dict):
            authors = paper_info['authors']['value']
        else:
            authors = paper_info['authors']
        authors = ', '.join(authors)
        if isinstance(paper_info['abstract'], dict):
            title = paper_info['title']['value'].strip()
        else:
            title = paper_info['title'].strip()
        papers_infos.append({
            'abstract': abstract,
            'authors': authors,
            'paper_id': f'{paper_id}',
            'title': title,
        })
    return papers_infos


def _get_all_venues(client: Client | OpenReviewClient) -> list[str]:
    return client.get_group(id='venues').members


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--conference', type=str, required=True,
                        choices=tuple(VENUES_NAMES.keys()), help='conference to scrape data from')
    parser.add_argument('-d', '--download_pdfs', action='store_true', help='if included, download pdfs')
    parser.add_argument('-l', '--log_level', type=str, default='warning',
                        choices=('debug', 'info', 'warning', 'error', 'critical', 'print'),
                        help='log level')
    parser.add_argument('-o', '--out_dir', type=Path, default=Path('data/'), help='directory where data should be saved')
    parser.add_argument('--password', default='', help='defaults to empty string (guest user)')
    parser.add_argument('--username', default='', help='defaults to empty string (guest user)')
    parser.add_argument('-w', '--include_workshops', action='store_true', help='if included, also scrape workshops')
    parser.add_argument('-y', '--year', type=str, required=True, help='year of the conference')
    args = parser.parse_args()
    year = int(args.year)
    # venues from 2023 onwards are served by the v2 api; earlier ones by the v1 api
    if year >= 2023:
        client = OpenReviewClient(baseurl='https://api2.openreview.net', username=args.username, password=args.password)
    else:
        client = Client(baseurl='https://api.openreview.net', username=args.username, password=args.password)
    print('Getting openreview metadata for main conference')
    papers_infos = _download_conference_info(client, args.conference, args.year, main_conference=True)
    if len(papers_infos) == 0:
        print(f'\tNo data found for main conference {args.conference} {args.year}')
    if args.include_workshops:
        print('Getting openreview metadata for workshops')
        # add random sleep between api calls
        sleep(uniform(1., 2.))
        workshop_papers_infos = _download_conference_info(client, args.conference, args.year, main_conference=False)
        if len(workshop_papers_infos) == 0:
            print(f'\tNo data found for workshops in conference {args.conference} {args.year}')
        else:
            papers_infos += workshop_papers_infos
    if len(papers_infos) > 0:
        _save_and_download_papers(papers_infos, args.conference, args.year, args.out_dir, args.download_pdfs, client)
    else:
        print(f'\tNo data found for conference {args.conference} {args.year} in openreview')
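
# Example invocation (a sketch; with the default out_dir, files land under data/<conference>/<year>/):
#   python openreview_scrapper.py --conference iclr --year 2024 --include_workshops --download_pdfs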