# commons-statistik.py
import hashlib
from datetime import datetime, timedelta
from urllib.parse import quote

import pandas as pd
import requests
import requests_cache
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# Cache all HTTP responses for 24 hours to spare the Wikimedia APIs on reruns.
requests_cache.install_cache('http_cache', expire_after=86400)
def read_list_of_categories(filepath):
    """Read category names from a text file, skipping blank lines and # comments."""
    categories = []
    with open(filepath, "r") as file:
        for line in file:
            if line.startswith("#") or line.strip() == "":
                continue
            print(line.strip())
            categories.append(line.strip())
    return categories
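# Illustrative sketch (hypothetical contents) of commons-statistik-categories.txt:
# one Commons category name per line, without the "Category:" prefix.
#
#   # categories tracked by the project
#   Images from the Nordic Museum
#   Media contributed by the National Library of Sweden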
def get_files_in_category(category):
    """Return the titles of all files in a Commons category, following API continuation."""
    BASE_URL = "https://commons.wikimedia.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': 'Category:' + category,
        'cmlimit': 'max',
        'cmtype': 'file',
        'format': 'json'
    }
    files = []
    while True:
        response = requests.get(BASE_URL, params=params)
        data = response.json()
        files.extend([page['title'] for page in data['query']['categorymembers']])
        if 'continue' in data:
            params.update(data['continue'])
        else:
            break
    return files
def get_image_usage(file_title):
    BASE_URL = "https://commons.wikimedia.org/w/api.php"
    params = {
        'action': 'query',
        'titles': file_title,
        'prop': 'globalusage',
        'gulimit': 'max',
        'format': 'json'
    }
    response = requests.get(BASE_URL, params=params)
    data = response.json()
    page_id = list(data['query']['pages'].keys())[0]
    return [(entry['wiki'], entry['title']) for entry in data['query']['pages'][page_id]['globalusage']]
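# Shape of the return value (the pairs shown here are hypothetical):
#   [('sv.wikipedia.org', 'Stockholm'), ('www.wikidata.org', 'Q1754')]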
def construct_image_url(filename):
    """Build the upload-path fragment for a Commons file, percent-encoded as one URL segment."""
    # Remove the 'File:' prefix and replace spaces with underscores before hashing;
    # Commons derives its directory layout from the MD5 of the underscored name.
    name_without_prefix = filename.replace("File:", "").replace(" ", "_")
    md5_hash = hashlib.md5(name_without_prefix.encode('utf-8')).hexdigest()
    url_path = f"/wikipedia/commons/{md5_hash[0]}/{md5_hash[0:2]}/{quote(name_without_prefix)}"
    # The mediarequests REST API expects the whole file path as a single path
    # segment, so the slashes themselves must be percent-encoded.
    url_path = url_path.replace("/", "%2F")
    return url_path
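# Worked example of the path scheme (the hash digits here are hypothetical):
# 'File:My photo.jpg' -> 'My_photo.jpg'; if md5('My_photo.jpg') started with 'ab',
# the path would be '/wikipedia/commons/a/ab/My_photo.jpg', which is sent to the
# API as '%2Fwikipedia%2Fcommons%2Fa%2Fab%2FMy_photo.jpg'.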
def get_image_views(file_path, start="2022110600", end="2023110500"):
    """Sum media requests for a file over a date range.

    Uses https://wikitech.wikimedia.org/wiki/Analytics/AQS/Mediarequests
    """
    referer = "all-referers"  # all-referers, internal, external, unknown, or the specific wiki where the media was loaded
    agent = "user"  # all-agents, spider, user
    granularity = "monthly"  # monthly or daily
    file_path = construct_image_url(file_path)
    url = f"https://wikimedia.org/api/rest_v1/metrics/mediarequests/per-file/{referer}/{agent}/{file_path}/{granularity}/{start}/{end}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        total = 0  # renamed from 'sum' to avoid shadowing the builtin
        for item in data['items']:
            total = total + int(item['requests'])
        return total
    else:
        print(f"Failed to retrieve image views with url: {response.url}")
        print(f"Status code: {response.status_code}")
        return None
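# The resulting request URL has this shape (using the hypothetical path above):
# https://wikimedia.org/api/rest_v1/metrics/mediarequests/per-file/all-referers/user/
#   %2Fwikipedia%2Fcommons%2Fa%2Fab%2FMy_photo.jpg/monthly/2022110100/2023103100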
def get_history_and_find_image_addition(page_title, file_title, wiki, rvstart='2000-01-01T00:00:00Z'):
    """Walk a page's revision history (oldest first) and return who first added the file, and when."""
    file_name = file_title.split(":", 1)[-1]
    item_id = None
    last_continue = ""
    if wiki == "www.wikidata.org":
        return find_file_revision_on_wikidata(page_title, file_name)
    if wiki.endswith("wikipedia.org"):
        item_id = get_wikidata_qid(page_title, wiki.split(".")[0])
    BASE_URL = f"https://{wiki}/w/api.php"
    i = 0
    while True:
        params = {
            'action': 'query',
            'prop': 'revisions',
            'titles': page_title,
            'rvprop': 'ids|user|timestamp|content',
            'rvlimit': 'max',  # Fetch as many revisions as possible per request
            'rvdir': 'newer',  # Start from the oldest revisions and move towards newer ones
            'rvslots': 'main',
            'rvstart': rvstart,  # could be narrowed to revisions after 1 April 2022 (when the project started)
            'format': 'json',
        }
        i = i + 1
        response = requests.get(BASE_URL, params=params)
        data = response.json()
        page_id = next(iter(data['query']['pages']), None)
        # If the page does not exist or another error occurs
        if page_id is None:
            break
        if 'revisions' not in data['query']['pages'][page_id]:
            break
        revisions = data['query']['pages'][page_id]['revisions']
        for rev in revisions:
            current_content = rev['slots']['main'].get('*', "error")
            if current_content == "error":
                print(f"WARNING ERROR, no * in {page_title}")
            if file_name in current_content or file_name.replace(" ", "_") in current_content:
                return rev['user'], rev['timestamp'], item_id, i
        # If there is more data, continue; bail out if the continuation token stops advancing
        if 'continue' in data:
            if data['continue'] == last_continue:
                break
            last_continue = data['continue']
            params.update(data['continue'])
        else:
            break
    return None, None, item_id, i  # Image addition not found
def find_file_revision_on_wikidata(item_id, file_name, rvstart='2010-01-01T00:00:00Z'):
    """Walk a Wikidata item's revision history (oldest first) and return who first added the file, and when."""
    BASE_URL = "https://www.wikidata.org/w/api.php"
    i = 0
    last_continue = ""
    while True:
        params = {
            'action': 'query',
            'prop': 'revisions',
            'titles': item_id,  # e.g., 'Q12345'
            'rvlimit': 'max',
            'rvprop': 'ids|user|timestamp|content',
            'rvdir': 'newer',
            'rvslots': 'main',
            'rvstart': rvstart,
            'format': 'json',
            'formatversion': 2
        }
        i = i + 1
        response = requests.get(BASE_URL, params=params)
        data = response.json()
        # With formatversion=2, 'pages' is a list; a missing item is flagged on the page object
        pages = data.get('query', {}).get('pages', [])
        if not pages or pages[0].get('missing'):
            break
        # Check each revision for a mention of the file name
        for page in pages:
            if 'revisions' not in page:
                print("no revision")
                if rvstart != '2010-01-01T00:00:00Z':
                    # Retry once from the default start date
                    return find_file_revision_on_wikidata(item_id, file_name, rvstart='2010-01-01T00:00:00Z')
                break
            for revision in page['revisions']:
                content = revision['slots']['main']['content']
                # The JSON content may contain \uXXXX escapes; decode them before matching
                content = bytes(content, "utf-8").decode("unicode_escape")
                if file_name in content or file_name.replace(" ", "_") in content:
                    return revision['user'], revision['timestamp'], item_id, i
        # If there is more data, continue; bail out if the continuation token stops advancing
        if 'continue' in data:
            if data['continue'] == last_continue:
                break
            last_continue = data['continue']
            params.update(data['continue'])
        else:
            break
    return None, None, item_id, i
def get_wikidata_qid(title, lang):
    url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "pageprops",
        "ppprop": "wikibase_item"
    }
    response = requests.get(url, params=params)
    data = response.json()
    pages = data.get("query", {}).get("pages", {})
    for _, page in pages.items():
        qid = page.get("pageprops", {}).get("wikibase_item")
        if qid:
            return qid
    return None
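# Usage sketch (the QID shown is for illustration only):
#   get_wikidata_qid("Stockholm", "sv")  # -> e.g. 'Q1754'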
def get_upload_date(file_title):
    """Return the upload timestamp of a Commons file, or a message if it does not exist."""
    file_title = file_title.replace(' ', '_')
    file_title = file_title.split(':')[1]
    API_ENDPOINT = 'https://commons.wikimedia.org/w/api.php'
    params = {
        'action': 'query',               # Action is query
        'format': 'json',                # Format the output as JSON
        'prop': 'imageinfo',             # Get image information
        'titles': f'File:{file_title}',  # Specify the title of the file
        'iiprop': 'timestamp'            # Get the timestamp (upload date)
    }
    response = requests.get(API_ENDPOINT, params=params)
    response.raise_for_status()
    data = response.json()
    page_id = next(iter(data['query']['pages']))
    if 'missing' in data['query']['pages'][page_id]:
        return f"File '{file_title}' does not exist on Wikimedia Commons."
    upload_date = data['query']['pages'][page_id]['imageinfo'][0]['timestamp']
    return upload_date
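# The timestamp comes back in ISO 8601 form, e.g. '2019-07-15T10:23:45Z'
# (hypothetical value shown for illustration).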
def get_wikipedia_articles_by_qid(qids):
    """For each QID, list its Wikipedia sitelinks and their page views over the last 30 days."""
    url = "https://www.wikidata.org/w/api.php"
    articles = [["item_id", "wiki", "url", "views_30d"]]
    i = 1
    for qid in qids:
        print(f"{i}/{len(qids)} fetching {qid} from {url}")
        i = i + 1
        params = {
            "action": "wbgetentities",
            "ids": qid,
            "format": "json",
            "props": "sitelinks/urls"
        }
        response = requests.get(url, params=params)
        try:
            data = response.json()
        except ValueError:
            print(f"error getting articles for {qid} from {url}")
            #print(response.text)
            continue
        if 'entities' in data and qid in data['entities']:
            sitelinks = data['entities'][qid].get('sitelinks', {})
            for key, value in sitelinks.items():
                if 'wiki' in key:
                    language_code = key.replace('wiki', '')
                    article_url = value['url']
                    views = None
                    # Only fetch page views for plain Wikipedia sitelinks, not sister projects
                    if ("commonswiki" not in key and "voyage" not in key and "quote" not in key
                            and "be_x_old" not in key and "news" not in key):
                        article_title = article_url.split("/")[-1]
                        platform = language_code + ".wikipedia"
                        views = get_wikipedia_views(article_title, platform)
                    articles.append([qid, language_code + ".wikipedia.org", article_url, views])
    return articles
def get_wikipedia_views(article_title, platform, days=30):
    """Sum daily page views for an article over the last `days` days."""
    headers = {
        'User-Agent': 'AppName/1.0 (email)'  # Replace with your app and contact info
    }
    # Format today's date and the start date (`days` days ago)
    end_date = datetime.today()
    start_date = end_date - timedelta(days=days)
    end_date = end_date.strftime('%Y%m%d')
    start_date = start_date.strftime('%Y%m%d')
    # Wikimedia REST API endpoint for per-article page views
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{platform}/all-access/user/{article_title}/daily/{start_date}/{end_date}"
    response = requests.get(url, headers=headers)
    try:
        data = response.json()
    except ValueError:
        print("error getting views: " + url)
        #print(response.text)
        return None
    # Check if the 'items' key is in the data
    if 'items' in data:
        total_views = sum(day['views'] for day in data['items'])
        print(f"{platform} {article_title} views_30d:{total_views}")
        return total_views
    else:
        print(f"No 'items' key found in the response: {url}")
        return None
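# Example request URL shape (article and date range are hypothetical):
# https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/sv.wikipedia/
#   all-access/user/Stockholm/daily/20231001/20231031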
def main():
    path = "./"
    filepath = path + "commons-statistik-categories.txt"
    categories = read_list_of_categories(filepath)
    results = []
    for category in categories:
        print(f"\nStarting new category: {category}")
        files = get_files_in_category(category)
        print(f"Found {len(files)} files in {category}")
        for index, file in enumerate(files):
            avgpermonth = 0
            yearviews = get_image_views(file, start="2022110100", end="2023103100")
            if yearviews is None:
                print("no year")
            elif yearviews:
                yearviews = round(yearviews)
                avgpermonth = round(yearviews / 12)
            usages = get_image_usage(file)
            uploaddate = get_upload_date(file)
            if not usages:
                print(index, len(files), category, file)
                results.append({"category": category, "image": file, "views_year": yearviews, "avg_month": avgpermonth, "uploaddate": uploaddate})
            else:
                for wiki, page_title in usages:
                    #print("Next", page_title)
                    user, timestamp, item_id, i = get_history_and_find_image_addition(page_title, file, wiki)
                    print(index, len(files), category, file, wiki, page_title, user, timestamp, item_id, i)
                    results.append({"category": category, "image": file, "views_year": yearviews, "avg_month": avgpermonth, "wiki": wiki, "page_title": page_title, "item_id": item_id, "user": user, "revtimestamp": timestamp, "uploaddate": uploaddate, "in_use": "True"})
    output_file1 = "commons_statistics.xlsx"
    output_file2 = "commons_statistics_with_potential.xlsx"
    df = pd.DataFrame(results)
    # Map Wikipedia language codes to language names from a local lookup file
    lang_df = pd.read_csv('lang_code.csv')
    lang_dict = dict(zip(lang_df['WP-code'], lang_df['Language']))
    df['language_code'] = df['wiki'].str.split('.').str[0]
    df['platform'] = df['wiki'].str.split('.').str[1]
    df['language'] = df['language_code'].map(lang_dict)
    df = df.drop('language_code', axis=1)
    # Move the 'language' and 'platform' columns next to 'wiki'
    cols = df.columns.tolist()
    cols.insert(cols.index('wiki') + 1, cols.pop(cols.index('language')))
    df = df[cols]
    cols = df.columns.tolist()
    cols.insert(cols.index('wiki') + 1, cols.pop(cols.index('platform')))
    df = df[cols]
    df.fillna("", inplace=True)
    print(df)
    # Create a new Excel workbook
    workbook = Workbook()
    worksheet = workbook.active
    for row in dataframe_to_rows(df, index=False, header=True):
        worksheet.append(row)
    for cell in worksheet['A'][1:]:
        cell.hyperlink = "https://commons.wikimedia.org/wiki/Category:" + str(cell.value)
    for cell in worksheet['B'][1:]:
        cell.hyperlink = "https://commons.wikimedia.org/wiki/" + str(cell.value)
    for index, cell in enumerate(worksheet['H'][1:], start=1):
        domain = worksheet['E'][index].value
        if domain != "":
            cell.hyperlink = f"https://{domain}/wiki/" + str(cell.value)
    for cell in worksheet['J'][1:]:
        if cell.value != "":
            cell.hyperlink = "https://wikidata.wikiscan.org/?menu=userstats&user=" + str(cell.value)
    # Save the workbook to an XLSX file
    print("saving statistics to worksheet")
    workbook.save(output_file1)
    print("done")
    qids = df['item_id'].unique().tolist()
    print(qids)
    rows = get_wikipedia_articles_by_qid(qids)
    dfarticles = pd.DataFrame(rows[1:], columns=rows[0])
    print(dfarticles)
    merged_df = pd.merge(df[df['in_use'] == "True"], dfarticles, on=['item_id', 'wiki'], how='outer')  # [dfarticles['wiki'].isin(['sv.wikipedia.org', 'fi.wikipedia.org', 'en.wikipedia.org'])]
    merged_df = merged_df.sort_values(by=['item_id', 'image', 'category'])
    merged_df['category'] = merged_df['category'].ffill()
    merged_df['image'] = merged_df['image'].ffill()
    merged_df['in_use'] = merged_df['in_use'].fillna("False")
    df = merged_df
    df = df.sort_values(by=['category', 'image', 'in_use'], ascending=[True, True, False])
    print(df)
    # Create a new Excel workbook
    workbook = Workbook()
    worksheet = workbook.active
    for row in dataframe_to_rows(df, index=False, header=True):
        worksheet.append(row)
    print("A")
    for cell in worksheet['A'][1:]:
        cell.hyperlink = "https://commons.wikimedia.org/wiki/Category:" + str(cell.value)
    print("B")
    for cell in worksheet['B'][1:]:
        cell.hyperlink = "https://commons.wikimedia.org/wiki/" + str(cell.value)
    #print("H")
    #for index, cell in enumerate(worksheet['H'][1:], start=1):
    #    domain = worksheet['E'][index].value
    #    if domain != "":
    #        cell.hyperlink = f"https://{domain}/wiki/" + str(cell.value)
    print("J")
    for cell in worksheet['J'][1:]:
        if cell.value != "":
            cell.hyperlink = "https://wikidata.wikiscan.org/?menu=userstats&user=" + str(cell.value)
    print("N")
    for cell in worksheet['N'][1:]:
        cell.hyperlink = str(cell.value)
    # Save the workbook to an XLSX file
    print("saving statistics with potential articles to worksheet")
    workbook.save(output_file2)
    print("done")
if __name__ == "__main__":
    main()