-
Notifications
You must be signed in to change notification settings - Fork 0
/
wiki_to_csv.py
executable file
·207 lines (181 loc) · 6.24 KB
/
wiki_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#!/usr/bin/env python3
import re
import logging
from collections import defaultdict
from collections import OrderedDict
import datetime
import csv
import sys
import argparse
import pywikibot
from pywikibot import pagegenerators
# Every metric name the script knows about, in the order the rows are
# written to the CSV. Template parameters whose name is not listed here are
# ignored when page data is summed up. The list is grouped per category
# ("deltagare" = participants, "redigerare" = editors, "org" and "content"),
# and each category ends with its "_total" entry.
METRIC_NAMES = (
    [
        "deltagare_♀",
        "deltagare_♂",
        "deltagare_⚥",
        "deltagare_?",
        "deltagare_total",
    ]
    + [
        "redigerare_♀",
        "redigerare_♂",
        "redigerare_⚥",
        "redigerare_?",
        "redigerare_total",
    ]
    + [
        "org_♀",
        "org_♂",
        "org_⚥",
        "org_?",
        "org_total",
    ]
    + [
        "content_wp",
        "content_com",
        "content_wd",
        "content_other",
        "content_total",
    ]
)
def is_int(value):
    """Tell whether the given value can be converted by ``int()``.

    @param value: The value to check
    @type value: str, or int
    @return: True when ``int(value)`` succeeds, False when it raises
        ValueError or TypeError
    @rtype: bool
    """
    try:
        int(value)
    except (ValueError, TypeError):
        return False
    else:
        return True
def extract_elements_from_template_param(template_param):
    """Split a parsed template param into a sanitized (field, value) pair.

    The text is divided at the first '='. When there is no '=', the whole
    text becomes the field and the value is the empty string.

    @param template_param: a single template parameter, e.g. "year = 2018"
    @type template_param: str
    @return: the field name without surrounding whitespace, and the value
        as cleaned by sanitize_wikitext_string()
    @rtype: tuple
    """
    name, _separator, raw_value = template_param.partition('=')
    cleaned_value = sanitize_wikitext_string(raw_value)
    return (name.strip(), cleaned_value)
def sanitize_wikitext_string(value):
    """Remove undesirable wikitext features from a string.

    HTML comments (together with at most one whitespace character on each
    side) are replaced by a single space, everything from the first "<ref"
    onwards is dropped, and the result is stripped of surrounding
    whitespace.

    @param value: raw wikitext value of a template parameter
    @type value: str
    @return: the cleaned value
    @rtype: str
    """
    # Comments are removed before cutting at "<ref" so that a comment which
    # itself contains "<ref" does not leave a dangling "<!--" behind.
    # DOTALL lets '.' match newlines, so comments spanning several lines are
    # removed as well.
    value = re.sub(r"\s?<!--.*?-->\s?", ' ', value, flags=re.DOTALL)
    value = value.split("<ref")[0]
    return value.strip()
def extract_all_data_on_page(page, year):
    """Sum the metrics of all 'Core metrics' templates on a page.

    Templates with another name, and 'Core metrics' templates whose "year"
    parameter is missing or differs from the requested year, are skipped
    with an info log entry. Empty and non-integer metric values are skipped
    with a warning.

    @param page: page object providing templatesWithParams()
    @param year: the year to collect data for; compared as a string
    @type year: str, or int
    @return: dict with the keys "metrics" (defaultdict mapping metric name
        to summed value) and "number_of_events" (number of templates that
        matched the name and the year)
    @rtype: dict
    """
    logging.debug("=== {} ===".format(page))
    contents = defaultdict(int)
    number_of_events = 0
    for (template, params) in page.templatesWithParams():
        logging.debug("--- Template ---")
        template_name = template.title(with_ns=False)
        if template_name != 'Core metrics':
            logging.info(
                "Skipping template with incorrect name '{}'".format(template)
            )
            continue

        template_metrics = dict(
            extract_elements_from_template_param(param) for param in params
        )

        # A template without a "year" parameter is treated like one with
        # the wrong year instead of raising KeyError.
        template_year = template_metrics.get("year")
        if template_year != str(year):
            logging.info(
                "Skipping template with wrong year: {} on page {}."
                .format(template_year, page)
            )
            continue

        number_of_events += 1
        logging.debug("--- Core metrics #{} ---".format(number_of_events))
        for key, value in template_metrics.items():
            if key not in METRIC_NAMES:
                continue
            if value == "":
                logging.warning(
                    "Empty value on page '{}' for key '{}'"
                    .format(page, key)
                )
                continue
            if not is_int(value):
                logging.warning(
                    "Non-integer value on page '{}' for key '{}': {}"
                    .format(page, key, value)
                )
                continue

            amount = int(value)
            logging.debug("Adding value to {}, {}".format(key, value))
            contents[key] += amount
            total_key = key.split("_")[0] + "_total"
            # An explicitly supplied "<category>_total" was already added on
            # the line above; adding it again would count it twice.
            # NOTE(review): a template giving both the parts and the total
            # still contributes both to the total - confirm whether such
            # templates exist.
            if key != total_key:
                contents[total_key] += amount

    return {
        "metrics": contents,
        "number_of_events": number_of_events
    }
def get_all_page_data(year):
    """Collect the metrics of every project page in the year's category.

    Pages are read from the category 'Globala mätetal <year>' on the site
    with code 'se' in the 'wikimediachapter' family. Only pages whose title
    ends with "/Global Metrics" or "/Resultat och mätetal" are processed;
    any other page in the category is reported with a warning.

    @param year: the year to collect data for
    @type year: str, or int
    @return: mapping from project name to the data returned by
        extract_all_data_on_page(), in the order the pages were yielded
    @rtype: OrderedDict
    """
    site = pywikibot.Site('se', 'wikimediachapter')
    cat = pywikibot.Category(site, 'Globala mätetal {}'.format(year))
    expected_suffixes = ("/Global Metrics", "/Resultat och mätetal")
    data = OrderedDict()
    for page in pagegenerators.PreloadingGenerator(cat.articles(), 100):
        title = page.title()
        if not title.endswith(expected_suffixes):
            logging.warning(
                "Template found outside of an expected subpage: {}."
                .format(title)
            )
            continue

        project_data = extract_all_data_on_page(page, year)
        if not project_data:
            continue

        # The project name is the part between the first ':' (presumably
        # the namespace separator) and the first '/' after it. A title
        # without any ':' is used from its start instead of raising
        # IndexError.
        title_parts = title.split(":")
        base = title_parts[1] if len(title_parts) > 1 else title_parts[0]
        project_name = base.split("/")[0]
        data[project_name] = project_data
    return data
def _order_projects(data, order_file):
    """Return the project data rearranged to follow the order file."""
    # The context manager closes the file; UTF-8 is requested explicitly so
    # that non-ASCII project names do not depend on the locale.
    with open(order_file, encoding="utf-8") as handle:
        order = [line.strip() for line in handle]

    ordered_data = OrderedDict()
    blank_counter = 0
    for project in order:
        if project in data:
            ordered_data[project] = data[project]
        elif project == "":
            # A blank line in the file becomes an empty column. The
            # "_<n>" key keeps the columns distinct and marks them as
            # blank for the meta data row.
            ordered_data["_{}".format(blank_counter)] = {
                "number_of_events": "",
                "metrics": {metric: "" for metric in METRIC_NAMES}
            }
            blank_counter += 1
        else:
            # If no data was found for a project,
            # make a set of empty data.
            ordered_data[project] = {
                "number_of_events": 0,
                "metrics": {metric: 0 for metric in METRIC_NAMES}
            }

    # Projects that have data but are not listed in the file are left out
    # of the output; say so instead of dropping them silently.
    for project in data:
        if project not in ordered_data:
            logging.warning(
                "Project '{}' is not in the project order file and is "
                "left out of the output.".format(project)
            )
    return ordered_data


def print_csv(data, project_order_file=None):
    """Write the collected project data to stdout as CSV.

    The first row holds the project names, the second row the current date
    and the number of events per project, followed by one row for each
    name in METRIC_NAMES.

    @param data: mapping from project name to a dict with the keys
        "metrics" and "number_of_events"
    @type data: dict
    @param project_order_file: path to a file with one project name per
        line, deciding the column order. When omitted, the value parsed
        from the command line (module-level ``args``) is used if present.
    @type project_order_file: str, or None
    """
    if project_order_file is None:
        # Fall back on the command line arguments; they only exist when
        # the file is run as a script.
        cli_args = globals().get("args")
        project_order_file = getattr(cli_args, "project_order_file", None)

    # Order projects according to file, if given.
    if project_order_file:
        data = _order_projects(data, project_order_file)

    writer = csv.writer(sys.stdout)
    writer.writerow([""] + list(data))

    today = datetime.date.today()
    meta_data = ["meta_data"]
    for name, project in data.items():
        if name.startswith('_'):
            meta_data.append("")
        else:
            meta_data.append(
                "{} ({} st. event)"
                .format(today, project["number_of_events"])
            )
    writer.writerow(meta_data)

    for key in METRIC_NAMES:
        values = [str(project["metrics"][key]) for project in data.values()]
        writer.writerow([key] + values)
# Script entry point: parse the command line, set up logging and print the
# CSV for the requested year to stdout.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--year", "-y", required=True)
    parser.add_argument("--verbose", "-v", action="store_true")
    parser.add_argument(
        "project_order_file",
        # The two sentences used to run together ("one per line.The ...").
        help=("Path to a file containing project names in Swedish, one per "
              "line. The projects in the output will have the same order."),
        nargs="?"
    )
    # ``args`` must stay a module-level name: print_csv() reads the project
    # order file from it.
    args = parser.parse_args()
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level)
    print_csv(get_all_page_data(args.year))