-
-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathgenerate.py
395 lines (345 loc) · 16.4 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
"""
A script that generates the base MD stats report file, parses the stats Cloudflare provides
and exports them to the DB as well as generating an MD table of them for the report.
"""
import calendar
import os
import re
import sqlite3
from random import choice, random
from typing import List, Dict, Union
import beautifultable
import requests
import graph
def generate_regex(order: List[str]) -> str:
final = []
patterns = {
"requests": "\"?(?P<requests>\\d+)\"?",
"resources": "\"?(?P<resource>[^\\s\"]+)\"?",
"bandwidth": "\"?(?P<bandwidth>\\d+(?:\\.\\d+)?)\"?"
}
joiner = "[\\s|│?,]*"
for item in order:
if item in patterns:
final.append(patterns[item])
final = "^" + joiner + joiner.join(final) + joiner + "$"
return final
def parse_raw(url: str, column_regex: str) -> List[Dict[str, Union[int, float, str]]]:
# Get the raw dump from the url
raw = requests.get(url).text
items = []
file_regex = "^cdnjs\\.cloudflare\\.com\\/ajax\\/libs\\/(?P<library>.+?)\\/(?P<version>.+?)\\/(?P<file>.+)$"
# Iterate over each line of the dump
for line in raw.split("\n"):
try:
# Attempt to find the components through regex
x = re.search(column_regex, line)
y = x.groupdict()
# Ensure correct datatypes
y["requests"] = int(y["requests"])
y["bandwidth"] = float(y["bandwidth"])
# Parse resource further
z = re.search(file_regex, y["resource"]).groupdict()
for key, value in z.items():
y[key] = value
items.append(y)
except:
continue
return items
def database_data(request_data: List[Dict[str, Union[int, float, str]]], month: int, year: int, total_data: dict):
# Create db
conn = sqlite3.connect("data.db")
c = conn.cursor()
c.execute(
'''CREATE TABLE IF NOT EXISTS DATA (resource text, bandwidth float, requests integer, library text, version text, file text, month integer, year integer)''')
conn.commit()
c.execute(
'''CREATE TABLE IF NOT EXISTS DATA_TOTALS (requests_1_percent integer, requests_3_days integer, bandwidth_1_percent float, bandwidth_3_days float, month integer, year integer)''')
conn.commit()
# Export all the data
for row in request_data:
c.execute(
'''INSERT INTO DATA (resource,bandwidth,requests,library,version,file,month,year) VALUES (?,?,?,?,?,?,?,?)''',
[
row["resource"], row["bandwidth"], row["requests"], row["library"], row["version"], row["file"], month,
year
])
conn.commit()
c.execute(
'''INSERT INTO DATA_TOTALS (requests_1_percent,requests_3_days,bandwidth_1_percent,bandwidth_3_days,month,year) VALUES (?,?,?,?,?,?)''',
[
total_data["REQUESTS_1_PER"], total_data["REQUESTS_3_DAY"], total_data["BANDWIDTH_1_PER"],
total_data["BANDWIDTH_3_DAY"], month, year
])
conn.commit()
# Create libraries view
c.execute('''CREATE VIEW IF NOT EXISTS libraries AS
SELECT
*,
total_requests / days AS requests_per_day,
total_bandwidth / days AS bandwidth_per_day
FROM
(
SELECT
SUM(requests) * 100 AS total_requests,
SUM(bandwidth) * 100 AS total_bandwidth,
library,
COUNT(DISTINCT(version || file)) AS files,
COUNT(DISTINCT(version)) AS versions,
year,
month,
(julianday(DATE(year || "-" || printf("%02d", month) || "-01", "+1 month")) - julianday(year || "-" || printf("%02d", month) || "-01")) AS days,
year || "-" || printf("%02d", month) as date
FROM DATA GROUP BY library, date ORDER BY total_requests DESC
);''')
conn.commit()
# Create totals view
c.execute('''CREATE VIEW IF NOT EXISTS totals AS
SELECT
*,
total_requests / days AS requests_per_day,
total_bandwidth / days AS bandwidth_per_day,
total_bandwidth / total_requests AS bandwidth_per_request
FROM
(
SELECT
year,
month,
date,
days,
(CASE
WHEN bandwidth_3_days IS NULL THEN bandwidth_1_percent * 100
ELSE (bandwidth_1_percent * 100 * 0.75) + ((bandwidth_3_days / 3) * days * 0.25)
END) AS total_bandwidth,
(CASE
WHEN requests_3_days IS NULL THEN requests_1_percent * 100
ELSE (requests_1_percent * 100 * 0.75) + ((requests_3_days / 3) * days * 0.25)
END) AS total_requests
FROM
(
SELECT
bandwidth_1_percent,
bandwidth_3_days,
requests_1_percent,
requests_3_days,
year,
month,
(julianday(DATE(year || "-" || printf("%02d", month) || "-01", "+1 month")) - julianday(year || "-" || printf("%02d", month) || "-01")) AS days,
year || "-" || printf("%02d", month) as date
FROM DATA_TOTALS ORDER BY date DESC
)
);''')
conn.commit()
conn.close()
def table_data(request_data: List[Dict[str, Union[int, float, str]]]) -> str:
# Create the header
header = beautifultable.BeautifulTable(max_width=1000, default_alignment=beautifultable.ALIGN_LEFT)
header.set_style(beautifultable.STYLE_MARKDOWN)
header.column_headers = ["#", "Requests", "Bandwidth", "cdnjs Resource URL"]
header.append_row(["", "", "", ""])
# Create the base table
base_table = beautifultable.BeautifulTable(max_width=1000, default_alignment=beautifultable.ALIGN_LEFT)
base_table.set_style(beautifultable.STYLE_MARKDOWN)
base_table.detect_numerics = False # This breaks the formatting
# Sort the data
request_data = sorted(request_data, key=lambda x: x["requests"], reverse=True)
# Generate the table
for i, row in enumerate(request_data):
try:
base_table.append_row([
"{:,}".format(i + 1),
"{:,}".format(row["requests"]),
"{:,.2f}".format(row["bandwidth"]),
"[{0}](https://{0})".format(row["resource"])
])
except:
continue
# Export header + table
base_table.column_alignments[2] = beautifultable.ALIGN_RIGHT
header = str(header)
base_table = str(base_table)
return header[:header.rfind("\n")] + "\n" + base_table
def over_under_nearly(exact: Union[float, int], rounded: int) -> str:
return "over" if exact > rounded else ("just under" if rounded - exact <= 0.2 else "nearly")
def get_template(template_file: str, variables: dict) -> str:
# Load in template file
with open("template/" + template_file) as f:
template = f.read()
# Substitute in variables
for key, value in variables.items():
template = template.replace("{{" + key + "}}", str(value))
return template
def create_file(table_string: str, month: int, year: int, total_data: dict):
variables = {
"MONTH": calendar.month_name[month],
"YEAR": year,
"DAYS": calendar.monthrange(year, month)[1],
"TABLE": table_string
}
# Requests
REQUESTS_1_PER_TOTAL = total_data["REQUESTS_1_PER"] * 100
REQUESTS_3_DAY_TOTAL = None if total_data["REQUESTS_3_DAY"] is None else (total_data["REQUESTS_3_DAY"] / 3) * variables["DAYS"]
REQUESTS = REQUESTS_1_PER_TOTAL if REQUESTS_3_DAY_TOTAL is None else REQUESTS_1_PER_TOTAL * .75 + REQUESTS_3_DAY_TOTAL * .25
variables.update({
"REQUESTS_1_PER": "{:,}".format(total_data["REQUESTS_1_PER"]),
"REQUESTS_1_PER_TOTAL": "{:,.0f}".format(REQUESTS_1_PER_TOTAL),
"REQUESTS_3_DAY": "" if total_data["REQUESTS_3_DAY"] is None else "{:,}".format(total_data["REQUESTS_3_DAY"]),
"REQUESTS_3_DAY_TOTAL": "" if REQUESTS_3_DAY_TOTAL is None else "{:,.0f}".format(REQUESTS_3_DAY_TOTAL),
"REQUESTS": "{:,.0f}".format(REQUESTS),
"REQUESTS_STAT": "",
"REQUESTS_HIGHLIGHT": "", # Key highlights
"REQUESTS_DESCRIPTION": "" # Total number of requests
})
# Requests insights
requests_billion = REQUESTS / 1000000000
requests_billion_rounded = round(requests_billion)
variables["REQUESTS_STAT"] = over_under_nearly(requests_billion, requests_billion_rounded) \
+ " {:,.0f} billion requests".format(requests_billion_rounded)
state = bool(round(random()))
variables["REQUESTS_HIGHLIGHT"] = ("This month ({} {}), ".format(variables["MONTH"], year) if state else "") \
+ "cdnjs served **{}**".format(variables["REQUESTS_STAT"]) \
+ (" in {} {}.".format(variables["MONTH"], year) if not state else ".")
variables["REQUESTS_DESCRIPTION"] = variables["REQUESTS_STAT"].capitalize() \
+ " or {} {:,.1f} billion requests {} day of {}".format(
choice(["around", "roughly", "approximately"]),
requests_billion / variables["DAYS"],
choice(["every", "every single", "each"]), variables["MONTH"])
# Sites
SITES = total_data["SITES_1_PER"] * 100
variables.update({
"SITES_1_PER": "{:,}".format(total_data["SITES_1_PER"]),
"SITES": "{:,.0f}".format(SITES),
"SITES_STAT": "",
"SITES_HIGHLIGHT": "", # Key highlights
"SITES_DESCRIPTION": "" # Websites using cdnjs
})
# Sites insights
variables["SITES_STAT"] = "{:,.2f} billion {} websites used cdnjs {}".format(
SITES / 1000000000,
choice(["unique", "different"]),
choice(["assets", "resources"])
)
state = not state
variables["SITES_HIGHLIGHT"] = ("In {}, ".format(variables["MONTH"]) if state else "") \
+ "**{}**".format(variables["SITES_STAT"]) \
+ (" this month!" if not state else "!")
variables["SITES_DESCRIPTION"] = variables["SITES_STAT"].capitalize() \
+ " in {} {}".format(variables["MONTH"], year)
# Bandwidth
BANDWIDTH_1_PER_TOTAL_GB = total_data["BANDWIDTH_1_PER"] * 100
BANDWIDTH_1_PER_TOTAL_PB = BANDWIDTH_1_PER_TOTAL_GB / 1000000
BANDWIDTH_3_DAY_TOTAL_GB = None if total_data["BANDWIDTH_3_DAY"] is None else (total_data["BANDWIDTH_3_DAY"] / 3) * variables["DAYS"]
BANDWIDTH_3_DAY_TOTAL_PB = None if BANDWIDTH_3_DAY_TOTAL_GB is None else BANDWIDTH_3_DAY_TOTAL_GB / 1000000
BANDWIDTH_GB = BANDWIDTH_1_PER_TOTAL_GB if BANDWIDTH_3_DAY_TOTAL_GB is None else BANDWIDTH_1_PER_TOTAL_GB * .75 + BANDWIDTH_3_DAY_TOTAL_GB * .25
BANDWIDTH_PB = BANDWIDTH_GB / 1000000
variables.update({
"BANDWIDTH_1_PER": "{:,}".format(total_data["BANDWIDTH_1_PER"]),
"BANDWIDTH_1_PER_TOTAL_GB": "{:,.1f}".format(BANDWIDTH_1_PER_TOTAL_GB),
"BANDWIDTH_1_PER_TOTAL_PB": "{:,.2f}".format(BANDWIDTH_1_PER_TOTAL_PB),
"BANDWIDTH_3_DAY": "" if total_data["BANDWIDTH_3_DAY"] is None else "{:,}".format(total_data["BANDWIDTH_3_DAY"]),
"BANDWIDTH_3_DAY_TOTAL_GB": "" if BANDWIDTH_3_DAY_TOTAL_GB is None else "{:,.1f}".format(BANDWIDTH_3_DAY_TOTAL_GB),
"BANDWIDTH_3_DAY_TOTAL_PB": "" if BANDWIDTH_3_DAY_TOTAL_PB is None else "{:,.2f}".format(BANDWIDTH_3_DAY_TOTAL_PB),
"BANDWIDTH_GB": "{:,.1f}".format(BANDWIDTH_GB),
"BANDWIDTH_PB": "{:,.2f}".format(BANDWIDTH_PB),
"BANDWIDTH_STAT": "",
"BANDWIDTH_HIGHLIGHT": "", # Key highlights
"BANDWIDTH_DESCRIPTION": "" # Total bandwidth usage
})
# Bandwidth insights
variables["BANDWIDTH_STAT"] = "a {} consumption of {} PB of data".format(
choice(["massive", "huge"]), variables["BANDWIDTH_PB"])
state = not state
variables["BANDWIDTH_HIGHLIGHT"] = ("This month, " if state else "") \
+ "cdnjs used **{}** to serve these requests" \
.format(variables["BANDWIDTH_STAT"]) \
+ (" this month." if not state else ".")
variables["BANDWIDTH_DESCRIPTION"] = "This gives cdnjs {} for requests {}".format(
variables["BANDWIDTH_STAT"].replace("PB", "petabytes").replace("consumption", "bandwidth consumption"),
choice(["this month", "in {}".format(variables["MONTH"])]))
# Per day key highlight
bandwidth_days_tb = (BANDWIDTH_GB / variables["DAYS"]) / 1000
bandwidth_days_tb_rounded = round(bandwidth_days_tb)
requests_days_billion = (REQUESTS / variables["DAYS"]) / 1000000000
state = bool(round(random()))
variables["PER_DAY_HIGHLIGHT"] = "{} **{} {:,} {} of data and {:,.1f} billion requests {} day** (averaged)." \
.format(
"That's" if state else "That works out to",
over_under_nearly(bandwidth_days_tb, bandwidth_days_tb_rounded),
bandwidth_days_tb_rounded,
"terabytes" if state else "TB",
requests_days_billion,
choice(["each", "every"])
)
# Per request key highlight
bandwidth_requests_kb = (BANDWIDTH_GB / REQUESTS) * 1000000
state = bool(round(random()))
variables["PER_REQUEST_HIGHLIGHT"] = "In {}, **each request to cdnjs{} used only {:,.2f} KB of data{}**." \
.format(
variables["MONTH"],
" (on average)" if state else "",
bandwidth_requests_kb,
"" if state else " on average",
)
# Connect to DB
conn = sqlite3.connect("data.db")
c = conn.cursor()
# Get top assets
c.execute("SELECT library, version, file, requests, bandwidth FROM DATA WHERE month = ? AND year = ? ORDER BY requests DESC LIMIT 5", (month, year))
rows = c.fetchall()
for i, row in enumerate(rows):
variables["TOP_ASSETS_" + str(i + 1) + "_LIBRARY"] = row[0]
variables["TOP_ASSETS_" + str(i + 1) + "_VERSION"] = row[1]
variables["TOP_ASSETS_" + str(i + 1) + "_FILE"] = row[2]
variables["TOP_ASSETS_" + str(i + 1) + "_REQUESTS"] = "{:,.2f} billion".format((row[3] * 100) / 1000000000)
variables["TOP_ASSETS_" + str(i + 1) + "_BANDWIDTH"] = "{:,.2f} TB".format((row[4] * 100) / 1000)
# Get top libraries
c.execute("SELECT library, total_requests, total_bandwidth FROM libraries WHERE month = ? AND year = ? ORDER BY total_requests DESC LIMIT 5", (month, year))
rows = c.fetchall()
for i, row in enumerate(rows):
variables["TOP_LIBRARIES_" + str(i + 1) + "_NAME"] = row[0]
variables["TOP_LIBRARIES_" + str(i + 1) + "_REQUESTS"] = "{:,.2f} billion".format(row[1] / 1000000000)
variables["TOP_LIBRARIES_" + str(i + 1) + "_BANDWIDTH"] = "{:,.2f} TB".format(row[2] / 1000)
# Done with DB
conn.close()
# Load in requests template
variables["REQUESTS_PRETEXT"] = get_template("requests_1_per.md" if REQUESTS_3_DAY_TOTAL is None else "requests_combined.md", variables)
# Load in bandwidth template
variables["BANDWIDTH_PRETEXT"] = get_template("bandwidth_1_per.md" if REQUESTS_3_DAY_TOTAL is None else "bandwidth_combined.md", variables)
# Final template
template = get_template("template.md", variables)
# Save to new file
os.makedirs("./{}".format(year), exist_ok=True)
with open("./{}/cdnjs_{}_{}.md".format(year, variables["MONTH"], year), "w+") as f:
f.write(template)
if __name__ == "__main__":
REQUESTS_1_PER = 1500000000
REQUESTS_3_DAY = None
SITES_1_PER = 0
BANDWIDTH_1_PER = 30000.00 # GB
BANDWIDTH_3_DAY = None # GB
RAW_TABLE_DATA_URL = "" # 1% data
RAW_TABLE_DATA_ORDER = ["requests", "resources", "bandwidth"]
MONTH = 1
YEAR = 2020
stats = {
"REQUESTS_1_PER": REQUESTS_1_PER,
"REQUESTS_3_DAY": REQUESTS_3_DAY,
"SITES_1_PER": SITES_1_PER,
"BANDWIDTH_1_PER": BANDWIDTH_1_PER,
"BANDWIDTH_3_DAY": BANDWIDTH_3_DAY
}
regex = generate_regex(RAW_TABLE_DATA_ORDER)
items = parse_raw(RAW_TABLE_DATA_URL, regex)
database_data(items, MONTH, YEAR, stats)
request_table = table_data(items)
create_file(request_table, MONTH, YEAR, stats)
graph.top_5_resources()
graph.top_5_libraries()
graph.total_requests_and_bandwidth()
graph.daily_requests_and_bandwidth()
"""
Create venv: python3 -m venv
Activate venv: source venv/bin/activate
Install requirements: python3 -m pip install -r requirements.txt
Run script: python3 generate.py
"""