-
Notifications
You must be signed in to change notification settings - Fork 5
/
commands_history.txt
244 lines (171 loc) · 4.22 KB
/
commands_history.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
python ~/wmf/util/sqoop_utils.py --db clickstream --langs ar,fa,en --tables page,redirect,pagelinks
python get_clickstream.py \
--start 2016-04-01 \
--stop 2016-04-30 \
--table 2016_04_en \
--lang en \
--priority
python get_clickstream.py \
--start 2016-04-01 \
--stop 2016-04-30 \
--table 2016_04_ar \
--lang ar \
--priority
python get_clickstream.py \
--start 2016-04-01 \
--stop 2016-04-30 \
--table 2016_04_fa \
--lang fa \
--priority
python get_clickstream.py \
--start 2016-08-01 \
--stop 2016-08-31 \
--table 2016_08_en \
--lang en \
--priority
python get_clickstream.py \
--start 2016-09-01 \
--stop 2016-09-30 \
--table 2016_09_en \
--lang en \
--priority
python get_clickstream.py \
--start 2017-01-01 \
--stop 2017-01-31 \
--table 2017_01_en \
--lang en
Sanity Checks:
select count(*) from en_page_raw; 12415519 41483269
select count(distinct(page_id)) from en_page_raw; 12415519
select count(*) from en_redirect_raw; 7403622
select count(distinct(rd_from)) from en_redirect_raw; 7403622
select count(*) from en_redirect; 7294925
select count(distinct(rd_from)) from en_redirect; 7294925
Test:
SELECT
COUNT(*)
FROM
wmf.webrequest
WHERE
year=2016
AND month=2
AND day=1
AND hour = 1
AND webrequest_source = 'text'
AND normalized_host.project_class = 'wikipedia'
AND normalized_host.project = 'en'
AND is_pageview
AND LENGTH(REGEXP_EXTRACT(reflect('java.net.URLDecoder', 'decode', uri_path), '/wiki/(.*)', 1)) > 0
AND agent_type = 'user';
12174742
SELECT
count(*) n_pairs,
SUM(n) n_requests
FROM
clickstream.2016_04_ar;
n_pairs n_requests
27123320 6696035191
n_pairs n_requests
21777059 6375791557
SELECT
count(*) n_pairs,
SUM(n) n_requests,
type
FROM
clickstream.2016_04_en
GROUP BY
type;
n_pairs n_requests type
14640150 1585215153 link
2380426 152567908 other
10102744 4958252130 external
n_pairs n_requests type
12442174 1126970900 link
591406 37604641 other
8743479 5211216016 external
SELECT * FROM
clickstream.2016_02
WHERE curr RLIKE 'Talk:';
##################################
Sanity checks for 2016_08_en:
SELECT
count(*) n_pairs,
SUM(n) n_requests
FROM
clickstream.2016_08_en;
24197165 7502925551
SELECT
count(*) n_pairs,
SUM(n) n_requests,
type
FROM
clickstream.2016_08_en
GROUP BY
type;
n_pairs n_requests type
14255237 1252275529 link
676316 51135807 other
9265612 6199514215 external
select * from 2016_08_en where type = 'other' order by n desc limit 10;
SELECT * FROM
clickstream.2016_08_en
WHERE curr RLIKE 'User:';
None
##################################
Sanity checks for 2016_09_en:
SELECT
count(*) n_pairs,
SUM(n) n_requests
FROM
clickstream.2016_09_en;
24187538 6844193790
SELECT
count(*) n_pairs,
SUM(n) n_requests,
type
FROM
clickstream.2016_09_en
GROUP BY
type;
n_pairs n_requests type
13951370 1187620828 link
608573 42237813 other
9627595 5614335149 external
select * from 2016_09_en where type = 'other' order by n desc limit 10;
SELECT * FROM
clickstream.2016_09_en
WHERE curr RLIKE 'User:';
None
##################################
Sanity checks for 2017_01_en:
ETL Checks:
select count(*) from en_page_raw; 41483269
select count(distinct(page_id)) from en_page_raw; 41483269
select count(*) from en_redirect_raw; 9364638
select count(distinct(rd_from)) from en_redirect_raw; 9364638
select count(*) from en_redirect; 9364432
select count(distinct(rd_from)) from en_redirect; 9364432
Clickstream Checks:
SELECT
count(*) n_pairs,
SUM(n) n_requests
FROM
clickstream.2017_01_en;
n_pairs n_requests
25755132 8012700871
SELECT
count(*) n_pairs,
SUM(n) n_requests,
type
FROM
clickstream.2017_01_en
GROUP BY
type;
n_pairs n_requests type
15444330 1445520593 link
753355 61218844 other
9557447 6505961434 external
select * from 2017_01_en where type = 'other' order by n desc limit 10;
SELECT * FROM
clickstream.2017_01_en
WHERE curr RLIKE 'User:';