"""
## Hypothesis Testing Stuff
### Standard Stuff
#### Standard Headers
"""
from __future__ import division
import sys
import random
import math
from functools import reduce
from pdb import set_trace
sys.dont_write_bytecode = True
"""
#### Standard Utils
"""
class o():
  "Anonymous container"
  def __init__(i, **fields):
    i.override(fields)
  def override(i, d):
    i.__dict__.update(d)
    return i
  def __repr__(i):
    d = i.__dict__
    name = i.__class__.__name__
    return name + '{' + ' '.join([':%s %s' % (k, pretty(d[k]))
                                  for k in i.show()]) + '}'
  def show(i):
    return [k for k in sorted(i.__dict__.keys())
            if "_" not in k]
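# A quick illustration (not in the original file): 'o' builds ad hoc
# records, such as the paired cursors used by a12() further below:
#   >>> o(name="run1", mu=0.5, _hidden=1)
#   o{:mu 0.5 :name run1}     # keys containing "_" are hidden from repr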
"""
Misc functions:
"""
rand = random.random
any = random.choice  # note: shadows the builtin 'any'
seed = random.seed
exp = lambda n: math.e ** n
ln = lambda n: math.log(n, math.e)
g = lambda n: round(n, 2)
# minimal stand-in formatter, needed by o.__repr__ above
pretty = lambda x: ('%g' % x) if isinstance(x, (int, float)) else str(x)
def median(lst, ordered=False):
  if not ordered:
    lst = sorted(lst)
  n = len(lst)
  p = n // 2
  if n % 2:  # odd number of items: take the middle one
    return lst[p]
  q = p - 1
  q = max(0, min(q, n))
  return (lst[p] + lst[q]) / 2
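# e.g. median([3, 1, 2]) == 2 and median([1, 2, 3, 4]) == 2.5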
def msecs(f):
  import time
  t1 = time.time()
  f()
  return (time.time() - t1) * 1000
def pairs(lst):
  "Return all pairs of items i,i+1 from a list."
  last = lst[0]
  for i in lst[1:]:
    yield last, i
    last = i
def xtile(lst, lo=10e32, hi=-10e32, width=50,
          chops=[0.25, 0.5, 0.75],
          marks=["-", "-", "-"],
          bar="|", star="*", show=" %0.2f"):
  """The function _xtile_ takes a list of (possibly)
  unsorted numbers and presents them as a horizontal
  xtile chart (in ascii format). The default is a
  contracted _quartile_ view that shows the
  25,50,75 breaks in the data (but this can be
  changed - see the optional flags of the function).
  """
  def pos(p): return ordered[int(len(lst) * p)]
  def place(x):
    return int(width * float((x - lo)) / (hi - lo + 0.00001))
  def pretty(lst):
    return ', '.join([show % x for x in lst])
  ordered = sorted(lst)
  lo = min(lo, ordered[0])
  hi = max(hi, ordered[-1])
  what = [pos(p) for p in chops]
  where = [place(n) for n in what]
  out = [" "] * width
  for one, two in pairs(where):
    for i in range(one, two):
      out[i] = marks[0]
    marks = marks[1:]
  out[int(width / 2)] = bar
  out[place(pos(0.5))] = star
  return '(' + ''.join(out) + ")," + pretty(what)
def _tileX():
  import random
  random.seed(1)
  nums = [random.random() ** 2 for _ in range(100)]
  print(xtile(nums, lo=0, hi=1.0, width=25, show=" %0.3E"))
"""
### Standard Accumulator for Numbers
Note the _lt_ method: this accumulator can be sorted by median values.
Warning: this accumulator keeps _all_ numbers. Might be better to use
a bounded cache.
"""
class Num:
  "An Accumulator for numbers"
  def __init__(i, name, inits=[]):
    i.n = i.m2 = i.mu = 0.0
    i.all = []
    i._median = None
    i.name = name
    i.rank = 0
    for x in inits:
      i.add(x)
  def s(i):
    "Standard deviation, from the incrementally updated m2."
    return (i.m2 / (i.n - 1)) ** 0.5
  def add(i, x):
    i._median = None  # invalidate the cached median
    i.n += 1
    i.all += [x]
    delta = x - i.mu
    i.mu += delta * 1.0 / i.n
    i.m2 += delta * (x - i.mu)
  def __add__(i, j):
    return Num(i.name + j.name, i.all + j.all)
  def quartiles(i):
    def p(x):
      return int(100 * g(xs[x]))
    i.median()  # ensures i.all is sorted
    xs = i.all
    n = int(len(xs) * 0.25)
    return p(n), p(2 * n), p(3 * n)
  def median(i):
    if i._median is None:  # 'is None', so a cached median of 0 is kept
      i.all = sorted(i.all)
      i._median = median(i.all, ordered=True)  # the module-level median()
    return i._median
  def __lt__(i, j):
    return i.median() < j.median()
  def spread(i):
    i.all = sorted(i.all)
    n1 = i.n * 0.25
    n2 = i.n * 0.75
    if len(i.all) <= 1:
      return 0
    if len(i.all) == 2:
      return i.all[1] - i.all[0]
    else:
      return i.all[int(n2)] - i.all[int(n1)]
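# A small illustration (not in the original file): Nums accumulate
# numbers incrementally and sort by their medians:
#   >>> n1, n2 = Num("a", [1, 2, 3]), Num("b", [4, 5, 6])
#   >>> n1.mu, n1.median(), n1 < n2
#   (2.0, 2, True)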
"""
### The A12 Effect Size Test
"""
def a12slow(lst1, lst2):
  "how often is x in lst1 more than y in lst2?"
  more = same = 0.0
  for x in lst1:
    for y in lst2:
      if x == y:
        same += 1
      elif x > y:
        more += 1
  x = (more + 0.5 * same) / (len(lst1) * len(lst2))
  return x
def a12(lst1, lst2):
  "how often is x in lst1 more than y in lst2?"
  def loop(t, t1, t2):
    while t1.j < t1.n and t2.j < t2.n:
      h1 = t1.l[t1.j]
      h2 = t2.l[t2.j]
      h3 = t2.l[t2.j + 1] if t2.j + 1 < t2.n else None
      if h1 > h2:
        t1.j += 1
        t1.gt += t2.n - t2.j
      elif h1 == h2:
        if h3 is not None and h1 > h3:  # 'is not None': h3 could be 0
          t1.gt += t2.n - t2.j - 1
        t1.j += 1
        t1.eq += 1
        t2.eq += 1
      else:
        t2, t1 = t1, t2  # h1 < h2: swap roles and keep walking
    return t.gt * 1.0, t.eq * 1.0
  #--------------------------
  lst1 = sorted(lst1, reverse=True)
  lst2 = sorted(lst2, reverse=True)
  n1 = len(lst1)
  n2 = len(lst2)
  t1 = o(l=lst1, j=0, eq=0, gt=0, n=n1)
  t2 = o(l=lst2, j=0, eq=0, gt=0, n=n2)
  gt, eq = loop(t1, t1, t2)
  return gt / (n1 * n2) + eq / 2 / (n1 * n2)
def _a12():
  def f1():
    return a12slow(l1, l2)
  def f2():
    return a12(l1, l2)
  for n in [100, 200, 400, 800, 1600, 3200, 6400]:
    l1 = [rand() for _ in range(n)]
    l2 = [rand() for _ in range(n)]
    t1 = msecs(f1)
    t2 = msecs(f2)
    print(n, g(f1()), g(f2()), int(t1 / t2))
"""Output:
````
n a12(fast) a12(slow) tfast / tslow
--- --------------- -------------- --------------
100 0.53 0.53 4
200 0.48 0.48 6
400 0.49 0.49 28
800 0.5 0.5 26
1600 0.51 0.51 72
3200 0.49 0.49 109
6400 0.5 0.5 244
````
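One way to read this statistic (the conventional Vargha-Delaney
interpretation, not something defined in this file): 0.5 means neither
list tends to win; values near 0 or 1 mean one list almost always wins.
For example:
````
>>> a12([1, 2, 3], [0, 0, 0])   # every x in lst1 beats every y in lst2
1.0
>>> a12([1, 2, 3], [1, 2, 3])   # identical lists: wins balance losses
0.5
````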
## Non-Parametric Hypothesis Testing
The following _bootstrap_ method was introduced in
1979 by Bradley Efron at Stanford University. It
was inspired by earlier work on the jackknife.
Improved estimates of the variance were [developed later][efron01].
[efron01]: http://goo.gl/14n8Wf "Bradley Efron and R.J. Tibshirani. An Introduction to the Bootstrap (Chapman & Hall/CRC Monographs on Statistics & Applied Probability), 1993"
To check if two populations _(y0,z0)_ are different,
sample with replacement from both, many times, to generate
_(y1,z1), (y2,z2), (y3,z3)_, etc.
"""
def sampleWithReplacement(lst):
  "returns a new list, the same size as 'lst'"
  def any(n):
    return random.uniform(0, n)
  def one(lst):
    return lst[int(any(len(lst)))]
  return [one(lst) for _ in lst]
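# A quick sanity check (not in the original file):
#   seed(1); sampleWithReplacement([10, 20, 30]) returns a 3-item list
#   whose items are all drawn from {10, 20, 30}, possibly with repeats.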
"""
Then, for all those samples, check how often a *testStatistic*
computed on the resampled pairs exceeds the one seen in the
original pair. If that happens less than (say) 1% of the time,
then we are 99% confident that the populations are different.
In such a _bootstrap_ hypothesis test, the *testStatistic*
is the difference between the means of the two populations,
moderated by the joint standard deviation of the populations.
"""
def testStatistic(y, z):
  """Checks if two means are different, tempered
  by the sample size of 'y' and 'z'"""
  tmp1 = tmp2 = 0
  for y1 in y.all:
    tmp1 += (y1 - y.mu) ** 2
  for z1 in z.all:
    tmp2 += (z1 - z.mu) ** 2
  s1 = (float(tmp1) / (y.n - 1)) ** 0.5
  s2 = (float(tmp2) / (z.n - 1)) ** 0.5
  delta = z.mu - y.mu
  if s1 + s2:
    delta = delta / ((s1 / y.n + s2 / z.n) ** 0.5)
  return delta
"""
The rest is just details:
+ Efron advises making the mean of both populations the same (see
  the _yhat,zhat_ stuff shown below), so that the null hypothesis
  holds in the resampled data.
+ The class _total_ is just a quick and dirty accumulation class.
+ For more details see [the Efron text][efron01].
"""
def bootstrap(y0, z0, conf=0.01, b=1000):
  """The bootstrap hypothesis test from
  p220 to 223 of Efron's book 'An
  Introduction to the Bootstrap'."""
  class total():
    "quick and dirty data collector"
    def __init__(i, some=[]):
      i.sum = i.n = i.mu = 0
      i.all = []
      for one in some:
        i.put(one)
    def put(i, x):
      i.all.append(x)
      i.sum += x
      i.n += 1
      i.mu = float(i.sum) / i.n
    def __add__(i1, i2):
      return total(i1.all + i2.all)
  y, z = total(y0), total(z0)
  x = y + z
  tobs = testStatistic(y, z)
  # shift both samples to a common mean, so the null hypothesis holds
  yhat = [y1 - y.mu + x.mu for y1 in y.all]
  zhat = [z1 - z.mu + x.mu for z1 in z.all]
  bigger = 0.0
  for i in range(b):
    if testStatistic(total(sampleWithReplacement(yhat)),
                     total(sampleWithReplacement(zhat))) > tobs:
      bigger += 1
  return bigger / b < conf
"""
#### Examples
"""
def _bootstraped():
  def worker(n=1000,
             mu1=10, sigma1=1,
             mu2=10.2, sigma2=1):
    def g(mu, sigma):
      return random.gauss(mu, sigma)
    x = [g(mu1, sigma1) for i in range(n)]
    y = [g(mu2, sigma2) for i in range(n)]
    return n, mu1, sigma1, mu2, sigma2, \
        'different' if bootstrap(x, y) else 'same'
  # very different means, same std
  print(worker(mu1=10, sigma1=10,
               mu2=100, sigma2=10))
  # similar means and std
  print(worker(mu1=10.1, sigma1=1,
               mu2=10.2, sigma2=1))
  # slightly different means, same std
  print(worker(mu1=10.1, sigma1=1,
               mu2=10.8, sigma2=1))
  # difference in mu eaten by the large std
  print(worker(mu1=10.1, sigma1=10,
               mu2=10.8, sigma2=1))
"""
Output:
````
_bootstraped()
(1000, 10, 10, 100, 10, 'different')
(1000, 10.1, 1, 10.2, 1, 'same')
(1000, 10.1, 1, 10.8, 1, 'different')
(1000, 10.1, 10, 10.8, 1, 'same')
````
Warning: the above took 8 seconds to generate since we used 1000 bootstraps.
As to how many bootstraps are enough, that depends on the data. There are
results saying 200 to 400 are enough but, since I am a suspicious man, I run it 1000 times.
Which means that the runtimes associated with bootstrapping are a significant issue.
To reduce that runtime, I avoid things like an all-pairs comparison of all treatments
(see below: Scott-Knott). Also, BEFORE I do the bootstrap, I first run
the effect size test (and only go on to bootstrapping if the effect size test passes):
"""
def different(l1, l2):
  # return bootstrap(l1,l2) and a12(l2,l1)
  # run the cheap effect-size test first; only bootstrap if that passes.
  # a12() returns a number in [0,1], which is almost always truthy, so a
  # cutoff is assumed here (0.56 is the usual Vargha-Delaney threshold
  # for a non-small effect).
  return a12(l2, l1) > 0.56 and bootstrap(l1, l2)
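# A sketch of typical behavior (the second call is deterministic; the
# first also runs the randomized bootstrap, so it is merely near-certain):
#   different([0.21, 0.22, 0.23] * 30, [0.81, 0.82, 0.83] * 30)  # ==> True
#   different([0.21, 0.22, 0.23] * 30, [0.21, 0.22, 0.23] * 30)  # ==> False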
"""
## Saner Hypothesis Testing
The following code, which you can use verbatim, does the following:
+ All treatments are clustered into _ranks_. In practice, dozens
  of treatments end up generating just a handful of ranks.
+ The number of calls to the hypothesis tests is minimized:
    + Treatments are sorted by their median value.
    + Treatments are divided into two groups such that the
      expected value of the difference in the means, before and
      after the split, is maximized;
    + Hypothesis tests are called to check if the two groups are
      truly different.
+ All hypothesis tests are non-parametric and include (1) effect size tests
  and (2) tests for statistical significance;
+ Slow bootstraps are executed only if the faster _A12_ test passes.
In practice, this means that the hypothesis tests (with confidence of, say, 99%)
are called only a logarithmic number of times. So...
+ With this method, 16 treatments can be ranked using at most
  _1+2+4+8 = 15_ hypothesis tests (one per binary split), at a total
  confidence of _0.99<sup>15</sup> = 0.86_.
+ But if we did this with the 120 all-pairs comparisons of the 16 treatments,
  the total confidence would be _0.99<sup>120</sup> = 0.30_.
For examples on using this code, see _rdivDemo_ (below).
"""
def scottknott(data, cohen=0.3, small=3, useA12=False, epsilon=0.01):
  """Recursively split data, maximizing delta of
  the expected value of the mean before and
  after the splits.
  Reject splits with 'small' items or fewer."""
  all = reduce(lambda x, y: x + y, data)
  same = lambda l, r: abs(l.median() - r.median()) <= all.s() * cohen
  if useA12:
    same = lambda l, r: not different(l.all, r.all)
  big = lambda n: n > small
  return rdiv(data, all, minMu, big, same, epsilon)
def rdiv(data,      # a list of class Nums
         all,       # all the data combined into one num
         div,       # function: find the best split
         big,       # function: rejects small splits
         same,      # function: rejects similar splits
         epsilon):  # splits must differ by more than epsilon
  """Looks for ways to split sorted data,
  Recurses into each split. Assigns a 'rank' number
  to all the leaf splits found in this way.
  """
  def recurse(parts, all, rank=0):
    "Split, then recurse on each part."
    cut, left, right = maybeIgnore(div(parts, all, big, epsilon),
                                   same, parts)
    if cut:
      # if cut, rank "right" higher than "left"
      rank = recurse(parts[:cut], left, rank) + 1
      rank = recurse(parts[cut:], right, rank)
    else:
      # if no cut, then all get same rank
      for part in parts:
        part.rank = rank
    return rank
  recurse(sorted(data), all)
  return data
def maybeIgnore(cutLeftRight, same, parts):
  (cut, left, right) = cutLeftRight  # renamed from 2to3's 'xxx_todo_changeme'
  if cut:
    if same(sum(parts[:cut], Num('upto')),
            sum(parts[cut:], Num('above'))):
      cut = left = right = None
  return cut, left, right
def minMu(parts, all, big, epsilon):
  """Find a cut in the parts that maximizes
  the expected value of the difference in
  the mean before and after the cut.
  Reject splits that are insignificantly
  different or that generate very small subsets.
  """
  cut, left, right = None, None, None
  before, mu = 0, all.mu
  for i, l, r in leftRight(parts, epsilon):
    if big(l.n) and big(r.n):
      n = all.n * 1.0
      now = l.n / n * (mu - l.mu) ** 2 + r.n / n * (mu - r.mu) ** 2
      if now > before:
        before, cut, left, right = now, i, l, r
  return cut, left, right
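# A worked example of the split criterion (illustrative numbers only):
# for parts holding the values 2,2,10,10 we get mu=6, and the middle cut
# scores now = (2/4)*(6-2)**2 + (2/4)*(6-10)**2 = 16, which beats the
# uneven cut 2 | 2,10,10 that scores (1/4)*16 + (3/4)*(6-22/3)**2 = 5.33.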
def leftRight(parts, epsilon=0.01):
  """Iterator. For all items in 'parts',
  return everything to the left and everything
  from here to the end. For reasons of
  efficiency, take a first pass over the data
  to pre-compute and cache right-hand-sides
  """
  rights = {}
  n = j = len(parts) - 1
  while j > 0:
    rights[j] = parts[j]
    if j < n:
      rights[j] += rights[j + 1]
    j -= 1
  left = parts[0]
  for i, one in enumerate(parts):
    if i > 0:
      if parts[i]._median - parts[i - 1]._median > epsilon:
        yield i, left, rights[i]
      left += one
"""
## Putting it All Together
Driver for the demos:
"""
def rdivDemo(data):
  def z(x):
    return int(100 * (x - lo) / (hi - lo + 0.00001))
  data = [Num(lst[0], lst[1:]) for lst in data]  # a list, so it can be reused
  print("")
  ranks = []
  for x in scottknott(data, useA12=True):
    ranks += [(x.rank, x.median(), x)]
  all = []
  for _, __, x in sorted(ranks):
    all += x.all
  all = sorted(all)
  lo, hi = all[0], all[-1]
  line = "----------------------------------------------------"
  last = None
  print(('%4s , %12s , %s , %4s ' %
         ('rank', 'name', 'med', 'iqr')) + "\n" + line)
  for _, __, x in sorted(ranks):
    q1, q2, q3 = x.quartiles()
    print(('%4s , %12s , %0.2f , %0.2f ' %
           (x.rank + 1, x.name, x.median(), x.spread())) +
          xtile(x.all, lo=lo, hi=hi, width=30))
    last = x.rank
  return ranks
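# A minimal usage sketch (hypothetical data; in each row the treatment
# name comes first, followed by that treatment's numbers):
def _rdivDemo():
  rdivDemo([["x1"] + [0.34, 0.49, 0.51, 0.60] * 10,
            ["x2"] + [0.60, 0.70, 0.80, 0.90] * 10,
            ["x3"] + [0.15, 0.25, 0.40, 0.35] * 10])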
def fromFile(f="data.dat", rev=True, enough=0.66):
  """Utility for reading sample data from disk. Expects a treatment
  name followed by that treatment's numbers, e.g.
    x1  0.34  0.49  0.51  0.6
    x2  0.6   0.7   0.8   0.9
  """
  import re
  cache = {}
  num, space = r'^\+?-?[0-9]', r'[ \t\n]+'
  now = None
  with open(f) as lines:
    for line in lines:
      line = line.strip()
      if line:
        for word in re.split(space, line):
          if re.match(num, word):  # 'word', not 'word[0]', so '+1' matches
            cache[now] += [float(word)]
          else:
            now = word
            cache[now] = [now]  # the name itself is item one (see rdivDemo)
  rdivDemo(cache.values())
if __name__ == '__main__':
  fromFile()