""" Watch over a classifier making predictions. As each prediction (and actual) classification becomes available, send them to a logger class that incrementally calculates accuracy, recall, false alarm rate, precision, f, g etc. For example: a,b,c,d=list("abcd") log = abcd("data","rx") for want,got in [(a,b), (a,a), (a,c), (a,d), (b,a)]: log(want, got) log.report() This prints # db rx n a b c d acc pd pf prec f g class ---------------------------------------------------------------------------------------------------- # data rx 4 0 3 1 1 20 25 100 50 33 0 a # data rx 1 3 1 1 0 20 0 25 0 33 0 b # data rx 0 4 0 1 0 20 0 20 0 33 0 c # data rx 0 4 0 1 0 20 0 20 0 33 0 d ---------------------------------------------------------------------------------------------------- # data rx 2 2 1 1 0 20 10 53 20 33 0 (The last line is the weighted sum of the column above it.) If called from the command line, this code expects to read two words per line, for multiple lines. - Line1 mentions the data and treatment applied. - On all other lines, the words are first and section the actual and predicted values (respectively). E.g. cat <<EOF | python3 abcd.py data rx a b a a a c a d b a EOF This prints out the same report as above. ### Notes on Performance Measures Classifiers can be assessed according to the following measures: Example has class X +-------+-----+ | not X | X | +-----+-------+-----+ classifier predicts not X | no | a | b | +-----+-------+-----+ classifier predicts X | yes | c | d | +-----+-------+-----+ accuracy = acc = (a+d)/(a+b+c+d prob detection = pd = recall = d/(b+d) prob false alarm = pf = c/(a+c) precision = prec = d/(c+d) Ideally, detectors have high PDs, low PFs, and low effort. This ideal state rarely happens: - PD and effort are linked. The more modules that trigger the detector, the higher the PD. However, effort also gets increases - High PD or low PF comes at the cost of high PF or low PD (respectively). This linkage can be seen in a standard receiver operator curve (ROC). Suppose, for example, LOC> x is used as the detector (i.e. we assume large modules have more errors). LOC > x represents a family of detectors. At x=0, EVERY module is predicted to have errors. This detector has a high PD but also a high false alarm rate. At x=0, NO module is predicted to have errors. This detector has a low false alarm rate but won't detect anything at all. At 0<x<1, a set of detectors are generated as shown below: pd 1 | x x x KEY: | x . "." denotes the line PD=PF | x . "x" denotes the roc curve | x . for a set of detectors | x . | x . | x . |x . |x x------------------ pf 0 1 Note that: - The only way to make no mistakes (PF=0) is to do nothing (PD=0) - The only way to catch more detects is to make more mistakes (increasing PD means increasing PF). - Our detector bends towards the "sweet spot" of <PD=1,PF=0> but does not reach it. - The line pf=pd on the above graph represents the "no information" line. If pf=pd then the detector is pretty useless. The better the detector, the more it rises above PF=PD towards the "sweet spot". _____ ## Programmer's guide """ import sys,re class abcd: def __init__(i,db="all",rx="all"): "Initialize" i.db = str(db); i.rx=str(rx); i.yes = i.no = 0 i.known = {}; i.a= {}; i.b= {}; i.c= {}; i.d={} def __call__(i,actual=None,predict=None): "Incrementally update" i.knowns(actual) i.knowns(predict) if actual == predict: i.yes += 1 else : i.no += 1 for x in i.known: if actual == x: if predict == actual: i.d[x] += 1 else : i.b[x] += 1 else: if predict == x : i.c[x] += 1 else : i.a[x] += 1 def knowns(i,x): """ Ensure we know class `x`. If `x` is new, then we have to back date the "a" value (true negatives). """ if not x in i.known: i.known[x]= i.a[x]= i.b[x]= i.c[x]=i.d[x]=0.0 i.known[x] += 1 if (i.known[x] == 1): i.a[x] = i.yes + i.no def header(i): "Pretty print header" print("#", ('{0:20s} {1:11s} {2:4s} {3:4s} {4:4s}'+\ '{5:4s}{6:4s} {7:3s} {8:3s} {9:3s} '+ \ '{10:3s} {11:3s}{12:3s}{13:10s}').format( "db","rx","n","a","b","c","d","acc","pd", "pf","prec","f","g","class")) print('-'*100) def scores(i): "Computer the performance scores" # Convenience class. Can acces fields as x.f not x["f"]. class oo: def __init__(i, **adds): i.__dict__.update(adds) def p(y) : return int(100*y + 0.5) def n(y) : return int(y) out = {} ass=bs=cs=ds=accs=pds=pfs=precs=fs=gs=yess= 0 for x in i.known: pd = pf = pn = prec = g = f = acc = 0 a = i.a[x]; b= i.b[x]; c= i.c[x]; d= i.d[x] if (b+d) : pd = d / (b+d) if (a+c) : pf = c / (a+c) if (a+c) : pn = (b+d) / (a+c) if (c+d) : prec = d / (c+d) if (1-pf+pd): g = 2*(1-pf)*pd / (1-pf+pd) if (prec+pd): f = 2*prec*pd/(prec+pd) if (i.yes + i.no): acc= i.yes/(i.yes+i.no) out[x] = oo(db=i.db, rx=i.rx, yes= n(b+d), all=n(a+b+c+d), a=n(a), b=n(b), c=n(c), d=n(d), acc=p(acc), pd=p(pd), pf=p(pf), prec=p(prec), f=p(f), g=p(g),x=x) # computer weighted sums ratio = (c + d)/(i.yes + i.no) ass += a * ratio bs += b * ratio cs += c * ratio ds += d * ratio accs += acc * ratio pds += pd * ratio pfs += pf * ratio precs += prec * ratio fs += f * ratio gs += g * ratio out["__all__"] = oo( db=i.db, rx=i.rx, yes= n(yess), all=n(ass+bs+cs+ds), a=n(ass), b=n(bs), c=n(cs), d=n(ds), acc=p(accs), pd=p(pds), pf=p(pfs), prec=p(precs), f=p(fs), g=p(gs),x="__all__") return out def report(i,brief=False): """ Write the performance scores for each class, then the weighted sum of those scores across all classes. """ i.header() for x,s in sorted(i.scores().items()): if not brief: print("#", ('{0:20s} {1:10s} {2:4d} {3:4d} {4:4d}'+\ '{5:4d} {6:4d} {7:4d} {8:3d} {9:3d} '+ \ '{10:3d} {11:3d} {12:3d} {13:10s}').format( s.db, s.rx, s.yes, s.a, s.b, s.c, s.d, s.acc, s.pd, s.pf, s.prec, s.f, s.g, x)) # Tool for reading in the data from standard input. if __name__ == "__main__": log = None for line in sys.stdin: words= re.sub(r"[\n\r]","",line).split(" ") one,two= words[0],words[1] if log: log(one,two) else: log=abcd(one,two) log.report()