Skip to content

Commit

Permalink
Merge pull request #131 from tmbdev/levenshtein
Browse files Browse the repository at this point in the history
[econf] Handle possibility that filtered GT is empty
  • Loading branch information
zuphilip authored Jan 23, 2017
2 parents ad8b9af + 952910e commit 7020a76
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 7 deletions.
2 changes: 2 additions & 0 deletions ocrolib/edist.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def xlevenshtein(a,b,context=1):
"""Calculates the Levensthein distance between a and b
and generates a list of differences by context."""
n, m = len(a), len(b)
assert m>0 # xlevenshtein should only be called with non-empty b string (ground truth)
if a == b: return 0,[] # speed up for the easy case
sources = empty((m+1,n+1),object)
sources[:,:] = None
dists = 99999*ones((m+1,n+1))
Expand Down
24 changes: 17 additions & 7 deletions ocropus-econf
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,19 @@ def process1(fname):
else:
missing = len(gt)
txt = ""
if args.confusion>0 or args.allconf is not None:
# xlevenshtein requires a non-empty ground truth, but after
# filtering (args.kind) the gt string may be empty.
if len(gt) == 0:
err = len(txt)
if(len(txt)>0):
cs = [(txt,'_'*len(txt))]
else:
cs = []
else:
err,cs = edist.xlevenshtein(txt,gt,context=args.context)
if args.confusion>0 or args.allconf is not None:
for u,v in cs:
counts[(u,v)] += 1
else:
err = edist.levenshtein(txt,gt)
#assert err==xerr
return fname,err,len(gt),missing,counts

outputs = ocrolib.parallel_map(process1,args.files,parallel=args.parallel,chunksize=10)
Expand Down Expand Up @@ -89,11 +95,15 @@ if allconf is not None: allconf.close()
sys.stderr.write("errors %8d\n"%errs)
sys.stderr.write("missing %8d\n"%missing)
sys.stderr.write("total %8d\n"%total)
sys.stderr.write("err %8.3f %%\n"%(errs*100.0/total))
sys.stderr.write("errnomiss %8.3f %%\n"%((errs-missing)*100.0/total))
if (total>0):
sys.stderr.write("err %8.3f %%\n"%(errs*100.0/total))
sys.stderr.write("errnomiss %8.3f %%\n"%((errs-missing)*100.0/total))

if args.confusion>0:
for (a,b),v in counts.most_common(args.confusion):
print("%d\t%s\t%s" % (v, a, b))

print(errs * 1.0 / total)
if (total>0):
print(errs * 1.0 / total)
else:
print("Nothing to compare")
13 changes: 13 additions & 0 deletions tests/run-unit
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python

from ocrolib import edist
def testLevenshtein(a, b, should):
    """Compare edist.levenshtein(a, b) with `should` and print one result line.

    a, b   -- the two strings handed to edist.levenshtein
    should -- the distance the call is expected to return

    Prints a line starting with 'ok - ' when the computed distance equals
    `should`, otherwise a line starting with 'not ok - ' that shows the
    distance actually computed next to the expected one. Returns None.
    """
    # Compute once so the failure message can report the real value.
    actual = edist.levenshtein(a, b)
    # A single parenthesised argument keeps print valid on Python 2 and 3.
    if actual == should:
        print('ok - levenshtein(%s, %s) == %s' % (a, b, should))
    else:
        print('not ok - levenshtein(%s, %s) == %s (expected %s)'
              % (a, b, actual, should))
# Identical strings: expected distance 0.
testLevenshtein('a', 'a', 0)
# Both strings empty: expected distance 0.
testLevenshtein('', '', 0)
# One side empty: expected distance equals the length of the other string.
testLevenshtein('a', '', 1)
testLevenshtein('', 'a', 1)
# Strings of length 2 and 6 made of the same character: expected distance 4.
testLevenshtein('aa', 'aaaaaa', 4)

0 comments on commit 7020a76

Please sign in to comment.