Skip to content

Commit

Permalink
Changed indentation.
Browse files Browse the repository at this point in the history
  • Loading branch information
mfelice committed May 17, 2015
1 parent f524e8c commit fc79fdf
Showing 1 changed file with 94 additions and 94 deletions.
188 changes: 94 additions & 94 deletions m2_to_ixml.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,45 +29,45 @@
### FUNCTIONS ###

def cluster_has_overlap(c, e):
    """Tell whether edit ``e`` overlaps any edit already in cluster ``c``.

    ``c`` is an iterable of edits and ``e`` is a single edit.  Every member
    of the cluster is compared with ``edit_has_overlap(e, member)``; the
    search stops at the first overlap.  Returns a bool (False for an empty
    cluster).
    """
    for member in c:
        if edit_has_overlap(e, member):
            return True
    return False

def edit_has_overlap(e1, e2):
    """Return True when the offset ranges of two edits overlap.

    Each edit is an indexable whose item 0 is the start offset and item 1
    is the end offset.  The rules are:

    * identical ranges always overlap;
    * a zero-width edit (start == end) overlaps a range only when it lies
      strictly inside that range, not on either boundary;
    * two non-empty ranges overlap when they share at least one position.
      This includes one range fully containing the other, in either
      argument order, so the result is symmetric.
    """
    start1, end1 = e1[0], e1[1]
    start2, end2 = e2[0], e2[1]
    if start1 == start2 and end1 == end2:
        return True
    if start1 == end1:
        # e1 is zero-width: it must fall strictly inside e2.
        return start2 < start1 < end2
    if start2 == end2:
        # e2 is zero-width: it must fall strictly inside e1.
        return start1 < start2 < end1
    # Standard half-open interval intersection; unlike an endpoint-in-range
    # test this also catches e1 strictly containing e2.
    return start1 < end2 and start2 < end1

def group_by_alternatives(cluster):
    """Split a cluster of edits into one list per annotator.

    The last item of every edit is used as the annotator key.  The edits
    are ordered by that key with a stable sort, so edits from the same
    annotator keep their relative order, and each run of equal keys
    becomes one group.

    Returns a list of lists of edits, ordered by ascending annotator key.
    The ``cluster`` argument itself is not reordered.
    """
    def annotator_of(edit):
        return edit[-1]

    # groupby only merges adjacent items, so sort by the same key first.
    # A sorted copy is used so the caller's list is left untouched.
    ordered = sorted(cluster, key=annotator_of)
    return [list(group) for _, group in groupby(ordered, annotator_of)]

def get_type(cluster):
    """Return the distinct error types of a cluster as a '/'-joined string.

    Item 2 of each edit is read as its type.  Duplicates are dropped and
    the remaining types are sorted, so the same cluster always yields the
    same string regardless of set iteration order.  An empty cluster
    gives an empty string.
    """
    distinct_types = set(edit[2] for edit in cluster)
    return '/'.join(sorted(distinct_types))

### MAIN ###

# Scan the command-line arguments for "-in:<path>" and "-out:<path>";
# the text after the prefix is taken as the file name.
for arg in sys.argv[1:]:
    if arg.startswith("-in:"):
        in_file = arg[4:]
    if arg.startswith("-out:"):
        out_file = arg[5:]

# Do we have what we need?  Without an input file, show the usage text
# and stop.
if not in_file:
    print(help_str)
    exit(0)

# Read gold standard annotations.
# The input file is opened for reading here, consumed line by line by the
# parsing loop below, and closed explicitly once that loop has finished.
f_in = open(in_file,"r")
Expand All @@ -76,85 +76,85 @@ def get_type(cluster):
# ref_annot[k] collects the edits of the k-th sentence; s is the index of
# the sentence currently being read (-1 until the first "S" line).
ref_annot = []
s = -1
for line in f_in:
    if line[0] == "S":
        # A line starting with "S" opens a new sentence record.
        s += 1
        # Get and save original sentence (everything after the first token)
        src_sents.append(line.split()[1:])
        annotators.append(set())
        ref_annot.append([])
    elif line[0] == "A":
        # Save annotations: fields are separated by "|||", and the first
        # field holds the marker followed by the start and end offsets.
        tokens = line.split("|||")
        coords = tokens[0].split()
        c_start = int(coords[1])
        c_end = int(coords[2])
        etype = tokens[1]
        # Uses only the first correction. Tokenise it just in case!
        correction = tokens[2].split("||")[0]
        correction = ' '.join(nltk.word_tokenize(correction))
        annotator = int(tokens[5])
        # The annotator is counted even when the annotation is a noop.
        annotators[s].add(annotator)
        # Noop --> empty set of edits (source is right), so nothing is
        # stored for it.
        is_noop = c_start == -1 and c_end == -1 and etype.lower() == "noop"
        if not is_noop:
            ref_annot[s].append([c_start, c_end, etype, correction, annotator])
f_in.close()

# Create the output XML; the default name is the input name plus a suffix.
if not out_file:
    out_file = in_file + ".ieval.xml"
f_out = XMLWriter(out_file, "UTF-8")
f_out.declaration()
f_out.start("scripts")
f_out.start("script", id="1") # Assume only one script

# Do clustering, one sentence at a time
for s, sentence_edits in enumerate(ref_annot):
    # Progress indicator, rewritten in place on the same console line
    sys.stdout.write("\rSentence %s..." % (s+1))
    sys.stdout.flush()
    clusters = []
    # Sort edits from longest to shortest range (start - end is the
    # negated length, so the widest edit comes first)
    sentence_edits.sort(key=lambda edit: edit[0] - edit[1])
    for e in sentence_edits: # Go through each edit
        for c in clusters:
            if cluster_has_overlap(c, e):
                # If the edit overlaps with an existing cluster, merge
                # it into the first such cluster
                c.append(e)
                break
        else:
            # If the edit couldn't be merged, create a new cluster
            clusters.append([e])
    # Sort clusters by the start and end offsets of their first edit
    clusters.sort(key=lambda cl: (cl[0][0], cl[0][1]))
    # Write to XML
    num_annotators = len(annotators[s])
    f_out.start("sentence", id=str(s+1), numann=str(num_annotators))
    f_out.element("text", ' '.join(src_sents[s]))
    f_out.start("error-list")

    # Clusters: each one becomes an <error> element, numbered from 1
    for error_id, cluster in enumerate(clusters, 1):
        alternatives = group_by_alternatives(cluster)
        # req is 'yes' only when there are as many alternatives as
        # annotators for this sentence
        if len(alternatives) == num_annotators:
            req_flag = 'yes'
        else:
            req_flag = 'no'
        f_out.start("error", id=str(error_id), type=get_type(cluster),
                    req=req_flag)
        # Alternatives: one <alt> per annotator group
        for alt in alternatives:
            f_out.start("alt", ann=str(alt[0][4]))
            # Corrections: one <c> per edit in the group
            for edit in alt:
                f_out.element("c", edit[3], start=str(edit[0]), end=str(edit[1]))
            f_out.end("alt")
        f_out.end("error")
    f_out.end("error-list")
    f_out.end("sentence")
f_out.end("script")
f_out.end("scripts")
# Finish the progress line with a newline
print("")

0 comments on commit fc79fdf

Please sign in to comment.