Skip to content

Commit

Permalink
Add handling of discontinuous annotations (brat >= 1.3).
Browse files Browse the repository at this point in the history
Discontinuous annotations can be split into multiple annotations, one for each
fragment, or joined into a continuous annotation that starts with the first
fragment and ends with the last. This behavior is controlled by a new parameter
`split_discontinuous` whose default is `False` (i.e., joining discontinuous
annotations).
  • Loading branch information
jamesdunham committed Sep 3, 2017
1 parent 9ad7789 commit 54d2c3b
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 32 deletions.
110 changes: 79 additions & 31 deletions src/brat_to_conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,41 +66,82 @@ def get_sentences_and_tokens_from_stanford(text, core_nlp):
sentences.append(tokens)
return sentences

def get_entities_from_brat(text_filepath, annotation_filepath, split_discontinuous, verbose=False):
    '''
    Load a brat document and the entities annotated in it.

    :param text_filepath: path to the UTF-8 encoded brat text (.txt) file.
    :param annotation_filepath: path to the UTF-8 encoded brat annotation
        (.ann) file.
    :param split_discontinuous: forwarded to parse_brat_annotations, which
        uses it to decide how discontinuous annotations (brat >= 1.3) are
        handled.
    :param verbose: if True, print the loaded text; the flag is also
        forwarded to parse_brat_annotations.
    :return: tuple (text, entities), where text is the full content of the
        text file and entities is the list returned by parse_brat_annotations.
    '''
    # load text
    with codecs.open(text_filepath, 'r', 'UTF-8') as f:
        text = f.read()
    if verbose: print("\ntext:\n{0}\n".format(text))

    # parse annotation file
    with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
        ann = f.read().splitlines()
    # Forward verbose so that the flag is honoured by the parser as well.
    entities = parse_brat_annotations(ann, text, split_discontinuous, verbose=verbose)
    return text, entities

def check_brat_annotation_and_text_compatibility(brat_folder):
def parse_brat_annotations(ann, text, split_discontinuous, verbose=False):
    '''
    Parse the contents of brat annotation files (.ann and .txt) for entities.

    For compatibility with discontinuous annotations in brat >= 1.3, entity text
    is from slicing the text per the annotation offsets, rather than from the
    annotation reference text.

    :param ann: iterable of lines from a brat .ann file. Only lines starting
        with 'T' (text-bound annotations) are used; all other lines, including
        blank ones, are ignored.
    :param text: content of the matching .txt file; annotation offsets index
        into this string.
    :param split_discontinuous: If True, split each discontinuous annotation
        (brat >= 1.3) into separate annotations, one per fragment, all sharing
        the same id and type. If False, join the fragments into a continuous
        annotation that starts with the first fragment and ends with the last.
    :param verbose: If True, print each entity as it is parsed.
    :return: list of dicts with keys 'id', 'type', 'start', 'end' and 'text'.
    '''
    entities = []
    for line in ann:
        # startswith() is safe on empty lines, unlike indexing line[0].
        if not line.startswith('T'):
            continue
        brat_id, entity_type, offsets, _reference_text = split_ann(line)
        if not split_discontinuous:
            # Collapse all fragments into the single span that covers them.
            first_start = min(fragment[0] for fragment in offsets)
            last_end = max(fragment[1] for fragment in offsets)
            offsets = [(first_start, last_end)]
        for start, end in offsets:
            entity = {
                'id': brat_id,
                'type': entity_type,
                'start': start,
                'end': end,
                'text': text[start:end],
            }
            if verbose:
                print("entity: {0}".format(entity))
            entities.append(entity)
    return entities

def split_ann(line):
    '''
    Split a line from a brat .ann file into its components.

    In a line from an .ann file that represents a text-bound annotation, a
    sequential numeric ID prefixed with 'T' is followed by a tab, then an entity
    type, a space, and at least one pair of space-delimited offsets.

    Each of the offset pairs gives the range of a zero-indexed annotation span,
    [start, end). With brat >= 1.3, annotations can be composed of discontinuous
    "fragments." Multiple offset pairs are delimited by semicolons.

    After the offset pair(s) and a tab comes the reference text. For
    discontinuous annotations, the reference text is the concatenation of the
    fragments delimited by spaces. Note that this means that the annotated
    entity as it appears in the text cannot necessarily be recovered from the
    reference text.

    See http://brat.nlplab.org/standoff.html.

    Return:
    - brat_id: brat annotation ID, e.g. 'T1'.
    - entity_type: entity type, e.g. 'Org'.
    - offsets: list of int offset tuples, e.g., [(0, 4)] for a continuous
      annotation or [(0, 4), (6, 9)] for a discontinuous annotation with 2
      fragments.
    - line_text: reference text, e.g. 'Lorem ipsum'; the empty string if the
      line carries no reference text field.

    Raise:
    - ValueError: if the line lacks the type/offsets field, if an offset
      fragment is not exactly two values, or if an offset is not an integer.
    '''
    # Split on tabs only: the reference text may itself contain spaces.
    fields = line.split('\t', 2)
    if len(fields) < 2:
        raise ValueError("Malformed brat annotation line: {0!r}".format(line))
    brat_id, type_offsets = fields[0], fields[1]
    line_text = fields[2] if len(fields) > 2 else ''

    type_and_offsets = type_offsets.split(None, 1)
    if len(type_and_offsets) != 2:
        raise ValueError("Missing entity type or offsets in brat annotation line: {0!r}".format(line))
    entity_type, raw_offsets = type_and_offsets

    offsets = []
    for fragment in raw_offsets.split(';'):
        bounds = fragment.split()
        if len(bounds) != 2:
            raise ValueError("Malformed offset pair {0!r} in brat annotation line: {1!r}".format(fragment, line))
        offsets.append((int(bounds[0]), int(bounds[1])))
    return brat_id, entity_type, offsets, line_text

def check_brat_annotation_and_text_compatibility(brat_folder, split_discontinuous):
'''
Check if brat annotation and text files are compatible.
'''
Expand All @@ -113,10 +154,16 @@ def check_brat_annotation_and_text_compatibility(brat_folder):
# check if annotation file exists
if not os.path.exists(annotation_filepath):
raise IOError("Annotation file does not exist: {0}".format(annotation_filepath))
text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
text, entities = get_entities_from_brat(text_filepath, annotation_filepath, split_discontinuous)
for entity in entities:
if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(text[entity['start']:entity['end']]) != \
utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
print('Warning: brat text and annotation do not match:')
print("\ttext: {0}".format(text[entity['start']:entity['end']]))
print("\tanno: {0}".format(entity['text']))
print("Done.")

def brat_to_conll(input_folder, output_filepath, tokenizer, language):
def brat_to_conll(input_folder, output_filepath, tokenizer, language, split_discontinuous):
'''
Assumes '.txt' and '.ann' files are in the input_folder.
Checks for the compatibility between .txt and .ann at the same time.
Expand All @@ -139,7 +186,8 @@ def brat_to_conll(input_folder, output_filepath, tokenizer, language):
if not os.path.exists(annotation_filepath):
codecs.open(annotation_filepath, 'w', 'UTF-8').close()

text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
text, entities = get_entities_from_brat(text_filepath,
annotation_filepath, split_discontinuous)
entities = sorted(entities, key=lambda entity:entity["start"])

if tokenizer == 'spacy':
Expand Down
5 changes: 5 additions & 0 deletions src/parameters.ini
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,10 @@ reload_token_lstm = True
reload_feedforward = True
reload_crf = True

# If split_discontinuous is True, then when reading datasets created with brat >= 1.3, split each discontinuous annotation into multiple annotations of the
# same entity type, one for each fragment. If False, join the fragments of each discontinuous annotation into a continuous annotation starting with the first
# fragment and ending with the last. Note that in brat >= 1.3, annotations spanning newlines are represented as discontinuous annotations.
split_discontinuous = False

parameters_filepath = ./parameters.ini

5 changes: 5 additions & 0 deletions src/test/test-brat.ann
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
T1 Org 0 26 Lorem ipsum dolor sit amet
T2 Org 299 304;305 314 purus convallis
T3 Org 321 324;354 361 Nam sodales
T4 Org 444 460 Aliquam lobortis
T5 Org 555 559;570 576;587 601 Sed; tellus tempor; semper
9 changes: 9 additions & 0 deletions src/test/test-brat.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam quam ligula,
faucibus ut dignissim eget, consequat sed urna. Mauris tortor erat, semper vel
dolor a, euismod elementum metus. Aenean aliquet magna sed nibh consequat, a
feugiat enim consequat. Nulla rhoncus metus nulla, ac sollicitudin purus
convallis quis. Nam consequat nisi quis eleifend sodales. Integer luctus massa
sit amet ex cursus elementum. Nam dictum ac sem a faucibus. Aliquam lobortis,
ipsum mattis dignissim rhoncus, nisi lectus fringilla eros, at sagittis elit
nisl eu mauris. Sed; fringilla tellus quis quam tempor; semper.

8 changes: 7 additions & 1 deletion src/test/test-parameters-training.ini
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,10 @@ reload_token_lstm = True
reload_feedforward = True
reload_crf = True

parameters_filepath = ./parameters.ini
# If split_discontinuous is True, then when reading datasets created with brat >= 1.3, split each discontinuous annotation into multiple annotations of the
# same entity type, one for each fragment. If False, join the fragments of each discontinuous annotation into a continuous annotation starting with the first
# fragment and ending with the last. Note that in brat >= 1.3, annotations spanning newlines are represented as discontinuous annotations.
split_discontinuous = False

parameters_filepath = ./parameters.ini

48 changes: 48 additions & 0 deletions src/test_brat_to_conll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
'''
Tests for BRAT parsing.
'''

import unittest
import os
import brat_to_conll

# Expected entity texts for the test fixture when each discontinuous
# annotation is joined into one span running from the start of its first
# fragment to the end of its last fragment. The second entry contains a
# newline because that span crosses a line break in the fixture text.
GOLD_EXPANDED = [
    'Lorem ipsum dolor sit amet',
    'purus\nconvallis',
    'Nam consequat nisi quis eleifend sodales',
    'Aliquam lobortis',
    'Sed; fringilla tellus quis quam tempor; semper',
]

# Expected entity texts for the same fixture when every fragment of a
# discontinuous annotation is returned as its own entity, in file order.
GOLD_SPLIT = [
    'Lorem ipsum dolor sit amet',
    'purus',
    'convallis',
    'Nam',
    'sodales',
    'Aliquam lobortis',
    'Sed;',
    'tellus',
    'tempor; semper',
]

class TestBrat(unittest.TestCase):
    '''Check entity extraction from the brat fixture files in the "test" folder.'''
    test_folder = os.path.join(os.path.dirname(__file__), "test")
    txt = os.path.join(test_folder, 'test-brat.txt')
    ann = os.path.join(test_folder, 'test-brat.ann')

    def test_fragments(self):
        '''
        Parse the fixture with both values of the third argument of
        get_entities_from_brat and compare the entity texts to the gold lists.
        '''
        # NOTE(review): the flag is passed positionally into the
        # split_discontinuous parameter, and True is paired here with the
        # joined ("expanded") gold list. The parameter documentation describes
        # True as splitting -- confirm which pairing is intended.
        for expand, gold in zip([True, False], [GOLD_EXPANDED, GOLD_SPLIT]):
            # subTest reports each flag value separately instead of stopping
            # at the first failing configuration.
            with self.subTest(expand_fragments=expand):
                _, entities = brat_to_conll.get_entities_from_brat(
                    self.txt, self.ann, expand)
                parsed = [entity['text'] for entity in entities]
                # Comparing whole lists checks both length and content and
                # shows a readable diff on failure.
                self.assertEqual(parsed, gold)

# Run the test suite when this module is executed as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 54d2c3b

Please sign in to comment.