From 54d2c3ba23db605cbb83b91d10ecae78e769b944 Mon Sep 17 00:00:00 2001
From: James Dunham <james@jdunham.io>
Date: Sun, 3 Sep 2017 15:10:52 -0400
Subject: [PATCH] Add handling of discontinuous annotations (brat >= 1.3).

Discontinuous annotations can be split into multiple annotations, one for each
fragment, or joined into a continuous annotation that starts with the first
fragment and ends with the last. This behavior is controlled by a new parameter
`split_discontinuous` whose default is `False` (i.e., joining discontinuous
annotations).
---
 src/brat_to_conll.py                  | 110 ++++++++++++++++++--------
 src/parameters.ini                    |   5 ++
 src/test/test-brat.ann                |   5 ++
 src/test/test-brat.txt                |   9 +++
 src/test/test-parameters-training.ini |   8 +-
 src/test_brat_to_conll.py             |  48 +++++++++++
 6 files changed, 153 insertions(+), 32 deletions(-)
 create mode 100644 src/test/test-brat.ann
 create mode 100644 src/test/test-brat.txt
 create mode 100644 src/test_brat_to_conll.py

diff --git a/src/brat_to_conll.py b/src/brat_to_conll.py
index cb2b62f6..b87cc014 100755
--- a/src/brat_to_conll.py
+++ b/src/brat_to_conll.py
@@ -66,41 +66,82 @@ def get_sentences_and_tokens_from_stanford(text, core_nlp):
         sentences.append(tokens)
     return sentences
 
-def get_entities_from_brat(text_filepath, annotation_filepath, verbose=False):
+def get_entities_from_brat(text_filepath, annotation_filepath, split_discontinuous, verbose=False):
     # load text
     with codecs.open(text_filepath, 'r', 'UTF-8') as f:
-        text =f.read()
+        text = f.read()
     if verbose: print("\ntext:\n{0}\n".format(text))
-
     # parse annotation file
-    entities = []
     with codecs.open(annotation_filepath, 'r', 'UTF-8') as f:
-        for line in f.read().splitlines():
-            anno = line.split()
-            id_anno = anno[0]
-            # parse entity
-            if id_anno[0] == 'T':
-                entity = {}
-                entity['id'] = id_anno
-                entity['type'] = anno[1]
-                entity['start'] = int(anno[2])
-                entity['end'] = int(anno[3])
-                entity['text'] = ' '.join(anno[4:])
-                if verbose:
-                    print("entity: {0}".format(entity))
-                # Check compatibility between brat text and anootation
-                if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(text[entity['start']:entity['end']]) != \
-                    utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
-                    print("Warning: brat text and annotation do not match.")
-                    print("\ttext: {0}".format(text[entity['start']:entity['end']]))
-                    print("\tanno: {0}".format(entity['text']))
-                # add to entitys data
-                entities.append(entity)
-    if verbose: print("\n\n")
-    
+        ann = f.read().splitlines()
+    entities = parse_brat_annotations(ann, text, split_discontinuous)
     return text, entities
 
-def check_brat_annotation_and_text_compatibility(brat_folder):
+def parse_brat_annotations(ann, text, split_discontinuous, verbose=False):
+    '''
+    Parse the contents of brat annotation files (.ann and .txt) for entities.
+
+    For compatibility with discontinuous annotations in brat >= 1.3, entity text
+    is from slicing the text per the annotation offsets, rather than from the
+    annotation reference text.
+
+    :param split_discontinuous: If True, split each discontinuous annotation
+    (brat >= 1.3) into separate annotations. If False, join the fragments into a
+    continuous annotation that starts with the first fragment and ends with the
+    last.
+    '''
+    ann = [line for line in ann if line[0] == 'T']
+    entities = []
+    for line in ann:
+        brat_id, entity_type, offsets, line_text = split_ann(line)
+        if split_discontinuous:
+            offsets = [(min(pair[0] for pair in offsets), max(pair[1] for pair in offsets))]
+        for start, end in offsets:
+            entity = {
+                    'id': brat_id,
+                    'type': entity_type,
+                    'start': start,
+                    'end': end,
+                    'text': text[start:end],
+                    }
+            entities.append(entity)
+    return entities
+
+def split_ann(line):
+    '''
+    Split a line from a brat .ann file into its components.
+
+    In a line from an .ann file that represents a text-bound annotation, a
+    sequential numeric ID prefixed with 'T' is followed by a tab, then an entity
+    type, a space, and at least one pair of space-delimited offsets.
+
+    Each of the offset pairs gives the range of a zero-indexed annotation span,
+    [start, end). With brat >= 1.3, annotations can be composed of discontinuous
+    "fragments." Multiple offset pairs are delimited by semicolons.
+
+    After the offset pair(s) and a tab comes the reference text. For
+    discontinous annotations, the reference text is the concatenation of the
+    fragments delimited by spaces. Note that this means that the annotated
+    entity as it appears in the text cannot necessarily be recovered from the
+    reference text.
+
+    See http://brat.nlplab.org/standoff.html.
+
+    Return:
+    - brat_id: brat annotation ID, e.g. 'T1'.
+    - entity_type: entity type, e.g. 'Org'.
+    - offsets: list of int offset tuples, e.g., [(0, 4)] for a continuous
+      annotation or [(0, 4), (6, 9)] for a discontinuous annotation with 2
+      fragments.
+    - line_text: reference text, e.g. 'Lorem ipsum'.
+    '''
+    brat_id, type_offsets, line_text = line.split('\t', maxsplit=2)
+    entity_type, offsets = type_offsets.split(maxsplit=1)
+    offsets = [pair.split() for pair in offsets.split(';')]
+    offsets = [(int(pair[0]), int(pair[1])) for pair in offsets]
+    return brat_id, entity_type, offsets, line_text
+
+def check_brat_annotation_and_text_compatibility(brat_folder, split_discontinuous):
     '''
     Check if brat annotation and text files are compatible.
     '''
@@ -113,10 +154,16 @@ def check_brat_annotation_and_text_compatibility(brat_folder):
         # check if annotation file exists
         if not os.path.exists(annotation_filepath):
             raise IOError("Annotation file does not exist: {0}".format(annotation_filepath))
-        text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
+        text, entities = get_entities_from_brat(text_filepath, annotation_filepath, split_discontinuous)
+        for entity in entities:
+            if utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(text[entity['start']:entity['end']]) != \
+                utils_nlp.replace_unicode_whitespaces_with_ascii_whitespace(entity['text']):
+                print('Warning: brat text and annotation do not match:')
+                print("\ttext: {0}".format(text[entity['start']:entity['end']]))
+                print("\tanno: {0}".format(entity['text']))
     print("Done.")
 
-def brat_to_conll(input_folder, output_filepath, tokenizer, language):
+def brat_to_conll(input_folder, output_filepath, tokenizer, language, split_discontinuous):
     '''
     Assumes '.txt' and '.ann' files are in the input_folder.
     Checks for the compatibility between .txt and .ann at the same time.
@@ -139,7 +186,8 @@ def brat_to_conll(input_folder, output_filepath, tokenizer, language):
         if not os.path.exists(annotation_filepath):
             codecs.open(annotation_filepath, 'w', 'UTF-8').close()
 
-        text, entities = get_entities_from_brat(text_filepath, annotation_filepath)
+        text, entities = get_entities_from_brat(text_filepath,
+                annotation_filepath, split_discontinuous)
         entities = sorted(entities, key=lambda entity:entity["start"])
         
         if tokenizer == 'spacy':
diff --git a/src/parameters.ini b/src/parameters.ini
index 17d9aade..0c65697d 100644
--- a/src/parameters.ini
+++ b/src/parameters.ini
@@ -122,5 +122,10 @@ reload_token_lstm = True
 reload_feedforward = True
 reload_crf = True
 
+# If split_discontinuous is True, then when reading datasets created with brat >= 1.3, split each discontinuous annotation into multiple annotations of the
+# same entity type, one for each fragment. If False, join the fragments of each discontinuous annotation into a continuous annotation starting with the first
+# fragment and ending with the last. Note that in brat >= 1.3, annotations spanning newlines are represented as discontinuous annotations.
+split_discontinuous = False
+
 parameters_filepath = ./parameters.ini
 
diff --git a/src/test/test-brat.ann b/src/test/test-brat.ann
new file mode 100644
index 00000000..fcd194c9
--- /dev/null
+++ b/src/test/test-brat.ann
@@ -0,0 +1,5 @@
+T1	Org 0 26	Lorem ipsum dolor sit amet
+T2	Org 299 304;305 314	purus convallis
+T3	Org 321 324;354 361	Nam sodales
+T4	Org 444 460	Aliquam lobortis
+T5	Org 555 559;570 576;587 601	Sed; tellus tempor; semper
diff --git a/src/test/test-brat.txt b/src/test/test-brat.txt
new file mode 100644
index 00000000..c0801aaf
--- /dev/null
+++ b/src/test/test-brat.txt
@@ -0,0 +1,9 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam quam ligula,
+faucibus ut dignissim eget, consequat sed urna. Mauris tortor erat, semper vel
+dolor a, euismod elementum metus. Aenean aliquet magna sed nibh consequat, a
+feugiat enim consequat. Nulla rhoncus metus nulla, ac sollicitudin purus
+convallis quis. Nam consequat nisi quis eleifend sodales. Integer luctus massa
+sit amet ex cursus elementum. Nam dictum ac sem a faucibus. Aliquam lobortis,
+ipsum mattis dignissim rhoncus, nisi lectus fringilla eros, at sagittis elit
+nisl eu mauris. Sed; fringilla tellus quis quam tempor; semper.
+
diff --git a/src/test/test-parameters-training.ini b/src/test/test-parameters-training.ini
index 425573a8..dce50a3b 100644
--- a/src/test/test-parameters-training.ini
+++ b/src/test/test-parameters-training.ini
@@ -114,4 +114,10 @@ reload_token_lstm = True
 reload_feedforward = True
 reload_crf = True
 
-parameters_filepath = ./parameters.ini
\ No newline at end of file
+# If split_discontinuous is True, then when reading datasets created with brat >= 1.3, split each discontinuous annotation into multiple annotations of the
+# same entity type, one for each fragment. If False, join the fragments of each discontinuous annotation into a continuous annotation starting with the first
+# fragment and ending with the last. Note that in brat >= 1.3, annotations spanning newlines are represented as discontinuous annotations.
+split_discontinuous = False
+
+parameters_filepath = ./parameters.ini
+
diff --git a/src/test_brat_to_conll.py b/src/test_brat_to_conll.py
new file mode 100644
index 00000000..898dd6f4
--- /dev/null
+++ b/src/test_brat_to_conll.py
@@ -0,0 +1,48 @@
+'''
+Tests for BRAT parsing.
+'''
+
+import unittest
+import os
+import brat_to_conll
+
+GOLD_EXPANDED = [
+        'Lorem ipsum dolor sit amet',
+        'purus\nconvallis',
+        'Nam consequat nisi quis eleifend sodales',
+        'Aliquam lobortis',
+        'Sed; fringilla tellus quis quam tempor; semper',
+        ]
+
+GOLD_SPLIT = [
+        'Lorem ipsum dolor sit amet',
+        'purus',
+        'convallis',
+        'Nam',
+        'sodales',
+        'Aliquam lobortis',
+        'Sed;',
+        'tellus',
+        'tempor; semper',
+        ]
+
+class TestBrat(unittest.TestCase):
+    test_folder = os.path.join(os.path.dirname(__file__), "test")
+    txt = os.path.join(test_folder, 'test-brat.txt')
+    ann = os.path.join(test_folder, 'test-brat.ann')
+
+    def test_fragments(self):
+        for expand, gold in zip([True, False], [GOLD_EXPANDED, GOLD_SPLIT]):
+            print('expand_fragments={}'.format(expand))
+            text, entities = brat_to_conll.get_entities_from_brat(self.txt,
+                    self.ann, expand)
+            self.assertTrue(len(entities) == len(gold))
+            for i, entity in enumerate(entities):
+                print('[parse {}] {}'.format(i, entity['text']))
+                print('[truth {}] {}'.format(i, gold[i]))
+                self.assertTrue(entity['text'] == gold[i])
+            print('')
+
+if __name__ == "__main__":
+    unittest.main()
+