-
Notifications
You must be signed in to change notification settings - Fork 474
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add handling of discontinuous annotations (brat >= 1.3).
Discontinuous annotations can be split into multiple annotations, one for each fragment, or joined into a continuous annotation that starts with the first fragment and ends with the last. This behavior is controlled by a new parameter `split_discontinuous` whose default is `False` (i.e., joining discontinuous annotations).
- Loading branch information
1 parent
9ad7789
commit 54d2c3b
Showing
6 changed files
with
153 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
T1 Org 0 26 Lorem ipsum dolor sit amet | ||
T2 Org 299 304;305 314 purus convallis | ||
T3 Org 321 324;354 361 Nam sodales | ||
T4 Org 444 460 Aliquam lobortis | ||
T5 Org 555 559;570 576;587 601 Sed; tellus tempor; semper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam quam ligula, | ||
faucibus ut dignissim eget, consequat sed urna. Mauris tortor erat, semper vel | ||
dolor a, euismod elementum metus. Aenean aliquet magna sed nibh consequat, a | ||
feugiat enim consequat. Nulla rhoncus metus nulla, ac sollicitudin purus | ||
convallis quis. Nam consequat nisi quis eleifend sodales. Integer luctus massa | ||
sit amet ex cursus elementum. Nam dictum ac sem a faucibus. Aliquam lobortis, | ||
ipsum mattis dignissim rhoncus, nisi lectus fringilla eros, at sagittis elit | ||
nisl eu mauris. Sed; fringilla tellus quis quam tempor; semper. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
''' | ||
Tests for BRAT parsing. | ||
''' | ||
|
||
import unittest | ||
import os | ||
import brat_to_conll | ||
|
||
# Entity texts that test_fragments expects when it passes True as the third
# argument to get_entities_from_brat. There is one entry per annotation line
# of the fixture; for the discontinuous annotations the entry runs from the
# start of the first fragment to the end of the last one, including the text
# (or line break) in between.
GOLD_EXPANDED = [ | ||
'Lorem ipsum dolor sit amet', | ||
'purus\nconvallis', | ||
'Nam consequat nisi quis eleifend sodales', | ||
'Aliquam lobortis', | ||
'Sed; fringilla tellus quis quam tempor; semper', | ||
] | ||
|
||
# Entity texts that test_fragments expects when it passes False as the third
# argument to get_entities_from_brat. There is one entry per fragment, so the
# five annotation lines of the fixture yield nine entries in total.
GOLD_SPLIT = [ | ||
'Lorem ipsum dolor sit amet', | ||
'purus', | ||
'convallis', | ||
'Nam', | ||
'sodales', | ||
'Aliquam lobortis', | ||
'Sed;', | ||
'tellus', | ||
'tempor; semper', | ||
] | ||
|
||
class TestBrat(unittest.TestCase):
    '''
    Checks the entity texts that brat_to_conll extracts from the BRAT
    fixture files (continuous and discontinuous annotations).
    '''

    # Keep unittest's own diff in the failure output next to the custom
    # message, whatever the default of the running Python version is.
    longMessage = True

    # The fixture files live in the "test" folder next to this module.
    test_folder = os.path.join(os.path.dirname(__file__), "test")
    txt = os.path.join(test_folder, 'test-brat.txt')
    ann = os.path.join(test_folder, 'test-brat.ann')

    def test_fragments(self):
        '''
        Parse the fixture once with the fragment flag set to True and once
        with False, and compare the entity texts with the matching gold list.
        '''
        # NOTE(review): the flag is passed positionally. The commit
        # description names the new parameter `split_discontinuous` (False =
        # join the fragments), whereas this test treats True as "expand/join".
        # Confirm against the signature of get_entities_from_brat.
        for expand, gold in zip([True, False], [GOLD_EXPANDED, GOLD_SPLIT]):
            # Only the entities are checked; the returned text is not needed.
            _, entities = brat_to_conll.get_entities_from_brat(
                self.txt, self.ann, expand)
            parsed = [entity['text'] for entity in entities]
            # A single list comparison covers both the length check and the
            # item-by-item check, and reports the differing entries on
            # failure instead of a bare "False is not true".
            self.assertEqual(parsed, gold,
                             msg='expand_fragments={}'.format(expand))
|
||
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__": | ||
unittest.main() | ||
|