-
Notifications
You must be signed in to change notification settings - Fork 5
/
geniatagger.py
76 lines (59 loc) · 2.11 KB
/
geniatagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Copyright (c) 2018 Borna Bešić
#
# This software is released under the MIT License.
# https://opensource.org/licenses/MIT
import subprocess
import re
import os
class GENIATagger:
    '''GENIA tagger wrapper

    Spawns a tagger subprocess in the background.
    The communication is done through stdin & stdout pipes.
    Messages that executable writes to stderr are ignored.

    The object can be used as a context manager; leaving the 'with'
    block stops the subprocess.
    '''

    # Separator between the columns of one output line. Compiled once and
    # written as a raw string so that "\s" is not an invalid str escape.
    _FIELD_SEPARATOR = re.compile(r"\s+")

    # Number of columns in a well-formed output line:
    # word, base form, POS tag, chunk, named entity
    _FIELD_COUNT = 5

    def __init__(self, executable_path):
        '''
        Constructs a GENIA tagger wrapper object

        Arguments:
            - executable_path : str
                Path to the compiled GENIA tagger executable
        '''
        self.executable_path = os.path.abspath(executable_path)
        directory, executable = os.path.split(self.executable_path)

        # The executable is started from its own directory
        # (presumably it locates its data files relative to the working
        # directory - confirm against the tagger's documentation).
        self.process = subprocess.Popen(
            os.path.join(".", executable),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            cwd=directory,
        )

    def tag(self, text):
        '''
        A generator function that tags the specified text.

        Every non-blank line of the text is sent to the tagger on its own
        and the answer to that line is read back before the next line is
        sent. Sending several lines at once would leave all answers but
        the first one unread in the pipe and corrupt later calls.

        The generator is lazy: nothing is written to the subprocess until
        iteration starts, and it has to be consumed completely before
        tag() is called again, because unread output stays in the stdout
        pipe shared by all calls.

        Arguments:
            - text : str
                Text that will be tagged

        Yields:
            (word, base form, POS tag, chunk, named entity)
            tuple for each word in the specified text.
            Output lines that do not have exactly five fields are skipped.
        '''
        for sentence in text.split("\n"):
            sentence = sentence.rstrip("\r")  # tolerate CRLF line endings
            if not sentence.strip():
                # Nothing to tag on a blank line, so it is not sent at all.
                continue
            payload = sentence + os.linesep
            self.process.stdin.write(payload.encode("utf-8"))
            self.process.stdin.flush()
            yield from self._read_sentence()

    def _read_sentence(self):
        ''' Yields the rows of one answer, up to the first empty line. '''
        while True:
            line = self.process.stdout.readline().decode("utf-8").strip()
            if not line:
                # Empty line ends the answer; readline() also returns an
                # empty result once the stream has reached its end.
                return
            row = tuple(self._FIELD_SEPARATOR.split(line))
            if len(row) == self._FIELD_COUNT:
                yield row

    def stop(self):
        '''
        Terminates the subprocess running in the background.

        The stdin & stdout pipes are closed once the subprocess has
        exited so that their file descriptors are not leaked.

        Returns:
            Exit code of the subprocess, as reported by wait().
        '''
        self.process.terminate()
        try:
            return self.process.wait()
        finally:
            for pipe in (self.process.stdin, self.process.stdout):
                if pipe is not None:
                    pipe.close()

    def __enter__(self):
        ''' Enables the object to be used with the 'with' statement '''
        return self

    def __exit__(self, type, value, traceback):
        ''' Calls self.stop() after exiting the 'with' block '''
        self.stop()