forked from clips/MBSP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.py
executable file
·184 lines (154 loc) · 8.25 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#### MEMORY-BASED SHALLOW PARSER ######################################################################
# Copyright (c) 2003-2010 University of Antwerp, Belgium and Tilburg University, The Netherlands
# Vincent Van Asch <vincent.vanasch@ua.ac.be>, Tom De Smedt <tom@organisms.be>
# License: GNU General Public License, see LICENSE.txt
#### CONFIG ##########################################################################################
# Settings for MBSP:
# - verbosity
# - ports to run servers on,
# - start servers when imported or not,
# - path to MBSP module,
# - path to TiMBL6, MBT3 and resource files,
# - character encoding to be used,
# - slash special character.
import os, sys, stat
#-----------------------------------------------------------------------------------------------------
# Parser verbosity.
# When True, more information is printed during startup.
verbose = True
#-----------------------------------------------------------------------------------------------------
# The servers (chunk / lemma / relation / PP-attachment) and their network ports.
# The order of the names below is the default startup order of the servers.
servers = [
    'chunk',
    'lemma',
    'relation',
    'preposition',
]
# Restart the servers after changing a port.
ports = [
    6061,
    6062,
    6063,
    6064,
]
#-----------------------------------------------------------------------------------------------------
# Host of each server, listed in the same order as the ports.
LOCALHOST = 'localhost'
hosts = [LOCALHOST] * 4
#-----------------------------------------------------------------------------------------------------
# Start the servers at localhost automatically when MBSP is imported?
# When False, either start the servers by hand or point the hosts setting
# to a machine where they are already running before using the parser.
autostart = True
# NOTE(review): autostop and timeout are not described here;
# presumably "stop the servers on exit" and a startup/connection timeout in seconds -- confirm in server.py.
autostop = False
timeout = 60
#-----------------------------------------------------------------------------------------------------
# Log the requests sent to the TiMBL and MBT servers (and the responses)?
# Logging costs extra memory and a few extra operations per server request,
# but there is a (small) chance that a request is already cached,
# in which case the server does not need to be contacted.
log = False
#-----------------------------------------------------------------------------------------------------
# TiMBL 6.3.0 + TiMBLServer 1.0.0 and later have stable support for concurrent server requests.
# Contacting a 6.3+ server with threading enabled can increase performance by 25% - 200%.
threading = False
#-----------------------------------------------------------------------------------------------------
# The folder that contains MBSP.
# The default is the folder in which config.py itself is located.
MODULE = os.path.dirname(os.path.abspath(__file__))
#-----------------------------------------------------------------------------------------------------
# When the servers run at localhost, these paths must be set as well:
# - timbl  : the local TiMBL executable,
# - mbt    : the local MBT executable,
# - mblem  : the MBLEM lemmatizer executable,
# - models : the folder with the training data.
paths = {
    'timbl': os.path.join(MODULE, 'timbl', 'Timbl'),
    'mbt': os.path.join(MODULE, 'mbt', 'Mbt'),
    'mblem': os.path.join(MODULE, 'mblem', 'mblem_english_bmt'),
    'models': os.path.join(MODULE, 'models'),
}
#-----------------------------------------------------------------------------------------------------
# Location of the Perl binary.
# Deprecated: every Perl dependency now has a pure-Python implementation.
perl = "/usr/bin/perl"
#-----------------------------------------------------------------------------------------------------
# The default string encoding for the parse() command:
encoding = "utf-8"
#-----------------------------------------------------------------------------------------------------
# In a tagged word, MBSP separates the tags with a /.
# A slash inside the word itself is encoded (e.g. hello/goodbye => hello/UH &slash;/SYM goodbye/NN).
SLASH = "&slash;"
#-----------------------------------------------------------------------------------------------------
# If you create your own models and start the servers you may want to change
# the settings used to start the TiMBL and MBT servers in the file server.py.
######################################################################################################
# You don't need to change anything below this line.
######################################################################################################
# Token tags (the long names are aliases of the short ones):
ALL = "all"
# The word as it appears in the sentence.
WORD = "word"
# The part-of-speech of the word (e.g. noun, adjective).
POS = PART_OF_SPEECH = "part-of-speech"
# The chunk tag of a group of words (e.g. noun phrase, verb phrase).
CHUNK = "chunk"
# Indicates a prepositional noun phrase (e.g. with a fork).
PNP = PREPOSITION = "preposition"
# Verb/argument tags (e.g. sentence subject, sentence object).
REL = RELATION = "relation"
# The chunk has prepositional noun phrases attached (e.g. eat => with a fork).
ANCHOR = "anchor"
# The base form of the word (e.g. mice => mouse).
LEMMA = "lemma"
# Common chunk tags:
NP, VP, PP = "NP", "VP", "PP"  # noun phrase, verb phrase, preposition
ADJP, ADVP = "ADJP", "ADVP"    # adjective phrase, adverb phrase
# NOTE(review): this constant is named DT and was described as "determiner",
# but its value is "PRT" (presumably the particle chunk tag) -- confirm which is intended.
DT = "PRT"
CC = "CC"                      # coordinating conjunction
# CHUNKS and PHRASES are two names for one and the same list object.
PHRASES = [NP, VP, PP, ADJP, ADVP, DT, CC]
CHUNKS = PHRASES
# Common role tags:
SBJ, OBJ, PRD, CLR = "SBJ", "OBJ", "PRD", "CLR"  # subject, object, predicate, closely related
DIR, EXT, LOC, PRP = "DIR", "EXT", "LOC", "PRP"  # direction, extent, location, purpose
ROLES = [SBJ, OBJ, PRD, CLR, DIR, EXT, LOC, PRP]
#-----------------------------------------------------------------------------------------------------
# True when running on Windows.
WINDOWS = sys.platform[:3] == 'win'
#-----------------------------------------------------------------------------------------------------
class adict(dict):
    """ A dictionary in which every key is available as attribute, i.e. adict.key => value.
        Setting an attribute stores it as a dictionary item: adict.key = value => adict[key] = value.
        Reading an attribute that is not a key raises an AttributeError.
    """
    def __setattr__(self, k, v):
        # Attributes are never stored on the instance itself, only as dictionary items.
        self[k] = v
    def __getattr__(self, k):
        # Only called when normal attribute lookup fails,
        # so dict methods (e.g. keys, items) take precedence over keys with the same name.
        if k in self:
            return self[k]
        # The call form of raise is valid in both Python 2 and Python 3.
        raise AttributeError("'adict' object has no attribute '%s'" % k)
# Events offer a simple way to customize MBSP.
# You can inject your own functions at various stages of the parsing process
# (for example, to alter tokenization or token tags).
# Every event starts out as None; assign a function to it to install a handler,
# e.g. events.parser.on_tokenize = my_function.
events = adict()
# Server event handlers take a Server object as input.
# Function format: handler(server) => None
events.server = adict.fromkeys([
    'on_register',  # When a server is registered in Servers, but before it is started.
    'on_start',     # When a server has successfully started.
    'on_stop',      # When a server has successfully stopped.
], None)
# Parser event handlers take the processed input string of MBSP.parse() at various stages.
# Function format: handler(tokenstring) => tokenstring
# The tokenstring.tags contains all tags parsed so far.
# Note: token tags cannot be added, deleted or reordered since the parser is still using them;
#       they can only be edited.
# Each key must be a plain identifier (no surrounding whitespace),
# otherwise the event cannot be reached as an attribute of events.parser.
events.parser = adict.fromkeys([
    'on_tokenize',               # When the string has been tokenized.
    'on_parse_tags_and_chunks',  # When the string has been parsed for part-of-speech and chunk tags.
    'on_parse_prepositions',     # When the string has been parsed for PNP-chunks.
    'on_parse_relations',        # When the string has been parsed for relation tags.
    'on_parse_pp_attachments',   # When the string has been parsed for PP-attachments.
    'on_lemmatize',              # When the string has been parsed for word lemmata.
], None)
#-----------------------------------------------------------------------------------------------------
def _executable(path):
    """ Makes the file at the given path executable, if it is not executable already.
        Nothing happens when the path is not an existing file,
        or when the current process is already allowed to execute it.
        Otherwise the file's permissions are set to rwxr-xr-x.
    """
    if not os.path.isfile(path) or os.access(path, os.X_OK):
        return
    # Read + write + execute for the owner, read + execute for group and others.
    permissions = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH
    os.chmod(path, permissions)
# Make sure the TiMBL, MBT and mblem binaries can be executed.
for _binary in ('timbl', 'mbt', 'mblem'):
    _executable(paths[_binary])
del _binary  # Do not leave the loop variable behind as a module attribute.