-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate.py
executable file
·294 lines (259 loc) · 9.07 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env python
"""First-pass English stimulus generation.
This simply generates potential stimuli and codes their basic properties.
Subsequent steps will remove actual lexical items and balance the stimuli
according to our assumptions about grammaticality.
"""
import argparse
import csv
import dataclasses
import itertools
import logging
from typing import Iterator, List, Set
import citylex
# Output file names, one per stimulus type.
MONOSYLLABLES = "monosyllables.tsv"
DISYLLABLES = "disyllables.tsv"

# Header row shared by both output files; the `line` properties of
# Monosyllable and Disyllable emit their fields in this order.
COLUMNS = [
    "onset1", "nucleus1", "coda1",
    "onset2", "nucleus2", "coda2",
    "shape",
    "syllable.contact.code",
    "transcription",
]

# Onset inventories.
VOICELESS_STOPS = ["p", "t", "k"]
VOICED_STOPS = ["b", "d", "g"]
STOPS = [*VOICELESS_STOPS, *VOICED_STOPS]
STOPS_PLUS_S = [*STOPS, "s"]
# NOTE(review): the syllable generators in this file spell the rhotic "ɹ",
# not "r" -- confirm which symbol is intended before using this list.
LIQUID = ["l", "r"]
# Only two nasals here because ŋ does not occur word-initially.
NASALS = ["m", "n"]
SIMPLE_ONSETS = [*STOPS, *NASALS]
SIMPLE_ONSETS_PLUS_S = [*SIMPLE_ONSETS, "s"]

# Nucleus inventories.
# /ɔɪ/ is left out because it is pretty rare.
TENSE_NUCLEI = ["iː", "uː", "eɪ", "oʊ", "ɑ", "aɪ", "aʊ"]
# Some dialects have a tense variant of [æ]; it is treated as lax here.
LAX_NUCLEI = ["ɪ", "ʊ", "ɛ", "æ"]
NUCLEI = [*TENSE_NUCLEI, *LAX_NUCLEI]

# Coda inventories.
NASAL_CODAS = ["m", "n", "ŋ"]
STOP_CODAS = ["p", "t", "k"]
SIMPLE_CODAS = [*NASAL_CODAS, *STOP_CODAS]
SIMPLE_CODAS_PLUS_S = [*SIMPLE_CODAS, "s"]

# Place of articulation per segment; extend with further segments as needed.
PLACE = {
    # Voiceless stops.
    "p": "labial",
    "t": "coronal",
    "k": "velar",
    # Voiced stops.
    "b": "labial",
    "d": "coronal",
    "g": "velar",
    # Fricative.
    "s": "coronal",
    # Nasals.
    "m": "labial",
    "n": "coronal",
    "ŋ": "velar",
}
class Error(Exception):
    """Module-level exception type; a plain ``Exception`` subclass."""
@dataclasses.dataclass
class Monosyllable:
    """One candidate syllable, stored as onset, nucleus and coda strings.

    ``shape`` is a short template label for the syllable (the generators in
    this file pass values such as ``"CVC"``).
    """

    onset: str
    nucleus: str
    coda: str
    shape: str

    @property
    def transcription(self) -> str:
        """The onset, nucleus and coda concatenated in that order."""
        return "".join((self.onset, self.nucleus, self.coda))

    @property
    def line(self) -> List[str]:
        """The nine-cell output row for this syllable.

        The second-syllable cells and the syllable contact code do not apply
        to a monosyllable, so they are emitted as empty strings.
        """
        first_syllable = [self.onset, self.nucleus, self.coda]
        second_syllable = [""] * 3  # onset2, nucleus2, coda2
        contact_code = ""  # syllable.contact.code
        trailer = [self.shape, contact_code, self.transcription]
        return first_syllable + second_syllable + trailer
@dataclasses.dataclass
class Disyllable:
    """Two syllables in sequence, plus the coding of their junction."""

    syl1: Monosyllable
    syl2: Monosyllable

    @property
    def shape(self) -> str:
        """The two syllable shapes joined by a period."""
        return ".".join([self.syl1.shape, self.syl2.shape])

    @property
    def syllable_contact_code(self) -> str:
        """Code the contact between the first coda and the second onset.

        A nasal first coda is coded for nasal place agreement (``+NPA`` or
        ``-NPA``); any other coda is coded for obstruent voice assimilation
        (``+OVA`` or ``-OVA``).

        Raises:
            KeyError: in the nasal case, if the second onset is not a key
                of ``PLACE``.
        """
        first_coda = self.syl1.coda
        second_onset = self.syl2.onset
        if first_coda in NASAL_CODAS:
            # Nasal place agreement (see Gorman 2013:75).
            same_place = PLACE[first_coda] == PLACE[second_onset]
            return "+NPA" if same_place else "-NPA"
        # Obstruent voice assimilation (see Gorman 2013:74): /s/ and the
        # voiceless stops pattern together against everything else.
        coda_is_voiceless = first_coda == "s" or first_coda in VOICELESS_STOPS
        onset_is_voiceless = (
            second_onset == "s" or second_onset in VOICELESS_STOPS
        )
        if coda_is_voiceless == onset_is_voiceless:
            return "+OVA"
        return "-OVA"

    @property
    def transcription(self) -> str:
        """Both syllable transcriptions concatenated with no separator."""
        return self.syl1.transcription + self.syl2.transcription

    @property
    def line(self) -> List[str]:
        """The nine-cell output row for this disyllable."""
        row: List[str] = []
        for syllable in (self.syl1, self.syl2):
            row.extend([syllable.onset, syllable.nucleus, syllable.coda])
        row.extend(
            [self.shape, self.syllable_contact_code, self.transcription]
        )
        return row
def _borowsky_test(monosyllable: Monosyllable) -> bool:
    """Return True if a tense nucleus is followed by an angma coda.

    Borowsky claims that a tense nucleus cannot be followed by angma, so a
    True result marks a syllable that goes against that claim.
    """
    if monosyllable.coda != "ŋ":
        return False
    return monosyllable.nucleus in TENSE_NUCLEI
def _npa_test(disyllable: Disyllable) -> bool:
    """Return True if the first coda is /n/ and the second onset is velar.

    In that configuration a transcription cannot show whether nasal place
    assimilation has applied: assimilated /ŋk, ŋg/ and unassimilated
    /nk, ng/ would presumably be spelled the same.
    """
    first_coda = disyllable.syl1.coda
    second_onset = disyllable.syl2.onset
    return first_coda == "n" and second_onset in ("k", "g")
def _monosyllables() -> Iterator[Monosyllable]:
    """Yield every candidate monosyllable, one shape family at a time.

    The families come out in the order CVC, sCVC, CwVC, TlVC/TɹVC and
    CNVC/NCVC. In every family a syllable is skipped when its onset
    consonant (the stop, for cluster onsets) is identical to its coda.
    """
    # CVC.
    for onset, nucleus, coda in itertools.product(
        SIMPLE_ONSETS_PLUS_S, NUCLEI, SIMPLE_CODAS_PLUS_S
    ):
        if onset != coda:
            yield Monosyllable(onset, nucleus, coda, "CVC")

    # sCVC. An "sl" onset would be possible too, but little is gained.
    for stop, nucleus, coda in itertools.product(
        VOICELESS_STOPS, NUCLEI, SIMPLE_CODAS
    ):
        if stop != coda:
            yield Monosyllable("s" + stop, nucleus, coda, "sCVC")

    # CwVC; only lax nuclei are used for this shape.
    for stop, nucleus, coda in itertools.product(
        STOPS, LAX_NUCLEI, SIMPLE_CODAS_PLUS_S
    ):
        if stop != coda:
            yield Monosyllable(stop + "w", nucleus, coda, "CwVC")

    # T[liquid]VC: a coronal stop followed by each of the two liquids.
    for stop, nucleus, coda in itertools.product(
        ("t", "d"), NUCLEI, SIMPLE_CODAS_PLUS_S
    ):
        if stop == coda:
            continue
        yield Monosyllable(stop + "l", nucleus, coda, "TlVC")
        yield Monosyllable(stop + "ɹ", nucleus, coda, "TɹVC")

    # Stop+nasal and nasal+stop onsets.
    for stop, coda, nasal, nucleus in itertools.product(
        STOPS, STOP_CODAS, NASALS, NUCLEI
    ):
        if stop == coda:
            continue
        yield Monosyllable(stop + nasal, nucleus, coda, "CNVC")
        yield Monosyllable(nasal + stop, nucleus, coda, "NCVC")
def _disyllables() -> Iterator[Disyllable]:
    """Yield every candidate disyllable.

    Both nuclei are lax and, because they come from length-2 permutations,
    always differ from each other. For each nucleus pair the CVC.CVC items
    are yielded first, then the T[liquid]VC.CVC items. The second syllable
    is always CVC with a stop or /s/ onset.
    """

    def second_syllables(nucleus, first_coda, banned_onsets):
        # Second syllables whose onset is not banned and whose coda differs
        # from the first syllable's coda.
        for onset2 in STOPS_PLUS_S:
            if onset2 in banned_onsets:
                continue
            for coda2 in SIMPLE_CODAS_PLUS_S:
                if coda2 != first_coda:
                    yield Monosyllable(onset2, nucleus, coda2, "CVC")

    # We enforce lax v1, lax v2.
    for nucleus1, nucleus2 in itertools.permutations(LAX_NUCLEI, 2):
        # CVCCVC.
        for onset1, coda1 in itertools.product(
            SIMPLE_ONSETS_PLUS_S, SIMPLE_CODAS_PLUS_S
        ):
            if onset1 == coda1:
                continue
            first = Monosyllable(onset1, nucleus1, coda1, "CVC")
            for second in second_syllables(nucleus2, coda1, (onset1, coda1)):
                yield Disyllable(first, second)

        # T[liquid]VCCVC.
        for stop1, coda1 in itertools.product(
            ("t", "d"), SIMPLE_CODAS_PLUS_S
        ):
            if stop1 == coda1:
                continue
            firsts = [
                Monosyllable(stop1 + "l", nucleus1, coda1, "TlVC"),
                Monosyllable(stop1 + "ɹ", nucleus1, coda1, "TɹVC"),
            ]
            for second in second_syllables(nucleus2, coda1, (stop1, coda1)):
                # Each second syllable is paired with both liquid variants
                # before moving on to the next second syllable.
                for first in firsts:
                    yield Disyllable(first, second)
def main(args: argparse.Namespace) -> None:
    """Generate the stimuli and write them to the two TSV files.

    A lexicon of known pronunciations is built from ``citylex.textproto``
    (every ``wikipron_us_pron`` value with its spaces removed) plus, if
    given, the file named by ``args.extra_lexicon`` (one pronunciation per
    line). Candidates found in that lexicon are logged and left out.

    Monosyllables go to ``MONOSYLLABLES``; those flagged by
    ``_borowsky_test`` are dropped silently. Disyllables go to
    ``DISYLLABLES``; those flagged by ``_npa_test`` are dropped silently,
    and a disyllable is also excluded when either of its syllables is
    itself in the lexicon.

    All text files are opened as UTF-8 because the transcriptions are IPA
    and must not depend on the platform's default encoding. The output
    files are opened with ``newline=""`` as the ``csv`` module requires, so
    the writer's line terminator is not translated a second time.

    Args:
        args: Parsed command-line arguments; only ``extra_lexicon`` is read.
    """
    cl = citylex.read_textproto("citylex.textproto")
    lexicon: Set[str] = set()
    for entry in cl.entry.values():
        for pron in entry.wikipron_us_pron:
            lexicon.add(pron.replace(" ", ""))
    if args.extra_lexicon:
        with open(args.extra_lexicon, "r", encoding="utf-8") as source:
            for line in source:
                lexicon.add(line.rstrip())
    logging.info(f"{len(lexicon):,} lexicon entries")
    with open(MONOSYLLABLES, "w", encoding="utf-8", newline="") as sink:
        tsv_writer = csv.writer(sink, delimiter="\t")
        tsv_writer.writerow(COLUMNS)
        filtered = 0
        for entry in _monosyllables():
            if _borowsky_test(entry):
                continue
            if entry.transcription in lexicon:
                logging.info(f"{entry.transcription} is lexical")
                filtered += 1
                continue
            tsv_writer.writerow(entry.line)
    logging.info(f"{filtered:,} monosyllables filtered")
    with open(DISYLLABLES, "w", encoding="utf-8", newline="") as sink:
        tsv_writer = csv.writer(sink, delimiter="\t")
        tsv_writer.writerow(COLUMNS)
        filtered = 0
        for entry in _disyllables():
            if _npa_test(entry):
                continue
            # Inefficient surely, but harmless: the whole word and each of
            # its syllables are all checked against the lexicon.
            if any(
                part in lexicon
                for part in [
                    entry.syl1.transcription,
                    entry.syl2.transcription,
                    entry.transcription,
                ]
            ):
                logging.info(
                    f"{entry.transcription} or subsyllable is lexical"
                )
                filtered += 1
                continue
            tsv_writer.writerow(entry.line)
    logging.info(f"{filtered:,} disyllables filtered")
if __name__ == "__main__":
    # Log at INFO so the per-item and summary messages from main() show up.
    logging.basicConfig(level="INFO", format="%(levelname)s: %(message)s")
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        "--extra-lexicon",
        help="Optional list of pronunciations (one per line) to exclude",
    )
    cli_args = arg_parser.parse_args()
    main(cli_args)