-
Notifications
You must be signed in to change notification settings - Fork 1
/
rhymeFinder.py
431 lines (372 loc) · 18.7 KB
/
rhymeFinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
# syllables.py 1.1
#
# the Scandroid
# Copyright (C) 2005 Charles Hartman
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version. See the accompanying file, gpl.txt, for full
# details.
# OSI Certified Open Source Software
#
# This module implements the old Paul Holzer method (Byte, Feb 1986) to divide
# a word into syllables, but relies on regular expressions. (Parts of the old
# code remain mysterious; if I can get hold of the original Pascal...) When
# the class is declared its init compiles the REs. A program needs to know
# only about the central method Syllabize, which takes a one-word string and
# returns a list of syllable-strings, the stressed one upper-cased. (This
# interface is probably not much like the original.)
# I inserted code, not in Holzer, to deal with disyllabic vowel pairs. This
# led to rethinking the [yi]V combination: leave iV to disyl test, check for
# VyV (= a test for consonantal 'y'). This also screws up the old single-
# vowel-group shortcut, so I took it out.
# I did a lot of testing to try out new suffixes (and new kinds: ones with
# more than one syllable; ones that force stress to the preceding syllable);
# most didn't work well, but a couple make noticeable differences. Word-end
# indications are omitted from *most* suffixes so as to handle multiple-suffix
# word endings. The ones that remain are found only at word-end or, as in the
# case of '-er', cause too much trouble if we allow them earlier.
# A note on "vowels": there's always an ambiguity about 'y', which I've resolved
# more or less by hunch individually in each place.
import sys
import sre
#imports for rhyme finder
import pronouncing
import string
import re
import os
# Character classes and affix tables used by the Syllabizer.
# Non-letter characters inside these strings are encode()d letters
# (chr(ord(c) & 0x3F) — see encode() below): e.g. '4' is encoded 't',
# '0' encoded 'p', '%' encoded 'e', '2' encoded 'r', '5' encoded 'u'.
SIBILANTS = '40xzjgsc' # sibilant finals; the weird ones are encoded characters
MIDS = 'bdfgklmnpstw%0245' # consonants (some encoded) that can sit mid-word
MULTISUFFIX = ('ible', 'able') # suffixes counted as two syllables
STRESSSUFFIX = ('tion', 'sion', 'tiou', 'ciou', 'tious', 'cious', 'cion', 'gion', 'giou', 'gious') # suffixes that force stress onto the preceding syllable
PREFIXES = ('a', 'as', 'be', 'con', 'de', 'di', 'ex', 're', 'un', 'en') # usually-unstressed prefixes consulted by StressGuesser
# Module-level helpers for the special-combination character encoding: letters
# are masked out of the a-z range so the C/V scanner skips them, then restored
# later.  The handle* functions are re.sub callbacks applying that masking.

def encode(ch):
    """Mask a letter by clearing bit 6 (0x40), moving it out of the a-z range."""
    return chr(ord(ch) & 0x3F)

def decode(ch):
    """Undo encode(): set bit 6 again, restoring the lowercase letter."""
    return chr(ord(ch) | 0x40)

def handleCiV(match):
    """re.sub callback: encode the [st] and the 'i' of a CiV group, keep the vowel."""
    grp = match.group()
    return encode(grp[0]) + encode(grp[1]) + grp[2]

def handleCC(match):
    """re.sub callback: encode a special consonant pair; keep any third character."""
    grp = match.group()
    masked = encode(grp[0]) + encode(grp[1])
    if len(grp) > 2:
        masked += grp[2]
    return masked

def handleVyV(match):
    """re.sub callback: encode the consonantal 'y' sitting between two vowels."""
    grp = match.group()
    return grp[0] + encode(grp[1]) + grp[2]
class Syllabizer:
    """Split an English word into syllables and guess which one is stressed.

    Implements Paul Holzer's method (Byte, Feb 1986) using regular
    expressions.  The public entry point is Syllabize(word), which returns
    the word as a list of syllable strings with the stressed syllable
    upper-cased.  The remaining methods are internal pipeline stages that
    operate on the working copy in self.wd and the boundary list
    self.sylBounds.

    Fixes relative to the original:
      * the removed/deprecated 'sre' module is replaced by 're';
      * DivideCV compared an int with a Match object (always-true under
        Python 2 ordering, TypeError under Python 3), which silently
        suppressed all consonant-based syllable divisions.
    """

    def __init__(self):
        # Known suffixes, marked off as syllables before C/V scanning.
        # \b anchors are omitted from most so stacked suffixes are caught.
        self.suffixes = re.compile(r""" [^aeiouhr]y\b | er\b | age | est | ing |
            ness\b | less | ful | ment\b | time\b | [st]ion | [ia]ble\b | [ct]ial
            | [ctg]iou | [ctg]ious
            """, re.VERBOSE)
        # | ical\b | icle\b | ual\b | ism \b | [ae]ry\b # don't work (as 2-syl)
        # final syllable spelled with liquid or nasal and a silent 'e'
        self.liquidterm = re.compile(r" [^aeiouy] [rl] e \b", re.X)
        # the collection of special-character groups (see SpecialCodes)
        self.finalE = re.compile(r" [^aeiouy] e \b ", re.X)
        self.CiVcomb = re.compile(r" [st] i [aeiouy] ", re.X)
        self.CCpair = re.compile(r" [cgprstw] h | gn | gu[aeiouy] | qu | ck", re.X)
        self.VyVcomb = re.compile(r" [aeiou] y [aeiou]", re.X)
        # vowel pairs reliably disyllabic (not 'ui': 'juice' vs 'intuition';
        # some 'ue' missed ('constituent'), some 'oe' ('poem'))
        self.sylvowels = re.compile(r" [aeiu] o | [iu] a | iu", re.X)
        # divisions should fall before or after, not within, these pairs.
        # NOTE(review): a '|' looks missing between 'p [rlsn]' and 's [nml]'
        # in the inherited pattern; preserved as-is to keep behavior identical.
        self.splitLeftPairs = re.compile(r""" [bdfk%02] [rl] | g [rln] | [tw] r | p
            [rlsn] s [nml]""", re.X)

    def Syllabize(self, word):
        """Return `word` as a list of syllables, the stressed one upper-cased."""
        if len(word) < 3: return [word.upper()]    # 'ax' etc.: trivially one syllable
        self.wd = word.lower()
        self.sylBounds = []                        # indices where syllables divide
        self.Preliminaries()
        self.SpecialCodes()
        self.DivideCV()
        stressed = self.StressGuesser(word)        # 1-based index of stressed syl
        self.sylBounds.insert(0, 0)                # ease the calc of syllable indices
        self.sylBounds.append(len(word))           # within the word
        listOfSyls = []
        i = 0
        for s in self.sylBounds:
            if not s: continue                     # skip the leading sentinel 0
            i += 1
            if i != stressed:
                listOfSyls.append(word[self.sylBounds[i-1]:s])
            else:
                listOfSyls.append(word[self.sylBounds[i-1]:s].upper())
        return listOfSyls

    def Preliminaries(self):
        """Strip apostrophes and nonsyllabic plural/past endings; mark suffixes."""
        apostrophe = self.wd.find("\'", -2)        # only at end of word ('twas)
        if apostrophe != -1:                       # check if syllabic and remove
            if self.wd[-1] != '\'' and self.wd[-1] in 'se' and self.wd[-2] in SIBILANTS:
                self.sylBounds.append(apostrophe)
            self.wd = self.wd[:apostrophe]         # cut off ' or 's until last stage
        # cut final s/d from plurals/pasts if not syllabic
        self.isPast = self.isPlural = False        # defaults used also for suffixes
        if re.search(r"[^s]s\b", self.wd): self.isPlural = True   # terminal single s
        if re.search(r"ed\b", self.wd): self.isPast = True        # terminal 'ed'
        if self.isPast or self.isPlural: self.wd = self.wd[:-1]
        # final-syllable test works better *after* suffixes are cut off
        self.FindSuffix()
        # if final syllable is l/r+e, reverse letters for processing as syllable
        if len(self.wd) > 3 and self.liquidterm.search(self.wd):
            self.wd = self.wd[:-2] + self.wd[-1] + self.wd[-2]

    def FindSuffix(self):
        """Identify known suffixes, mark them off as syllables and possible stresses.

        Matches are listed backwards so the last is cut off first; STRESSSUFFIX
        entries force stress onto the preceding syllable, MULTISUFFIX entries
        count as two syllables.
        """
        self.numSuffixes = 0
        self.forceStress = 0
        resultslist = []
        for f in self.suffixes.finditer(self.wd):
            resultslist.append((f.group(), f.start()))
        if not resultslist: return
        # make sure *end* of word is in the list! otherwise, 'DESP erate'
        if resultslist[-1][1] + len(resultslist[-1][0]) < len(self.wd):
            return
        resultslist.reverse()
        for res in resultslist:
            # if no vowel left before, false suffix ('singing')
            # n.b.: will choke on 'quest' etc. -- dictionary material
            if not re.search('[aeiouy]', self.wd[:res[1]]): break
            if res[0] == 'ing' and self.wd[res[1]-1] == self.wd[res[1]-2]:
                self.sylBounds.append(res[1] - 1)  # doubled consonant + -ing
            else: self.sylBounds.append(res[1])    # sorted later
            self.wd = self.wd[:res[1]]
            self.numSuffixes += 1
            if res[0] in STRESSSUFFIX:
                self.forceStress = 0 - len(self.sylBounds)
            if res[0] in MULTISUFFIX:
                # it *happens* that the secondary division in all of these
                # comes after the first character; NOT inevitable!
                self.sylBounds.append(res[1]+1)
                self.numSuffixes += 1

    def SpecialCodes(self):
        """Encode character combinations so as to trick DivideCV.

        Encoding (*not* Unicode!) is done by the small module-level functions
        preceding the class.  Handles nonsyllabic final 'e' after a consonant,
        CiV groups, special consonant pairs, and consonantal VyV 'y'.
        """
        if re.search(r"[^aeiouy]e\b", self.wd):    # nonsyllabic final e after C
            if ((not self.isPlural or self.wd[-2] not in SIBILANTS) and (not
                        self.isPast or self.wd[-2] not in 'dt')):
                self.wd = self.wd[:-1] + encode(self.wd[-1])
            if not re.search(r"[aeiouy]", self.wd):  # any vowel left??
                self.wd = self.wd[:-1] + 'e'         # undo the encoding
        self.wd = self.CiVcomb.sub(handleCiV, self.wd)
        self.wd = self.CCpair.sub(handleCC, self.wd)
        self.wd = self.VyVcomb.sub(handleVyV, self.wd)

    def DivideCV(self):
        """Divide the word among C and V groups to fill the sylBounds list.

        e-with-grave-accent is counted as a syllabic vowel here ('an aged man'
        vs. 'aged beef').  Consonant clusters are split 1/2 or 2/1 according
        to length and the splitLeftPairs patterns.
        """
        unicodeVowels = u"[ae\N{LATIN SMALL LETTER E WITH GRAVE}iouy]+"
        uniConsonants = u"[^ae\N{LATIN SMALL LETTER E WITH GRAVE}iouy]+"
        firstvowel = re.search(unicodeVowels, self.wd)
        if firstvowel is not None:
            # BUG FIX: use the match *position*; the original compared the int
            # cc.start() with the Match object itself.
            firstvowelpos = firstvowel.start()
            for v in re.finditer(unicodeVowels, self.wd):
                lastvowel = v.end()                # replaced each group; last sticks
                disyllabicvowels = self.sylvowels.search(v.group())
                if disyllabicvowels:
                    self.sylBounds.append(v.start() + disyllabicvowels.start() + 1)
            for cc in re.finditer(uniConsonants, self.wd):
                # ignore leading and trailing consonant clusters
                if cc.start() < firstvowelpos or cc.end() >= lastvowel: continue
                numcons = len(cc.group())
                if numcons < 3: pos = cc.end() - 1   # before single C or betw. 2
                elif numcons > 3: pos = cc.end() - 2 # before penult C
                else:                                # 3 consonants: 1/2 or 2/1?
                    cg = cc.group()                  # our CCC cluster
                    if cg[-3] == cg[-2] or self.splitLeftPairs.search(cg):
                        pos = cc.end() - 2           # divide 1/2
                    else: pos = cc.end() - 1         # divide 2/1
                if not self.wd[pos-1].isalpha() and not self.wd[pos].isalpha():
                    self.sylBounds.append(pos-1)     # don't split two encoded chars
                else: self.sylBounds.append(pos)

    def StressGuesser(self, origword):
        """Locate the stressed syllable; return its *1*-based index.

        Uses Nessly's Default with hints from stress-forcing suffixes, a few
        prefixes, and the suffix count.  Imperfect by design; a real
        exceptions dictionary would always be needed.
        """
        numsyls = len(self.sylBounds) + 1
        if numsyls == 1: return 1
        self.sylBounds.sort()                      # suffixes may have come first
        if self.forceStress:                       # suffixes like 'tion', 'cious'
            return numsyls + self.forceStress
        if numsyls - self.numSuffixes == 1:        # pretty reliable
            return 1
        isprefix = self.wd[:self.sylBounds[0]] in PREFIXES
        if numsyls - self.numSuffixes == 2:        # Nessly w/ suffix twist
            if isprefix: return 2
            else: return 1
        elif isprefix and (numsyls - self.numSuffixes == 3):
            return 2
        else:   # Nessly: 3+ syls, stress penult if closed, else antepenult
            if origword[self.sylBounds[-1] - 1] not in 'aeiouy':
                retstress = numsyls - 1            # closed penult: stress it
            else: retstress = numsyls - 2          # else antepenult
            if self.numSuffixes == numsyls:
                retstress -= 1
            return retstress
#Start of Rhyme Finder
# Rhyme detector
# Outputs vector of rhyme pattern for a given poem
# Uses pronouncing library based on CMU dict used by other papers for computational analysis of rhyme
# Takes in poem as text file
# Stores line final words
# Finds their equivalent pronunciations from cmu dict
# Figures out which ones rhyme with which others
# Calculates distribution of rhyme patterns
#rhymeMatch returns true if the two patterns presented rhyme and false otherwise
#could be extended to match various rhyming patterns
def analyse(poemm, length):
    """Compute the rhyme-pattern count vector for one poem.

    poemm:  list of line strings making up the poem.
    length: running line count (retained for interface compatibility; ints
            are immutable, so the caller's value is never affected).
    Returns the 8-element rhyme-count vector from groupLinesByRhyme.
    """
    lastWords = []
    # BUG FIX: iterate over the parameter; the original read the module-level
    # global 'poem', which only worked by accident of the main loop's naming.
    for line in poemm:
        words = line.split()
        if words:                      # robustness: skip lines with no words
            lastWords.append(words[-1])
            length += 1
    # strip trailing punctuation and normalise case before dictionary lookup
    lastWords = [word.rstrip(string.punctuation).lower() for word in lastWords]
    # look up CMUdict pronunciations; '' marks a word with no entry
    # (single lookup per word instead of the original double call)
    pronunciations = []
    for word in lastWords:
        phones = pronouncing.phones_for_word(word)
        pronunciations.append(phones[0] if phones else '')
    # drop the stress digits, keeping letters and spaces (raw string avoids
    # the invalid-escape warning of the original '[^A-Za-z\W]+')
    pronunciations = [re.sub(r'[^A-Za-z\W]+', '', word) for word in pronunciations]
    completePronuns = handleMissingPronunciations(pronunciations, lastWords)
    poemRhyme = getLastSyllable(completePronuns)
    finalRhymeCount = groupLinesByRhyme(poemRhyme, [], [])
    return finalRhymeCount
def handleMissingPronunciations(ps, lastWords):
    """Fill in missing pronunciations by trying successively shorter tails.

    ps:        list of phone strings; '' marks a word CMUdict didn't know.
    lastWords: the words the pronunciations belong to (parallel list).
    Mutates and returns ps.  A word whose every tail is unknown keeps ''.
    (print statements converted to the print() function for Python 3.)
    """
    if '' in ps:
        print('Oh no! There are words whose pronunciation has not been found :( Here they are:')
        for p in range(0, len(ps)):
            if ps[p] == '':
                rubbishWord = lastWords[p]
                print(rubbishWord)
                # pass in smaller and smaller tails until one has a
                # pronunciation, then use that instead
                rhymeFound = False
                for i in range(1, len(rubbishWord)):
                    pronun = pronouncing.phones_for_word(rubbishWord[i:])
                    if len(pronun) > 0:
                        ps[p] = pronun[0]
                        print('found pronunciation: ', pronun)
                        rhymeFound = True
                    if rhymeFound:
                        break
    return ps
def getLastSyllable(pronuns):
    """Isolate the rhyme-relevant tail of each pronunciation string.

    For each phone string, collect everything from the final vowel cluster to
    the end, plus the single preceding character when it isn't a space (the
    syllable onset is deliberately ignored).  Returns the list of tails,
    parallel to `pronuns`; an empty pronunciation yields ''.

    BUG FIX: results are accumulated in a local list instead of the
    module-level global `poemRhyme`, so repeated calls no longer share state.
    """
    # 'Y', 'H' and 'W' count as vowels because of how CMUdict spells glides
    vowels = ['A', 'E', 'I', 'O', 'U', 'Y', 'H', 'W']
    lastSyllables = []
    for word in pronuns:
        pattern = []
        for c in range(len(word) - 1, 0, -1):     # scan backwards to the vowel
            if word[c] in vowels:
                pattern.insert(0, word[c])
                if not word[c-1] == ' ':          # keep one preceding phone char
                    pattern.insert(0, word[c-1])
                break
            else:
                pattern.insert(0, word[c])
        lastSyllables.append(''.join(pattern))
    return lastSyllables
def rhymeMatch(bucketRhyme, wordRhyme):
    """Return True when the two rhyme patterns match (currently: exact equality).

    Kept as a separate function so looser rhyme schemes can be plugged in
    later.  (Idiom fix: return the comparison directly instead of
    if/else True/False.)
    """
    return bucketRhyme == wordRhyme
def labelLinesWithRhyme(poemRhymes, groupedRhymes):
    """Assign each line a 1-based rhyme-group number.

    poemRhymes:    the per-line rhyme tails of the poem.
    groupedRhymes: the distinct patterns seen so far (mutated in place;
                   callers pass a fresh list).
    A line whose pattern is '' (unknown pronunciation) receives no label,
    so the result may be shorter than poemRhymes.
    """
    lineRhymeNumbers = []
    for rhyme in poemRhymes:
        label = None
        # look for an existing group with the same pattern
        for idx, bucket in enumerate(groupedRhymes):
            if bucket == rhyme:
                label = idx + 1
                break
        if label is not None:
            lineRhymeNumbers.append(label)
        elif rhyme != '':
            # unseen pattern: open a new group and label the line with it
            groupedRhymes.append(rhyme)
            lineRhymeNumbers.append(len(groupedRhymes))
    return lineRhymeNumbers
def groupLinesByRhyme(poemRhymes, groupedRhymes, rhymeCounts):
    """Count how many lines fall into each rhyme group.

    poemRhymes:    per-line rhyme tails of the poem.
    groupedRhymes: distinct patterns seen so far (mutated; callers pass []).
    rhymeCounts:   count accumulator (mutated and returned; callers pass []).
    At most 7 distinct patterns are tracked; every later new pattern is
    lumped into the 8th ('other') slot.  Lines with '' (unknown
    pronunciation) are ignored.
    """
    rhymeCounts.extend([0] * 8)   # replaces the original eight append(0) calls
    for rhyme in poemRhymes:      # iterate through the lines
        matched = False
        for j, bucket in enumerate(groupedRhymes):
            if bucket == rhyme:   # pattern already seen: count it there
                rhymeCounts[j] += 1
                matched = True
                break
        if not matched and rhyme != '':
            if len(groupedRhymes) == 7:
                # already 7 distinct patterns, so all new ones fall in 'other'
                rhymeCounts[7] += 1
            else:
                groupedRhymes.append(rhyme)
                rhymeCounts[len(groupedRhymes) - 1] += 1
    return rhymeCounts
def printResults(rhymes):
    """Append this poem's rhyme-count vector as one row of allPoems.csv.

    NOTE: mutates `rhymes` by appending a 0 class label (the class of poems
    being analysed) before writing.  str([...]).strip(punctuation) turns the
    list repr into a bare comma-separated row.
    BUG FIX: the original never closed the file handle; a with-statement
    guarantees it is flushed and closed.
    """
    rhymes.append(0)  # class label of the poems being analysed
    with open('allPoems.csv', 'a') as resultsFile:
        resultsFile.write(str(rhymes).strip(string.punctuation))
        resultsFile.write('\n')
# main program: read blank-line-separated poems from the file named on the
# command line, analyse each, and append its rhyme counts to allPoems.csv
poems = []
with open(sys.argv[1], 'r') as poemFile:
    print('Opened file...')
    i = 0
    poems.append([])
    for line in poemFile:
        if not line.isspace():
            poems[i].append(line)
        else:
            print('New poem...')
            poems.append([])
            i += 1
# (the original's explicit poemFile.close() removed: the with-statement
# already closed the file)
for poem in poems:
    if not poem:
        # consecutive or trailing blank lines create empty poem lists;
        # skip them instead of crashing on words[-1] of an empty line set
        continue
    print('Analysing new poem...')
    poemRhyme = []   # reset module-level state consulted by the helpers
    rhyme = []
    poemLength = 0
    pronunciations = []
    poemRhymeCount = analyse(poem, poemLength)
    printResults(poemRhymeCount)