Skip to content

Commit

Permalink
minor changes for new lexicon version
Browse files Browse the repository at this point in the history
  • Loading branch information
Harald Berthelsen committed May 11, 2017
1 parent 102a2c2 commit 6d80f4b
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 11 deletions.
6 changes: 3 additions & 3 deletions test.html
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,13 @@ <h5>SSML markup</h5>

The title of the movie is:
<phoneme alphabet="x-sampa"
ph="' l A ' v i - t @ ' E ' b E - l A">
ph="' l A . ' v i . t @ . ' E . ' b E . l A">
La vita è bella</phoneme>,
<!-- The IPA pronunciation is ˈlɑ ˈviːɾə ˈʔeɪ ˈbɛlə -->
(Life is beautiful),
which is directed by
<phoneme alphabet="x-sampa"
ph="r O - ' b r= - t O - b E - ' n I n - j i">
ph="r O . ' b r= . t O . b E . ' n I n . j i">
Roberto Benigni</phoneme>.
<!-- The IPA pronunciation is ɹəˈbɛːɹɾoʊ bɛˈniːnji -->

Expand Down Expand Up @@ -247,5 +247,5 @@ <h5>SSML markup</h5>
<audio id="audio_player"></audio>
<hr>
<address></address>
<!-- hhmts start -->Last modified: Thu Mar 30 16:21:38 CEST 2017 <!-- hhmts end -->
<!-- hhmts start -->Last modified: Thu May 11 13:12:46 CEST 2017 <!-- hhmts end -->
</body> </html>
41 changes: 34 additions & 7 deletions wikispeech_mockup/adapters/lexicon_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,37 @@
import wikispeech_mockup.config as config
import wikispeech_mockup.log as log

lexica = []

# Characters stripped from an orthography before lexicon lookup.
_ORTH_STRIP_CHARS = (
    "\xad"      # SOFT HYPHEN - a hidden character that causes problems in lookup
    "\u064b"    # ARABIC FATHATAN
    "\u064c"    # ARABIC DAMMATAN
    "\u064d"    # ARABIC KASRATAN
    "\u064e"    # ARABIC FATHA
    "\u064f"    # ARABIC DAMMA
    "\u0650"    # ARABIC KASRA
    "\u0651"    # ARABIC SHADDA
    "\u0652"    # ARABIC SUKUN
)

# Translation table mapping every stripped code point to None (= delete).
# Built once at import time instead of assembling a regex on every call.
_ORTH_STRIP_TABLE = dict.fromkeys(map(ord, _ORTH_STRIP_CHARS))


def cleanupOrth(orth):
    """Normalise a token orthography for lexicon lookup.

    The soft hyphen and the Arabic diacritics (tashkeel) listed in
    ``_ORTH_STRIP_CHARS`` are removed, and the result is lower-cased.

    TODO(review): the original author noted this is a bad place for the
    Arabic-specific handling and asked whether it belongs in the mapper.

    :param orth: the orthographic string of a token.
    :returns: the cleaned, lower-cased string.
    """
    # One pass deletes every unwanted character; lower-casing comes last,
    # matching the order of the previous implementation.
    cleaned = orth.translate(_ORTH_STRIP_TABLE).lower()

    log.debug("lexicon_client.cleanupOrth: %s -> %s" % (orth, cleaned))

    return cleaned

lexica = []
def loadLexicon(lexicon_name):
lexicon = Lexicon(lexicon_name)
lexica.append(lexicon)
Expand Down Expand Up @@ -55,8 +84,7 @@ def getOrth(tokenlist):
orthlist = []
for t in tokenlist:
orth = t["orth"]
#Remove soft hyphen if it occurs - it's a hidden character that causes problems in lookup
orth = orth.replace("\xad","")
orth = cleanupOrth(orth)
orthlist.append(orth)
return " ".join(orthlist)

Expand Down Expand Up @@ -97,10 +125,9 @@ def convertResponse(response_json):
def addTransFromResponse(tokenlist, responseDict):
for t in tokenlist:
orth = t["orth"]
#Remove soft hyphen if it occurs - it's a hidden character that causes problems in lookup
orth = orth.replace("\xad","")
if orth.lower() in responseDict:
ph = responseDict[orth.lower()]
orth = cleanupOrth(orth)
if orth in responseDict:
ph = responseDict[orth]
t["trans"] = ph
t["g2p_method"] = "lexicon"
else:
Expand Down
6 changes: 5 additions & 1 deletion wikispeech_mockup/voice_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@
"components":[
{
"module":"adapters.marytts_adapter",
"call":"marytts_preproc"
"call":"marytts_preproc",
"mapper": {
"from":"en-us_ws-sampa",
"to":"en-us_sampa_mary"
},
},
{
"module":"wikilex",
Expand Down

0 comments on commit 6d80f4b

Please sign in to comment.