minor changes for new lexicon version

stts-se · May 11, 2017 · 6d80f4b · 6d80f4b
1 parent 102a2c2
commit 6d80f4b
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 11 deletions.
diff --git a/test.html b/test.html
@@ -157,13 +157,13 @@ <h5>SSML markup</h5>
 
   The title of the movie is: 
   <phoneme alphabet="x-sampa"
-    ph="' l A ' v i - t @ ' E ' b E - l A"> 
+    ph="' l A . ' v i . t @ . ' E . ' b E . l A"> 
   La vita è bella</phoneme>,
   <!-- The IPA pronunciation is ˈlɑ ˈviːɾə ˈʔeɪ ˈbɛlə -->
   (Life is beautiful), 
   which is directed by 
   <phoneme alphabet="x-sampa"
-    ph="r O - ' b r= - t O - b E - ' n I n - j i"> 
+    ph="r O . ' b r= . t O . b E . ' n I n . j i"> 
   Roberto Benigni</phoneme>.
   <!-- The IPA pronunciation is ɹəˈbɛːɹɾoʊ bɛˈniːnji -->
 
@@ -247,5 +247,5 @@ <h5>SSML markup</h5>
 <audio id="audio_player"></audio>
 <hr>
 <address></address>
-<!-- hhmts start -->Last modified: Thu Mar 30 16:21:38 CEST 2017 <!-- hhmts end -->
+<!-- hhmts start -->Last modified: Thu May 11 13:12:46 CEST 2017 <!-- hhmts end -->
 </body> </html>
diff --git a/wikispeech_mockup/adapters/lexicon_client.py b/wikispeech_mockup/adapters/lexicon_client.py
@@ -3,8 +3,37 @@
 import wikispeech_mockup.config as config
 import wikispeech_mockup.log as log
 
-lexica = []
 
+def cleanupOrth(orth):
+    # Clean up one orthographic string for lexicon lookup: strip soft hyphens and Arabic diacritics, lower-case, and return the result.
+    orig = orth
+
+    #Remove soft hyphen (U+00AD) if it occurs - it's a hidden character that causes problems in lookup
+    orth = orth.replace("\xad","")
+
+    #Remove Arabic diacritics if they occur
+    #TODO: this is a bad place for language-specific stripping - should it live in the mapper instead?
+    FATHATAN         = '\u064b' 
+    DAMMATAN         = '\u064c' 
+    KASRATAN         = '\u064d' 
+    FATHA            = '\u064e' 
+    DAMMA            = '\u064f' 
+    KASRA            = '\u0650' 
+    SHADDA           = '\u0651' 
+    SUKUN            = '\u0652' 
+    # The eight marks defined above (U+064B..U+0652); every occurrence of any of them is deleted below.
+    TASHKEEL  = (FATHATAN,DAMMATAN,KASRATAN,FATHA,DAMMA,KASRA,SUKUN,SHADDA)
+    # NOTE(review): relies on `re` being imported at the top of the file - not visible in this hunk, confirm.
+    orth = re.sub("("+"|".join(TASHKEEL)+")","", orth)
+
+    # Lower-case here so callers can use the result as-is (addTransFromResponse uses it directly as the responseDict key).
+    orth = orth.lower()
+
+    log.debug("lexicon_client.cleanupOrth: %s -> %s" % (orig, orth))
+
+    return orth
+
+lexica = []
 def loadLexicon(lexicon_name):
     lexicon = Lexicon(lexicon_name)
     lexica.append(lexicon)
@@ -55,8 +84,7 @@ def getOrth(tokenlist):
     orthlist = []
     for t in tokenlist:
         orth = t["orth"]
-        #Remove soft hyphen if it occurs - it's a hidden character that causes problems in lookup
-        orth = orth.replace("\xad","")
+        orth = cleanupOrth(orth)
         orthlist.append(orth)
     return " ".join(orthlist)
 
@@ -97,10 +125,9 @@ def convertResponse(response_json):
 def addTransFromResponse(tokenlist, responseDict):
     for t in tokenlist:
         orth = t["orth"]
-        #Remove soft hyphen if it occurs - it's a hidden character that causes problems in lookup
-        orth = orth.replace("\xad","")
-        if orth.lower() in responseDict:
-            ph = responseDict[orth.lower()]
+        orth = cleanupOrth(orth)
+        if orth in responseDict:
+            ph = responseDict[orth]
             t["trans"] = ph
             t["g2p_method"] = "lexicon"
         else:

diff --git a/wikispeech_mockup/voice_config.py b/wikispeech_mockup/voice_config.py
@@ -41,7 +41,11 @@
      "components":[
          {
              "module":"adapters.marytts_adapter",
-             "call":"marytts_preproc"
+             "call":"marytts_preproc",
+             "mapper": {
+                 "from":"en-us_ws-sampa",
+                 "to":"en-us_sampa_mary"
+             },
          },
          {
              "module":"wikilex",