Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Permalink
fix ordinals and conjunctions in tts normalizer (#341)
Browse files Browse the repository at this point in the history
* fix ordinals and conjunctions in tts normalizer

* fix comment
  • Loading branch information
Spycsh authored Sep 20, 2023
1 parent be651be commit 0892f8a
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 7 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
Expand All @@ -17,6 +16,7 @@
# limitations under the License.

from num2words import num2words
import re

class EnglishNormalizer:
def __init__(self):
Expand Down Expand Up @@ -50,10 +50,10 @@ def __init__(self):
}

def correct_abbreviation(self, text):
# if one word is all capital letters, then correct this whole word
# TODO mixed abbreviation like i7 12th W3C should be supported
# TODO mixed abbreviation or proper noun like i7, ffmpeg, BTW should be supported

words = text.split()
# words = text.split() # CVPR-15 will be upper, but 1 and 5 will be split into two numbers
words = re.split(' |-|_', text)
results = []
for idx, word in enumerate(words):
if word.isupper(): # W3C is also upper
Expand All @@ -72,6 +72,7 @@ def correct_number(self, text):
results = []
prepositions_year = ["in", "on"]
prev = ""
ordinal_pattern = re.compile("^.*[0-9](st|nd|rd|th)$")
for idx, word in enumerate(words):
suffix = ""
if len(word) > 0 and word[-1] in [",", ".", "?", "!"]:
Expand All @@ -83,6 +84,7 @@ def correct_number(self, text):
if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \
and potential_year % 1000 != 0:
word = num2words(word, to="year")
word = word.replace("-", "") # nineteen eighty-seven => nineteen eightyseven
else:
word = num2words(word)
except Exception as e:
Expand All @@ -98,12 +100,14 @@ def correct_number(self, text):
except ValueError:
# print("not a number, fallback to original word")
pass

if ordinal_pattern.search(word):
word = num2words(word[:-2], to='ordinal').replace("-", " ")
word = word + suffix
results.append(word)
prev = word
results = " ".join(results)
# If the text does not end with punctuation (e.g. it was not truncated correctly by an early stop token), manually append a period.
if len(results) > 0 and results[-1] not in [",", ".", "?", "!"]:
results += "."
return results

return results
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,18 @@ def test_correct_abbreviation(self):
def test_correct_year(self):
# Checks how correct_number verbalizes a four-digit number that follows the
# preposition "In" ("1986") versus one that does not ("2000").
text = "In 1986, there are more than 2000 people participating in that party."
result = self.normalizer.correct_number(text)
# NOTE(review): the two assertions below expect different strings for the same
# result. This looks like a diff view showing the removed hyphenated expectation
# followed by its replacement -- confirm against the actual test file.
self.assertEqual(result, "In nineteen eighty-six, there are more than two thousand people participating in that party.")
# Expectation with the hyphen dropped from the year words ("eightysix").
self.assertEqual(result, "In nineteen eightysix, there are more than two thousand people participating in that party.")

def test_correct_ordinal(self):
# Checks that digit+suffix ordinals (st/nd/rd/th) are spelled out as words.
text = "1st 2nd 3rd 4th 5th 11th 12th 21st 22nd"
result = self.normalizer.correct_number(text)
# Compound ordinals are expected with a space instead of a hyphen
# ("twenty first"), and the output is expected to end with a period even
# though the input has none.
self.assertEqual(result, "first second third fourth fifth eleventh twelfth twenty first twenty second.")

def test_correct_conjunctions(self):
# Checks hyphen-joined "ABBREVIATION-number" tokens such as "CVPR-15".
text = "CVPR-15 ICML-21"
# Two-stage pipeline: abbreviations are handled first, then the output of that
# step is passed through number correction.
text = self.normalizer.correct_abbreviation(text)
result = self.normalizer.correct_number(text)
# Expected: the letters are spelled out, the joining hyphen is gone, the number
# is read as one cardinal ("fifteen", not "one five"), and a trailing period
# is present.
self.assertEqual(result, "cee vee pee ar fifteen I cee em el twenty-one.")

if __name__ == "__main__":
unittest.main()

0 comments on commit 0892f8a

Please sign in to comment.