Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

fix ordinals and conjunctions in tts normalizer #341

Merged
merged 2 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
Expand All @@ -17,6 +16,7 @@
# limitations under the License.

from num2words import num2words
import re

class EnglishNormalizer:
def __init__(self):
Expand Down Expand Up @@ -50,10 +50,10 @@ def __init__(self):
}

def correct_abbreviation(self, text):
# if one word is all capital letters, then correct this whole word
# TODO mixed abbreviation like i7 12th W3C should be supported
hshen14 marked this conversation as resolved.
Show resolved Hide resolved
# TODO mixed abbreviation or proper noun like i7, ffmpeg, BTW should be supported

words = text.split()
# words = text.split() # CVPR-15 will be upper but 1 and 5 will be split into two numbers
words = re.split(' |-|_', text)
results = []
for idx, word in enumerate(words):
if word.isupper(): # W3C is also upper
Expand All @@ -72,6 +72,7 @@ def correct_number(self, text):
results = []
prepositions_year = ["in", "on"]
prev = ""
ordinal_pattern = re.compile("^.*[0-9](st|nd|rd|th)$")
for idx, word in enumerate(words):
suffix = ""
if len(word) > 0 and word[-1] in [",", ".", "?", "!"]:
Expand All @@ -83,6 +84,7 @@ def correct_number(self, text):
if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \
and potential_year % 1000 != 0:
word = num2words(word, to="year")
word = word.replace("-", "") # nineteen eighty-seven => nineteen eightyseven
else:
word = num2words(word)
except Exception as e:
Expand All @@ -98,12 +100,14 @@ def correct_number(self, text):
except ValueError:
# print("not a number, fallback to original word")
pass

if ordinal_pattern.search(word):
word = num2words(word[:-2], to='ordinal').replace("-", " ")
word = word + suffix
results.append(word)
prev = word
results = " ".join(results)
# If the text does not already end with punctuation (e.g. it was not cut off cleanly by the early-stop token), manually append a period.
if len(results) > 0 and results[-1] not in [",", ".", "?", "!"]:
results += "."
return results

return results
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,18 @@ def test_correct_abbreviation(self):
def test_correct_year(self):
    """Check that a year after "In" is read as a year and other numbers as cardinals."""
    text = "In 1986, there are more than 2000 people participating in that party."
    result = self.normalizer.correct_number(text)
    # The normalizer removes the hyphen from the year reading
    # ("eighty-six" -> "eightysix"), so only the hyphen-free form is expected;
    # the older hyphenated expectation contradicted it and could never pass.
    self.assertEqual(result, "In nineteen eightysix, there are more than two thousand people participating in that party.")

def test_correct_ordinal(self):
    """Check that ordinal tokens such as "1st" or "22nd" are spelled out as words."""
    ordinals = "1st 2nd 3rd 4th 5th 11th 12th 21st 22nd"
    # Compound ordinals are expected with a space instead of a hyphen,
    # and a trailing period is expected on the normalized text.
    expected = "first second third fourth fifth eleventh twelfth twenty first twenty second."
    self.assertEqual(self.normalizer.correct_number(ordinals), expected)

def test_correct_conjunctions(self):
    """Check that hyphen-joined tokens like "CVPR-15" are spelled out piece by piece."""
    # First pass spells out the upper-case abbreviations, second pass the numbers.
    spelled_out = self.normalizer.correct_abbreviation("CVPR-15 ICML-21")
    normalized = self.normalizer.correct_number(spelled_out)
    expected = "cee vee pee ar fifteen I cee em el twenty-one."
    self.assertEqual(normalized, expected)

if __name__ == "__main__":
unittest.main()
Loading