From f32483221276412fc2a4571b94e1a119023bd24f Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 19 Sep 2023 00:00:49 -0700 Subject: [PATCH 1/2] fix ordinals and conjunctions in tts normalizer --- .../plugins/audio/utils/english_normalizer.py | 17 +++++++++++------ .../tests/audio/test_english_normalizer.py | 13 ++++++++++++- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py index bcf5d604d7f..a93e3fb88ee 100644 --- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py +++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- # @@ -17,6 +16,7 @@ # limitations under the License. from num2words import num2words +import re class EnglishNormalizer: def __init__(self): @@ -50,10 +50,10 @@ def __init__(self): } def correct_abbreviation(self, text): - # if one word is all capital letters, then correct this whole word - # TODO mixed abbreviation like i7 12th W3C should be supported + # TODO mixed abbreviation or proper noun like i7, ffmpeg, BTW should be supported - words = text.split() + # words = text.split() # CVPR-15 will be upper but 1 and 5 will be splitted to two numbers + words = re.split(' |-|_', text) results = [] for idx, word in enumerate(words): if word.isupper(): # W3C is also upper @@ -72,6 +72,7 @@ def correct_number(self, text): results = [] prepositions_year = ["in", "on"] prev = "" + ordinal_pattern = re.compile("^.*[0-9](st|nd|rd|th)$") for idx, word in enumerate(words): suffix = "" if len(word) > 0 and word[-1] in [",", ".", "?", "!"]: @@ -80,9 +81,11 @@ def correct_number(self, text): if word.isdigit(): # if word is positive integer, it must can be num2words try: potential_year = int(word) + # We ignore the preposition here for demo TODO fix it in a more elegant way! if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \ and potential_year % 1000 != 0: word = num2words(word, to="year") + word = word.replace("-", "") # nineteen eighty-seven => nineteen eightyseven else: word = num2words(word) except Exception as e: @@ -98,6 +101,9 @@ def correct_number(self, text): except ValueError: # print("not a number, fallback to original word") pass + + if ordinal_pattern.search(word): + word = num2words(word[:-2], to='ordinal').replace("-", " ") word = word + suffix results.append(word) prev = word @@ -105,5 +111,4 @@ def correct_number(self, text): # if the text is not truncated correctly by early stop token, then manually add one. if len(results) > 0 and results[-1] not in [",", ".", "?", "!"]: results += "." - return results - + return results \ No newline at end of file diff --git a/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py b/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py index d92e60a1dc6..2075030c454 100644 --- a/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py +++ b/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py @@ -40,7 +40,18 @@ def test_correct_abbreviation(self): def test_correct_year(self): text = "In 1986, there are more than 2000 people participating in that party." result = self.normalizer.correct_number(text) - self.assertEqual(result, "In nineteen eighty-six, there are more than two thousand people participating in that party.") + self.assertEqual(result, "In nineteen eightysix, there are more than two thousand people participating in that party.") + + def test_correct_ordinal(self): + text = "1st 2nd 3rd 4th 5th 11th 12th 21st 22nd" + result = self.normalizer.correct_number(text) + self.assertEqual(result, "first second third fourth fifth eleventh twelfth twenty first twenty second.") + + def test_correct_conjunctions(self): + text = "CVPR-15 ICML-21" + text = self.normalizer.correct_abbreviation(text) + result = self.normalizer.correct_number(text) + self.assertEqual(result, "cee vee pee ar fifteen I cee em el twenty-one.") if __name__ == "__main__": unittest.main() From a4aaf589db9ec63e26fc68435e3411be7f2c941b Mon Sep 17 00:00:00 2001 From: Spycsh Date: Tue, 19 Sep 2023 00:11:42 -0700 Subject: [PATCH 2/2] fix comment --- .../pipeline/plugins/audio/utils/english_normalizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py index a93e3fb88ee..22c73771d03 100644 --- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py +++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py @@ -81,7 +81,6 @@ def correct_number(self, text): if word.isdigit(): # if word is positive integer, it must can be num2words try: potential_year = int(word) - # We ignore the preposition here for demo TODO fix it in a more elegant way! if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \ and potential_year % 1000 != 0: word = num2words(word, to="year")