diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py index bcf5d604d7f..22c73771d03 100644 --- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py +++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python # -*- coding: utf-8 -*- # @@ -17,6 +16,7 @@ # limitations under the License. from num2words import num2words +import re class EnglishNormalizer: def __init__(self): @@ -50,10 +50,10 @@ def __init__(self): } def correct_abbreviation(self, text): - # if one word is all capital letters, then correct this whole word - # TODO mixed abbreviation like i7 12th W3C should be supported + # TODO mixed abbreviation or proper noun like i7, ffmpeg, BTW should be supported - words = text.split() + # words = text.split() # CVPR-15 will be upper but 1 and 5 will be splitted to two numbers + words = re.split(' |-|_', text) results = [] for idx, word in enumerate(words): if word.isupper(): # W3C is also upper @@ -72,6 +72,7 @@ def correct_number(self, text): results = [] prepositions_year = ["in", "on"] prev = "" + ordinal_pattern = re.compile("^.*[0-9](st|nd|rd|th)$") for idx, word in enumerate(words): suffix = "" if len(word) > 0 and word[-1] in [",", ".", "?", "!"]: @@ -83,6 +84,7 @@ def correct_number(self, text): if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \ and potential_year % 1000 != 0: word = num2words(word, to="year") + word = word.replace("-", "") # nineteen eighty-seven => nineteen eightyseven else: word = num2words(word) except Exception as e: @@ -98,6 +100,9 @@ def correct_number(self, text): except ValueError: # print("not a number, fallback to original word") pass + + if ordinal_pattern.search(word): + word = num2words(word[:-2], to='ordinal').replace("-", " ") word = word + suffix results.append(word) prev = word @@ -105,5 +110,4 @@ def correct_number(self, text): # if the text is not truncated correctly by early stop token, then manually add one. if len(results) > 0 and results[-1] not in [",", ".", "?", "!"]: results += "." - return results - + return results \ No newline at end of file diff --git a/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py b/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py index d92e60a1dc6..2075030c454 100644 --- a/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py +++ b/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py @@ -40,7 +40,18 @@ def test_correct_abbreviation(self): def test_correct_year(self): text = "In 1986, there are more than 2000 people participating in that party." result = self.normalizer.correct_number(text) - self.assertEqual(result, "In nineteen eighty-six, there are more than two thousand people participating in that party.") + self.assertEqual(result, "In nineteen eightysix, there are more than two thousand people participating in that party.") + + def test_correct_ordinal(self): + text = "1st 2nd 3rd 4th 5th 11th 12th 21st 22nd" + result = self.normalizer.correct_number(text) + self.assertEqual(result, "first second third fourth fifth eleventh twelfth twenty first twenty second.") + + def test_correct_conjunctions(self): + text = "CVPR-15 ICML-21" + text = self.normalizer.correct_abbreviation(text) + result = self.normalizer.correct_number(text) + self.assertEqual(result, "cee vee pee ar fifteen I cee em el twenty-one.") if __name__ == "__main__": unittest.main()