intel · hshen14 · Sep 20, 2023 · Sep 19, 2023 · Sep 19, 2023
diff --git a/...extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py b/...extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
@@ -17,6 +16,7 @@
 # limitations under the License.
 
 from num2words import num2words
+import re
 
 class EnglishNormalizer:
     def __init__(self):
@@ -50,10 +50,10 @@ def __init__(self):
         }
 
     def correct_abbreviation(self, text):
-        # if one word is all capital letters, then correct this whole word
-        # TODO mixed abbreviation like i7 12th W3C should be supported
+        # TODO mixed abbreviations or proper nouns like i7, ffmpeg, BTW should be supported
 
-        words = text.split()
+        # words = text.split()    # CVPR-15 will be upper but 1 and 5 will be split into two numbers
+        words = re.split(' |-|_', text)
         results = []
         for idx, word in enumerate(words):
             if word.isupper(): # W3C is also upper
@@ -72,6 +72,7 @@ def correct_number(self, text):
         results = []
         prepositions_year = ["in", "on"]
         prev = ""
+        ordinal_pattern = re.compile("^.*[0-9](st|nd|rd|th)$")
         for idx, word in enumerate(words):
             suffix = ""
             if len(word) > 0 and word[-1] in [",", ".", "?", "!"]:
@@ -83,6 +84,7 @@ def correct_number(self, text):
                     if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \
                           and potential_year % 1000 != 0:
                         word = num2words(word, to="year")
+                        word = word.replace("-", "") # nineteen eighty-seven => nineteen eightyseven
                     else:
                         word = num2words(word)
                 except Exception as e:
@@ -98,12 +100,14 @@ def correct_number(self, text):
                     except ValueError:
                         # print("not a number, fallback to original word")
                         pass
+
+            if ordinal_pattern.search(word):
+                word = num2words(word[:-2], to='ordinal').replace("-", " ")
             word = word + suffix
             results.append(word)
             prev = word
         results = " ".join(results)
         # if the text is not truncated correctly by early stop token, then manually add one.
         if len(results) > 0 and results[-1] not in [",", ".", "?", "!"]:
             results += "."
-        return results
-
+        return results
diff --git a/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py b/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py
@@ -40,7 +40,18 @@ def test_correct_abbreviation(self):
     def test_correct_year(self):
         text = "In 1986, there are more than 2000 people participating in that party."
         result = self.normalizer.correct_number(text)
-        self.assertEqual(result, "In nineteen eighty-six, there are more than two thousand people participating in that party.")
+        self.assertEqual(result, "In nineteen eightysix, there are more than two thousand people participating in that party.")
+
+    def test_correct_ordinal(self):
+        text = "1st 2nd 3rd 4th 5th 11th 12th 21st 22nd"
+        result = self.normalizer.correct_number(text)
+        self.assertEqual(result, "first second third fourth fifth eleventh twelfth twenty first twenty second.")
+
+    def test_correct_conjunctions(self):
+        text = "CVPR-15 ICML-21"
+        text = self.normalizer.correct_abbreviation(text)
+        result = self.normalizer.correct_number(text)
+        self.assertEqual(result, "cee vee pee ar fifteen I cee em el twenty-one.")
 
 if __name__ == "__main__":
     unittest.main()