From f32483221276412fc2a4571b94e1a119023bd24f Mon Sep 17 00:00:00 2001
From: Spycsh <sihan.chen@intel.com>
Date: Tue, 19 Sep 2023 00:00:49 -0700
Subject: [PATCH 1/2] fix ordinals and conjunctions in tts normalizer

---
 .../plugins/audio/utils/english_normalizer.py   | 17 +++++++++++------
 .../tests/audio/test_english_normalizer.py      | 13 ++++++++++++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py
index bcf5d604d7f..a93e3fb88ee 100644
--- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py
+++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py
@@ -1,4 +1,3 @@
-
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
@@ -17,6 +16,7 @@
 # limitations under the License.
 
 from num2words import num2words
+import re
 
 class EnglishNormalizer:
     def __init__(self):
@@ -50,10 +50,10 @@ def __init__(self):
         }
         
     def correct_abbreviation(self, text):
-        # if one word is all capital letters, then correct this whole word
-        # TODO mixed abbreviation like i7 12th W3C should be supported
+        # TODO: mixed abbreviations or proper nouns like i7, ffmpeg, BTW should be supported
 
-        words = text.split()
+        # words = text.split()    # CVPR-15 would be treated as upper-case, but 1 and 5 would be split into two separate numbers
+        words = re.split(' |-|_', text)
         results = []
         for idx, word in enumerate(words):
             if word.isupper(): # W3C is also upper
@@ -72,6 +72,7 @@ def correct_number(self, text):
         results = []
         prepositions_year = ["in", "on"]
         prev = ""
+        ordinal_pattern = re.compile("^.*[0-9](st|nd|rd|th)$")
         for idx, word in enumerate(words):
             suffix = ""
             if len(word) > 0 and word[-1] in [",", ".", "?", "!"]:
@@ -80,9 +81,11 @@ def correct_number(self, text):
             if word.isdigit(): # if word is positive integer, it must can be num2words
                 try:
                     potential_year = int(word)
+                    # We ignore the preposition here for demo TODO fix it in a more elegant way!
                     if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \
                           and potential_year % 1000 != 0:
                         word = num2words(word, to="year")
+                        word = word.replace("-", "") # nineteen eighty-seven => nineteen eightyseven
                     else:
                         word = num2words(word)
                 except Exception as e:
@@ -98,6 +101,9 @@ def correct_number(self, text):
                     except ValueError:
                         # print("not a number, fallback to original word")
                         pass
+
+            if ordinal_pattern.search(word):
+                word = num2words(word[:-2], to='ordinal').replace("-", " ")
             word = word + suffix
             results.append(word)
             prev = word
@@ -105,5 +111,4 @@ def correct_number(self, text):
         # if the text is not truncated correctly by early stop token, then manually add one.
         if len(results) > 0 and results[-1] not in [",", ".", "?", "!"]:
             results += "."
-        return results
-
+        return results
\ No newline at end of file
diff --git a/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py b/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py
index d92e60a1dc6..2075030c454 100644
--- a/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py
+++ b/intel_extension_for_transformers/neural_chat/tests/audio/test_english_normalizer.py
@@ -40,7 +40,18 @@ def test_correct_abbreviation(self):
     def test_correct_year(self):
         text = "In 1986, there are more than 2000 people participating in that party."
         result = self.normalizer.correct_number(text)
-        self.assertEqual(result, "In nineteen eighty-six, there are more than two thousand people participating in that party.")
+        self.assertEqual(result, "In nineteen eightysix, there are more than two thousand people participating in that party.")
+
+    def test_correct_ordinal(self):
+        text = "1st 2nd 3rd 4th 5th 11th 12th 21st 22nd"
+        result = self.normalizer.correct_number(text)
+        self.assertEqual(result, "first second third fourth fifth eleventh twelfth twenty first twenty second.")
+
+    def test_correct_conjunctions(self):
+        text = "CVPR-15 ICML-21"
+        text = self.normalizer.correct_abbreviation(text)
+        result = self.normalizer.correct_number(text)
+        self.assertEqual(result, "cee vee pee ar fifteen I cee em el twenty-one.")
 
 if __name__ == "__main__":
     unittest.main()

From a4aaf589db9ec63e26fc68435e3411be7f2c941b Mon Sep 17 00:00:00 2001
From: Spycsh <sihan.chen@intel.com>
Date: Tue, 19 Sep 2023 00:11:42 -0700
Subject: [PATCH 2/2] fix comment

---
 .../pipeline/plugins/audio/utils/english_normalizer.py           | 1 -
 1 file changed, 1 deletion(-)

diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py
index a93e3fb88ee..22c73771d03 100644
--- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py
+++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/utils/english_normalizer.py
@@ -81,7 +81,6 @@ def correct_number(self, text):
             if word.isdigit(): # if word is positive integer, it must can be num2words
                 try:
                     potential_year = int(word)
-                    # We ignore the preposition here for demo TODO fix it in a more elegant way!
                     if prev.lower() in prepositions_year and potential_year < 2999 and potential_year > 1000 \
                           and potential_year % 1000 != 0:
                         word = num2words(word, to="year")