From f521211a6a7f3a9c072d2bd91bd21fb448ff370b Mon Sep 17 00:00:00 2001
From: shavit
Date: Sat, 28 Sep 2024 08:04:09 -0400
Subject: [PATCH 1/4] Add normalizer type C to text cleaners

---
 TTS/tts/utils/text/cleaners.py         |  7 +++++++
 tests/text_tests/test_text_cleaners.py | 24 +++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index fc87025f00..2bf08bd055 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -6,6 +6,7 @@
 from typing import Optional
 
 from anyascii import anyascii
+from unicodedata import normalize
 
 from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
 
@@ -186,3 +187,9 @@ def no_cleaners(text: str) -> str:
     # remove newline characters
     text = text.replace("\n", "")
     return text
+
+
+def normalize_nfc(text: str) -> str:
+    """Canonical decomposition followed by canonical composition"""
+    text = normalize("NFC", text)
+    return text
diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py
index bf0c8d5d8a..ccd7bd7cda 100644
--- a/tests/text_tests/test_text_cleaners.py
+++ b/tests/text_tests/test_text_cleaners.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, phoneme_cleaners
+from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, phoneme_cleaners, normalize_nfc
 
 
 def test_time() -> None:
@@ -24,3 +24,25 @@ def test_expand_numbers() -> None:
 def test_multilingual_phoneme_cleaners() -> None:
     assert multilingual_phoneme_cleaners("(Hello)") == "Hello"
     assert multilingual_phoneme_cleaners("1:") == "1,"
+
+
+def test_normalize_nfc() -> None:
+    test_cases = [
+        ("Häagen-Dazs", "Häagen-Dazs"),
+        ("你好!", "你好!"),
+        ("𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼", "𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼"),
+        ("é", "é"),
+        ("e\u0301", "é"),
+        ("a\u0300", "à"),
+        ("a\u0327", "a̧"),
+        ("na\u0303", "nã"),
+        ("o\u0302u", "ôu"),
+        ("n\u0303", "ñ"),
+        (u"\u4E2D\u56FD", u"中国"),
+        (u"niño", u"niño"),
+        (u"a\u0308", u"ä"),
+        (u"\u3053\u3093\u306b\u3061\u306f", u"こんにちは"),
+        (u"\u03B1\u03B2", u"αβ")
+    ]
+    for arg, expect in test_cases:
+        assert normalize_nfc(arg) == expect
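Context for the patch above: NFC ("Normalization Form C", the "type C" in the subject) applies canonical decomposition followed by canonical composition, so visually identical strings collapse to a single code-point sequence. A minimal standalone sketch of the behavior normalize_nfc() relies on (plain Python stdlib, independent of this package):

    from unicodedata import normalize

    # "é" typed as base letter + combining acute accent is two code points
    decomposed = "e\u0301"
    # NFC recomposes it into the single precomposed code point U+00E9
    composed = normalize("NFC", decomposed)
    assert len(decomposed) == 2 and len(composed) == 1
    assert composed == "\u00e9"
    # Already-composed input passes through unchanged
    assert normalize("NFC", "é") == "é"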
From 636ea59761ccc05b6e6e57da21571f5747cca5de Mon Sep 17 00:00:00 2001
From: shavit
Date: Mon, 30 Sep 2024 10:55:11 -0400
Subject: [PATCH 2/4] Linter recommendations

---
 TTS/tts/utils/text/cleaners.py         | 8 +++-----
 tests/text_tests/test_text_cleaners.py | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 2bf08bd055..58c8747c21 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -1,12 +1,10 @@
 """Set of default text cleaners"""
 
-# TODO: pick the cleaner for languages dynamically
-
 import re
 from typing import Optional
+from unicodedata import normalize
 
 from anyascii import anyascii
-from unicodedata import normalize
 
 from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text
 
@@ -189,7 +187,7 @@ def no_cleaners(text: str) -> str:
     return text
 
 
-def normalize_nfc(text: str) -> str:
-    """Canonical decomposition followed by canonical composition"""
+def normalize_unicode(text: str) -> str:
+    """Normalize Unicode characters."""
     text = normalize("NFC", text)
     return text
diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py
index ccd7bd7cda..dbd2e603b5 100644
--- a/tests/text_tests/test_text_cleaners.py
+++ b/tests/text_tests/test_text_cleaners.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, phoneme_cleaners, normalize_nfc
+from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, normalize_nfc, phoneme_cleaners
 
 
 def test_time() -> None:

From 41b0f4caa79ad363de10e4ce143c8df6a07f182c Mon Sep 17 00:00:00 2001
From: shavit
Date: Mon, 30 Sep 2024 10:58:02 -0400
Subject: [PATCH 3/4] Add unicode normalize to every cleaner

---
 TTS/tts/utils/text/cleaners.py         | 11 +++++++++++
 tests/text_tests/test_text_cleaners.py |  6 +++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 58c8747c21..f496b9f0dd 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -82,6 +82,7 @@ def replace_symbols(text: str, lang: Optional[str] = "en") -> str:
 
 def basic_cleaners(text: str) -> str:
     """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+    text = normalize_unicode(text)
     text = lowercase(text)
     text = collapse_whitespace(text)
     return text
@@ -89,6 +90,7 @@ def basic_cleaners(text: str) -> str:
 def transliteration_cleaners(text: str) -> str:
     """Pipeline for non-English text that transliterates to ASCII."""
+    text = normalize_unicode(text)
     # text = convert_to_ascii(text)
     text = lowercase(text)
     text = collapse_whitespace(text)
     return text
@@ -97,6 +99,7 @@ def transliteration_cleaners(text: str) -> str:
 
 def basic_german_cleaners(text: str) -> str:
     """Pipeline for German text"""
+    text = normalize_unicode(text)
     text = lowercase(text)
     text = collapse_whitespace(text)
     return text
@@ -105,6 +108,7 @@ def basic_german_cleaners(text: str) -> str:
 # TODO: elaborate it
 def basic_turkish_cleaners(text: str) -> str:
     """Pipeline for Turkish text"""
+    text = normalize_unicode(text)
     text = text.replace("I", "ı")
     text = lowercase(text)
     text = collapse_whitespace(text)
@@ -113,6 +117,7 @@ def basic_turkish_cleaners(text: str) -> str:
 
 def english_cleaners(text: str) -> str:
     """Pipeline for English text, including number and abbreviation expansion."""
+    text = normalize_unicode(text)
     # text = convert_to_ascii(text)
     text = lowercase(text)
     text = expand_time_english(text)
@@ -130,6 +135,7 @@ def phoneme_cleaners(text: str) -> str:
 
     NB: This cleaner converts numbers into English words, for other languages use multilingual_phoneme_cleaners().
     """
+    text = normalize_unicode(text)
    text = en_normalize_numbers(text)
     text = expand_abbreviations(text)
     text = replace_symbols(text)
@@ -140,6 +146,7 @@ def phoneme_cleaners(text: str) -> str:
 
 def multilingual_phoneme_cleaners(text: str) -> str:
     """Pipeline for phonemes mode, including number and abbreviation expansion."""
+    text = normalize_unicode(text)
     text = replace_symbols(text, lang=None)
     text = remove_aux_symbols(text)
     text = collapse_whitespace(text)
@@ -148,6 +155,7 @@ def multilingual_phoneme_cleaners(text: str) -> str:
 
 def french_cleaners(text: str) -> str:
     """Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
+    text = normalize_unicode(text)
     text = expand_abbreviations(text, lang="fr")
     text = lowercase(text)
     text = replace_symbols(text, lang="fr")
@@ -159,6 +167,7 @@ def french_cleaners(text: str) -> str:
 def portuguese_cleaners(text: str) -> str:
     """Basic pipeline for Portuguese text. There is no need to expand abbreviation and
     numbers, phonemizer already does that"""
+    text = normalize_unicode(text)
     text = lowercase(text)
     text = replace_symbols(text, lang="pt")
     text = remove_aux_symbols(text)
@@ -168,12 +177,14 @@ def portuguese_cleaners(text: str) -> str:
 
 def chinese_mandarin_cleaners(text: str) -> str:
     """Basic pipeline for chinese"""
+    text = normalize_unicode(text)
     text = replace_numbers_to_characters_in_text(text)
     return text
 
 
 def multilingual_cleaners(text: str) -> str:
     """Pipeline for multilingual text"""
+    text = normalize_unicode(text)
     text = lowercase(text)
     text = replace_symbols(text, lang=None)
     text = remove_aux_symbols(text)
diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py
index dbd2e603b5..47a2948c98 100644
--- a/tests/text_tests/test_text_cleaners.py
+++ b/tests/text_tests/test_text_cleaners.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, normalize_nfc, phoneme_cleaners
+from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, normalize_unicode, phoneme_cleaners
 
 
 def test_time() -> None:
@@ -26,7 +26,7 @@ def test_multilingual_phoneme_cleaners() -> None:
     assert multilingual_phoneme_cleaners("1:") == "1,"
 
 
-def test_normalize_nfc() -> None:
+def test_normalize_unicode() -> None:
     test_cases = [
         ("Häagen-Dazs", "Häagen-Dazs"),
         ("你好!", "你好!"),
@@ -45,4 +45,4 @@ def test_normalize_nfc() -> None:
         (u"\u03B1\u03B2", u"αβ")
     ]
     for arg, expect in test_cases:
-        assert normalize_nfc(arg) == expect
+        assert normalize_unicode(arg) == expect
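The placement in patch 3 is deliberate: normalize_unicode() runs before lowercasing and the table- and regex-based replacement steps, so those steps only ever see composed characters. A rough illustration with a hypothetical mini-cleaner (not this package's code; the replacements dict stands in for any lookup keyed on composed forms):

    from unicodedata import normalize

    def mini_cleaner(text: str) -> str:
        # Compose combining sequences first so the lookup below can match
        text = normalize("NFC", text)
        replacements = {"ñ": "ny"}  # hypothetical table keyed on composed forms
        return "".join(replacements.get(ch, ch) for ch in text)

    # "n" + U+0303 composes to "ñ" and is then replaced; without the
    # normalize() call the single-character lookup would miss it.
    assert mini_cleaner("n\u0303") == "ny"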
From 8ec5d15201c34ec9e0880b1783dceb166c8b9262 Mon Sep 17 00:00:00 2001
From: shavit
Date: Mon, 30 Sep 2024 10:11:10 -0400
Subject: [PATCH 4/4] Format test_text_cleaners.py

---
 tests/text_tests/test_text_cleaners.py | 37 +++++++++++++++-----------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py
index 47a2948c98..9be1f0bf41 100644
--- a/tests/text_tests/test_text_cleaners.py
+++ b/tests/text_tests/test_text_cleaners.py
@@ -1,6 +1,11 @@
 #!/usr/bin/env python3
 
-from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, normalize_unicode, phoneme_cleaners
+from TTS.tts.utils.text.cleaners import (
+    english_cleaners,
+    multilingual_phoneme_cleaners,
+    normalize_unicode,
+    phoneme_cleaners,
+)
 
 
 def test_time() -> None:
@@ -28,21 +33,21 @@ def test_multilingual_phoneme_cleaners() -> None:
 
 def test_normalize_unicode() -> None:
     test_cases = [
-        ("Häagen-Dazs", "Häagen-Dazs"),
-        ("你好!", "你好!"),
-        ("𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼", "𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼"),
-        ("é", "é"),
-        ("e\u0301", "é"),
-        ("a\u0300", "à"),
-        ("a\u0327", "a̧"),
-        ("na\u0303", "nã"),
-        ("o\u0302u", "ôu"),
-        ("n\u0303", "ñ"),
-        (u"\u4E2D\u56FD", u"中国"),
-        (u"niño", u"niño"),
-        (u"a\u0308", u"ä"),
-        (u"\u3053\u3093\u306b\u3061\u306f", u"こんにちは"),
-        (u"\u03B1\u03B2", u"αβ")
+        ("Häagen-Dazs", "Häagen-Dazs"),
+        ("你好!", "你好!"),
+        ("𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼", "𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼"),
+        ("é", "é"),
+        ("e\u0301", "é"),
+        ("a\u0300", "à"),
+        ("a\u0327", "a̧"),
+        ("na\u0303", "nã"),
+        ("o\u0302u", "ôu"),
+        ("n\u0303", "ñ"),
+        ("\u4E2D\u56FD", "中国"),
+        ("niño", "niño"),
+        ("a\u0308", "ä"),
+        ("\u3053\u3093\u306b\u3061\u306f", "こんにちは"),
+        ("\u03B1\u03B2", "αβ"),
     ]
     for arg, expect in test_cases:
         assert normalize_unicode(arg) == expect
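With the full series applied, composed and decomposed spellings of the same text should clean to identical output. A quick smoke test one could run against the patched package (assumes the TTS package from this repo is importable):

    from TTS.tts.utils.text.cleaners import basic_cleaners, normalize_unicode

    # NFC maps the combining sequence onto the precomposed code point
    assert normalize_unicode("e\u0301") == "\u00e9"
    # Both spellings now clean to the same lowercased, whitespace-collapsed string
    assert basic_cleaners("Cafe\u0301  Noir") == basic_cleaners("Café  Noir") == "café noir"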