Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add normalizer type C to text cleaners #85

Merged
merged 4 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions TTS/tts/utils/text/cleaners.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"""Set of default text cleaners"""

# TODO: pick the cleaner for languages dynamically
eginhard marked this conversation as resolved.
Show resolved Hide resolved

import re
from typing import Optional
from unicodedata import normalize

from anyascii import anyascii

Expand Down Expand Up @@ -83,13 +82,15 @@ def replace_symbols(text: str, lang: Optional[str] = "en") -> str:

def basic_cleaners(text: str) -> str:
    """Minimal cleaning pipeline: Unicode-normalize, lowercase, collapse runs of whitespace.

    No transliteration is performed, so non-ASCII characters pass through unchanged.
    """
    # Apply each cleaning step in order; each takes and returns a str.
    for step in (normalize_unicode, lowercase, collapse_whitespace):
        text = step(text)
    return text


def transliteration_cleaners(text: str) -> str:
"""Pipeline for non-English text that transliterates to ASCII."""
text = normalize_unicode(text)
# text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
Expand All @@ -98,6 +99,7 @@ def transliteration_cleaners(text: str) -> str:

def basic_german_cleaners(text: str) -> str:
    """Pipeline for German text: Unicode-normalize, lowercase, then collapse whitespace."""
    return collapse_whitespace(lowercase(normalize_unicode(text)))
Expand All @@ -106,6 +108,7 @@ def basic_german_cleaners(text: str) -> str:
# TODO: elaborate it
def basic_turkish_cleaners(text: str) -> str:
"""Pipeline for Turkish text"""
text = normalize_unicode(text)
text = text.replace("I", "ı")
text = lowercase(text)
text = collapse_whitespace(text)
Expand All @@ -114,6 +117,7 @@ def basic_turkish_cleaners(text: str) -> str:

def english_cleaners(text: str) -> str:
"""Pipeline for English text, including number and abbreviation expansion."""
text = normalize_unicode(text)
# text = convert_to_ascii(text)
text = lowercase(text)
text = expand_time_english(text)
Expand All @@ -131,6 +135,7 @@ def phoneme_cleaners(text: str) -> str:
NB: This cleaner converts numbers into English words, for other languages
use multilingual_phoneme_cleaners().
"""
text = normalize_unicode(text)
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
Expand All @@ -141,6 +146,7 @@ def phoneme_cleaners(text: str) -> str:

def multilingual_phoneme_cleaners(text: str) -> str:
"""Pipeline for phonemes mode, including number and abbreviation expansion."""
text = normalize_unicode(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
text = collapse_whitespace(text)
Expand All @@ -149,6 +155,7 @@ def multilingual_phoneme_cleaners(text: str) -> str:

def french_cleaners(text: str) -> str:
"""Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
text = normalize_unicode(text)
text = expand_abbreviations(text, lang="fr")
text = lowercase(text)
text = replace_symbols(text, lang="fr")
Expand All @@ -160,6 +167,7 @@ def french_cleaners(text: str) -> str:
def portuguese_cleaners(text: str) -> str:
"""Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that"""
text = normalize_unicode(text)
text = lowercase(text)
text = replace_symbols(text, lang="pt")
text = remove_aux_symbols(text)
Expand All @@ -169,12 +177,14 @@ def portuguese_cleaners(text: str) -> str:

def chinese_mandarin_cleaners(text: str) -> str:
    """Basic pipeline for Mandarin Chinese: Unicode-normalize, then spell numbers out as characters."""
    normalized = normalize_unicode(text)
    return replace_numbers_to_characters_in_text(normalized)


def multilingual_cleaners(text: str) -> str:
"""Pipeline for multilingual text"""
text = normalize_unicode(text)
text = lowercase(text)
text = replace_symbols(text, lang=None)
text = remove_aux_symbols(text)
Expand All @@ -186,3 +196,9 @@ def no_cleaners(text: str) -> str:
# remove newline characters
text = text.replace("\n", "")
return text


def normalize_unicode(text: str) -> str:
    """Return *text* in Unicode NFC form (combining sequences composed into precomposed characters)."""
    return normalize("NFC", text)
29 changes: 28 additions & 1 deletion tests/text_tests/test_text_cleaners.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#!/usr/bin/env python3

from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, phoneme_cleaners
from TTS.tts.utils.text.cleaners import (
english_cleaners,
multilingual_phoneme_cleaners,
normalize_unicode,
phoneme_cleaners,
)


def test_time() -> None:
Expand All @@ -24,3 +29,25 @@ def test_expand_numbers() -> None:
def test_multilingual_phoneme_cleaners() -> None:
assert multilingual_phoneme_cleaners("(Hello)") == "Hello"
assert multilingual_phoneme_cleaners("1:") == "1,"


def test_normalize_unicode() -> None:
    """Check that normalize_unicode composes combining sequences and leaves other text intact."""
    # Mapping of raw input -> expected NFC-normalized output.
    cases = {
        "Häagen-Dazs": "Häagen-Dazs",
        "你好!": "你好!",
        "𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼": "𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼",
        "é": "é",
        "e\u0301": "é",
        "a\u0300": "à",
        "a\u0327": "a̧",
        "na\u0303": "nã",
        "o\u0302u": "ôu",
        "n\u0303": "ñ",
        "\u4E2D\u56FD": "中国",
        "niño": "niño",
        "a\u0308": "ä",
        "\u3053\u3093\u306b\u3061\u306f": "こんにちは",
        "\u03B1\u03B2": "αβ",
    }
    for raw, expected in cases.items():
        assert normalize_unicode(raw) == expected
Loading