Skip to content

Commit

Permalink
feat: Simplify language generator
Browse files Browse the repository at this point in the history
This rewrites the language generator so that it no longer relies on JavaScript, and re-runs it to pick up the language changes

fix: Languages removed: `cy` (Welsh), `eo` (Esperanto), `mk` (Macedonian), `ms` (Malay), `zh-CN` (Chinese)

fix: Languages added: `zh-CN` (Chinese (Simplified)), `zh-TW` (Chinese (Traditional))
  • Loading branch information
Ryan5453 authored Nov 21, 2022
1 parent 165190e commit 5dbdf10
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 73 deletions.
10 changes: 3 additions & 7 deletions gtts/langs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,10 @@
"bs": "Bosnian",
"ca": "Catalan",
"cs": "Czech",
"cy": "Welsh",
"da": "Danish",
"de": "German",
"el": "Greek",
"en": "English",
"eo": "Esperanto",
"es": "Spanish",
"et": "Estonian",
"fi": "Finnish",
Expand All @@ -21,7 +19,6 @@
"hi": "Hindi",
"hr": "Croatian",
"hu": "Hungarian",
"hy": "Armenian",
"id": "Indonesian",
"is": "Icelandic",
"it": "Italian",
Expand All @@ -33,10 +30,9 @@
"ko": "Korean",
"la": "Latin",
"lv": "Latvian",
"mk": "Macedonian",
"ms": "Malay",
"ml": "Malayalam",
"mr": "Marathi",
"ms": "Malay",
"my": "Myanmar (Burmese)",
"ne": "Nepali",
"nl": "Dutch",
Expand All @@ -60,9 +56,9 @@
"uk": "Ukrainian",
"ur": "Urdu",
"vi": "Vietnamese",
"zh-CN": "Chinese",
"zh-CN": "Chinese (Simplified)",
"zh-TW": "Chinese (Traditional)"
}


def _main_langs():
    """Return the module-level ``_langs`` language mapping.

    Returns:
        The ``_langs`` object itself (not a copy), so mutations made by the
        caller are visible to every other user of this module.
    """
    return _langs
91 changes: 25 additions & 66 deletions scripts/gen_langs.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,28 @@
# -*- coding: utf-8 -*-
from gtts.utils import _translate_url
from bs4 import BeautifulSoup
from gtts.tts import gTTSError
from gtts import gTTS
import requests
import logging
import js2py
import uuid
import json
import sys
import re
import io

# Logger
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())

# This file is used to generate the language dict (as a module)
# Needs cleaning up, very much WIP
# Usage:
# * Install gTTS
# * $ python gen_langs.py <path to gtts>/langs.py


def _get_data_by_key(js_list):
"""JavaScript function to generate the languages.
A payload with the languages is passed to a JavaScript function.
Instead of parsing that payload (combersome), we 'overload' that
function to return what we want.
"""

js_function = r"""
function AF_initDataCallback(args) {
return { key: args['key'], data: args['data'] };
};
"""

data_by_key = {}
for js in js_list:
js_code = js_function + js
py_eval = js2py.eval_js(js_code)
data_by_key[py_eval['key']] = py_eval['data']

return data_by_key


def _fetch_langs(tld="com"):
"""Fetch (scrape) languages from Google Translate.
"""Fetch all the valid languages from Google Translate.
Google Translate loads a JavaScript Array of 'languages codes' that can
be spoken. We intersect this list with all the languages Google Translate
provides to get the ones that support text-to-speech.
There's no easy way to get the list of languages that have TTS voices, so we can just grab the list of languages from Google Translate and try to get a TTS voice for each one, and only keep the ones that work.
Args:
tld (string): Top-level domain for the Google Translate host
Expand All @@ -61,47 +35,32 @@ def _fetch_langs(tld="com"):
dict: A dictionary of languages from Google Translate
"""

URL_BASE = _translate_url(tld)
LANGUAGES_URL = _translate_url(tld + "/translate_a/l").strip("/")

headers = {
'User-Agent':
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) "
"Version/14.0 Safari/605.1.15"
}

page = requests.get(URL_BASE, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')

scripts = soup.find_all(name='script', string=re.compile(r"^AF_initDataCallback"))
scripts = [s.text for s in scripts]

data_by_key = _get_data_by_key(scripts)

# Get all languages (ds:3)
# data for 'ds:3' is
# [
# [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]],
# [['hi', 'Hindi'], ['ps', 'Pashto'], ... ]]
# ]
# (Note: list[0] and list[1] are identical)
all_langs_raw = data_by_key["ds:3"]

# Get languages codes that have TTS (ds:6)
# data for 'ds:6' is
# [
# [['af', 200], ['ar', 200], ...]
# ]
tts_langs_raw = data_by_key["ds:6"]
tts_langs = [lang[0] for lang in tts_langs_raw[0]]

# Create language dict (and filter only TTS-enabled langs)
# langs = { lang[0], lang[1] for lang in all_langs_raw[0] }

langs = {k: v for k, v in all_langs_raw[0] if k in tts_langs}
return langs

params = {
'client': 't',
'alpha': 'true'
}
data = requests.get(LANGUAGES_URL, headers=headers, params=params)
json = data.json()

working_languages = {}
test_text = str(uuid.uuid4())
for key in json["tl"]:
try:
tts = gTTS(test_text, lang=key)
tts.write_to_fp(io.BytesIO())
working_languages[key] = json["tl"][key]
except (gTTSError, ValueError): # Language not supported
pass

return working_languages

if __name__ == "__main__":
"""Language list generation 'main'
Expand Down

1 comment on commit 5dbdf10

@Armikas
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Lithuanian language is missing; please add "lt": "Lithuanian",

Please sign in to comment.