From ee2537a02dd695a251a41527b6e8ba735179ccf9 Mon Sep 17 00:00:00 2001 From: rtobar Date: Sun, 10 Nov 2024 10:03:38 +0800 Subject: [PATCH] Perform update in parallel (#110) The update command implementation runs over files that are independent from each other. As such, the overall update operation can be trivially parallelised to speed things up. This change introduces that parallelisation: the list of files that need to be compared/updated is collected in a first pass. This list is then given to a multiprocessing pool to farm out the actual update of each individual file. The amount of parallelism is controlled through a new "jobs" parameter, command line option and environment variable. If no value is given for this option, all CPUs are used. I noticed this chance for improvement when doing a test run of the update of .po files for the Spanish translation of the CPython documentation. Local numbers in my 8-core, hyper-threaded AMD Ryzen 7 5825U: -j 1 (same as old behaviour) real 12m5.402s user 12m4.942s sys 0m0.273s -j 8 real 2m23.609s user 17m45.201s sys 0m0.460s default (no -j given, all CPUs used) real 1m57.398s user 26m22.654s sys 0m0.989s Signed-off-by: Rodrigo Tobar --- sphinx_intl/basic.py | 105 ++++++++++++++++++++++++++-------------- sphinx_intl/commands.py | 22 ++++++++- 2 files changed, 90 insertions(+), 37 deletions(-) diff --git a/sphinx_intl/basic.py b/sphinx_intl/basic.py index a5b7a3f..98335a7 100644 --- a/sphinx_intl/basic.py +++ b/sphinx_intl/basic.py @@ -1,5 +1,8 @@ +import dataclasses +import multiprocessing as mp import os from glob import glob +from typing import Optional import click @@ -24,7 +27,56 @@ def get_lang_dirs(path): # commands -def update(locale_dir, pot_dir, languages, line_width=76, ignore_obsolete=False): +@dataclasses.dataclass(frozen=True) +class UpdateItem: + po_file: str + pot_file: str + lang: str + line_width: int + ignore_obsolete: bool + + +@dataclasses.dataclass(frozen=True) +class UpdateResult: + po_file: str + status: str + added: Optional[int] = 0 + deleted: Optional[int] 
= 0 + + +def _update_single_file(update_item: UpdateItem): + cat_pot = c.load_po(update_item.pot_file) + if os.path.exists(update_item.po_file): + cat = c.load_po(update_item.po_file) + msgids = {m.id for m in cat if m.id} + c.update_with_fuzzy(cat, cat_pot) + new_msgids = {m.id for m in cat if m.id} + if msgids != new_msgids: + added = new_msgids - msgids + deleted = msgids - new_msgids + c.dump_po( + update_item.po_file, + cat, + width=update_item.line_width, + ignore_obsolete=update_item.ignore_obsolete, + ) + return UpdateResult(update_item.po_file, "update", len(added), len(deleted)) + else: + return UpdateResult(update_item.po_file, "notchanged") + else: # new po file + cat_pot.locale = update_item.lang + c.dump_po( + update_item.po_file, + cat_pot, + width=update_item.line_width, + ignore_obsolete=update_item.ignore_obsolete, + ) + return UpdateResult(update_item.po_file, "create") + + +def update( + locale_dir, pot_dir, languages, line_width=76, ignore_obsolete=False, jobs=0 +): """ Update specified language's po files from pot. 
@@ -33,6 +85,7 @@ def update(locale_dir, pot_dir, languages, line_width=76, ignore_obsolete=False) :param tuple languages: languages to update po files :param number line_width: maximum line width of po files :param bool ignore_obsolete: ignore obsolete entries in po files + :param number jobs: number of CPUs to use :return: {'create': 0, 'update': 0, 'notchanged': 0} :rtype: dict """ @@ -42,6 +95,7 @@ def update(locale_dir, pot_dir, languages, line_width=76, ignore_obsolete=False) "notchanged": 0, } + to_translate = [] for dirpath, dirnames, filenames in os.walk(pot_dir): for filename in filenames: pot_file = os.path.join(dirpath, filename) @@ -52,40 +106,21 @@ def update(locale_dir, pot_dir, languages, line_width=76, ignore_obsolete=False) for lang in languages: po_dir = os.path.join(locale_dir, lang, "LC_MESSAGES") po_file = os.path.join(po_dir, basename + ".po") - cat_pot = c.load_po(pot_file) - if os.path.exists(po_file): - cat = c.load_po(po_file) - msgids = {m.id for m in cat if m.id} - c.update_with_fuzzy(cat, cat_pot) - new_msgids = {m.id for m in cat if m.id} - if msgids != new_msgids: - added = new_msgids - msgids - deleted = msgids - new_msgids - status["update"] += 1 - click.echo( - "Update: {} +{}, -{}".format( - po_file, len(added), len(deleted) - ) - ) - c.dump_po( - po_file, - cat, - width=line_width, - ignore_obsolete=ignore_obsolete, - ) - else: - status["notchanged"] += 1 - click.echo(f"Not Changed: {po_file}") - else: # new po file - status["create"] += 1 - click.echo(f"Create: {po_file}") - cat_pot.locale = lang - c.dump_po( - po_file, - cat_pot, - width=line_width, - ignore_obsolete=ignore_obsolete, - ) + to_translate.append( + UpdateItem(po_file, pot_file, lang, line_width, ignore_obsolete) + ) + + with mp.Pool(processes=jobs or None) as pool: + for result in pool.imap_unordered(_update_single_file, to_translate): + status[result.status] += 1 + if result.status == "update": + click.echo( + f"Update: {result.po_file} +{result.added}, 
-{result.deleted}" + ) + elif result.status == "create": + click.echo(f"Create: {result.po_file}") + else: + click.echo(f"Not Changed: {result.po_file}") return status diff --git a/sphinx_intl/commands.py b/sphinx_intl/commands.py index e4ff672..d6f485c 100644 --- a/sphinx_intl/commands.py +++ b/sphinx_intl/commands.py @@ -157,6 +157,18 @@ def convert(self, value, param, ctx): "disable line wrapping", ) +option_jobs = click.option( + "-j", + "--jobs", + envvar=ENVVAR_PREFIX + "_JOBS", + type=int, + default=0, + metavar="", + show_default=True, + multiple=False, + help="The number of CPUs to use for updates. 0 means all", +) + option_no_obsolete = click.option( "--no-obsolete", envvar=ENVVAR_PREFIX + "_NO_OBSOLETE", @@ -271,7 +283,8 @@ def main(ctx, config, tag): @option_language @option_line_width @option_no_obsolete -def update(locale_dir, pot_dir, language, line_width, no_obsolete): +@option_jobs +def update(locale_dir, pot_dir, language, line_width, no_obsolete, jobs): """ Update specified language's po files from pot. @@ -300,7 +313,12 @@ def update(locale_dir, pot_dir, language, line_width, no_obsolete): raise click.BadParameter(msg, param_hint="language") basic.update( - locale_dir, pot_dir, languages, line_width, ignore_obsolete=no_obsolete + locale_dir, + pot_dir, + languages, + line_width, + ignore_obsolete=no_obsolete, + jobs=jobs, )