diff --git a/users/hilmes/experiments/__init__.py b/users/hilmes/experiments/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/__init__.py b/users/hilmes/experiments/nick_setups/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/__init__.py new file mode 100644 index 000000000..6ac5dd240 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/__init__.py @@ -0,0 +1 @@ +PACKAGE = __package__ diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/config.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/config.py new file mode 100644 index 000000000..c6536eb6b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/config.py @@ -0,0 +1,156 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from .data.common import TrainingDatasets +from .serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, +) -> ReturnnConfig: + """ + :param training_datasets: datasets for training + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param config: + :param debug: run training in debug mode (linking from recipe instead of copy) + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + + base_config = { + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + + # TODO: maybe make nice + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the 
RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 500 * 16000, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 240 * 16000, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/exp_ls100_1023_base.py new file mode 100644 index 000000000..ce66e544f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/exp_ls100_1023_base.py @@ -0,0 +1,339 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..lm import get_4gram_binary_lm +from ..data.bpe import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, KENLM_BINARY_PATH + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def conformer_baseline(): + prefix_name = 
"experiments/librispeech/standalone_2023/ls100_ctc_bpe/" + + BPE_SIZE = 300 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + librispeech_key="train-clean-100", + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + 
conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100", bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 14, + } + + # DIverged + # train_args = { + # **copy.deepcopy(train_args_adamw03_accum2_jjlr), + # "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + # "net_args": {"model_config_dict": asdict(model_config)}, + # } + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + model_config_start11 = copy.deepcopy(model_config) + model_config_start11.specauc_start_epoch = 11 + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_start11)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + model_config_smaller = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_smaller)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_smaller_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + 
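# with_prior=True below also runs the prior forward job and passes the resulting prior.txt to the decoder args as prior_file +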
datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + model_config_smaller_start11 = copy.deepcopy(model_config_smaller) + model_config_smaller_start11.specauc_start_epoch = 11 + train_args_start11 = copy.deepcopy(train_args) + train_args_start11["net_args"]["model_config_dict"] = asdict(model_config_smaller_start11) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_smaller_decay1e-2_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_start11, + search_args=search_args, + with_prior=True, + ) + + from ..pytorch_networks.ctc_conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/exp_ls100_1023_base.py new file mode 100644 index 000000000..6d10fabf2 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/exp_ls100_1023_base.py @@ -0,0 +1,313 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..data.phon import build_eow_phon_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT +from ..lm import get_4gram_binary_lm + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def eow_phon_ls100_1023_base(): + prefix_name = "experiments/librispeech/standalone_2023/ls100_ctc_eow_phon/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object 
containing train, cv, dev-train and the extern_data dict + train_data = build_eow_phon_training_datasets( + librispeech_key="train-clean-100", + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 
16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 14, + } + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config)}, + } + # diverged with hiccup + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + train_args_gc1 = copy.deepcopy(train_args) + train_args_gc1["config"]["gradient_clip"] = 1.0 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_gc1/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_gc1, + search_args=search_args, + with_prior=True, + ) + + train_args_decay1e_2 = copy.deepcopy(train_args) + train_args_decay1e_2["config"]["optimizer"]["weight_decay"] = 1e-2 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + ) + + search_args = { + **default_search_args, + "lm_weight": 3.5, + "prior_scale": 0.3, + "sil_score": -1000.0, + } + run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_decay1e-2/lm_test1_bs1024_th14", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc_v2", + ) + + search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 16, + "lm_weight": 3.5, + "prior_scale": 0.3, + } + run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_decay1e-2/lm_test2_bs1024_th16", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ) + + ###### trying to reproduce 14.5% result from librispeech/librispeech_100_phon_ctc ######### + + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-8, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + }, + } + model_config_small_ff = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + 
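# vocabulary size without the blank label; the network's final linear layer adds one extra output for the CTC blank +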
label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_small_ff)}, + } + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_legacy_decay1e-2_FF384/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/bpe.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/bpe.py new file mode 100644 index 000000000..4deb3781e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/bpe.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_lexicon +from i6_experiments.common.datasets.librispeech.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from .common import TrainingDatasetSettings, TrainingDatasets, build_training_datasets, DATA_PREFIX +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +@lru_cache() +def get_bpe_datastream(librispeech_key: str, bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + """ + bpe_settings = get_subword_nmt_bpe_v2(corpus_key=librispeech_key, bpe_size=bpe_size, unk_label="") + + # TODO: Try without sequence postfix (seq_postfix=None) + # otherwise every sequence gets a at the end + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def get_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="librispeech_datasets" + ), 
+ bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + + +def get_text_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(librispeech_key=librispeech_key, bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_bpe_training_datasets( + librispeech_key: str, + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = get_ogg_zip_dict("corpora", returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + train_ogg = ogg_zip_dict[librispeech_key] + dev_clean_ogg = ogg_zip_dict["dev-clean"] + dev_other_ogg = ogg_zip_dict["dev-other"] + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/common.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/common.py new file mode 100644 index 000000000..0ca95abad --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/common.py @@ -0,0 +1,201 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper +from i6_core.returnn.oggzip import BlissToOggZipJob + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_corpus_dict + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import ( + AudioRawDatastream, + ReturnnAudioRawOptions, +) +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream +from i6_experiments.users.rossenbach.datasets.librispeech import get_mixed_cv_segments + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + +DATA_PREFIX = "experiments/librispeech/2023_standalone/data/" + +# -------------- Dataclasses for configuration and data passing ------------------- + +# here: ( , , ) +EpochWiseFilter = Tuple[int, int, int] + + +@dataclass(frozen=True) +class TrainingDatasets: + train: Dataset + cv: Dataset + devtrain: Dataset + datastreams: Dict[str, Datastream] + prior: Optional[Dataset] + + +@dataclass() +class TrainingDatasetSettings: + # features settings + custom_processing_function: Optional[str] + + # training settings + partition_epoch: int + epoch_wise_filters: List[EpochWiseFilter] + seq_ordering: str + preemphasis: float + + +# --------------------------- Helper functions ----------------------------------- + + +@lru_cache() +def get_audio_raw_datastream(preemphasis: Optional[float] = None) -> AudioRawDatastream: + """ + :param preemphasis: set the pre-emphasis filter factor + """ + audio_datastream = AudioRawDatastream( + available_for_inference=True, options=ReturnnAudioRawOptions(peak_normalization=True, preemphasis=preemphasis) + ) + return 
audio_datastream + + +def get_zip(name: str, bliss_dataset: tk.Path): + """ + + :param name: + :param bliss_dataset: + :return: + """ + zip_dataset_job = BlissToOggZipJob( + bliss_corpus=bliss_dataset, + no_conversion=True, # for Librispeech we are already having ogg + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + zip_dataset_job.add_alias(DATA_PREFIX + name) + + return zip_dataset_job.out_ogg_zip + + +# --------------------------- Dataset functions ----------------------------------- + + +def build_training_datasets( + train_ogg: tk.Path, + dev_clean_ogg: tk.Path, + dev_other_ogg: tk.Path, + label_datastream: Datastream, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param train_ogg: + :param dev_clean_ogg: + :param dev_other_ogg: + :param label_datastream: + :param settings: + """ + audio_datastream = get_audio_raw_datastream(settings.preemphasis) + + datastreams = { + "raw_audio": audio_datastream, + "labels": label_datastream, + } + + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} + + training_audio_opts = audio_datastream.as_returnn_audio_opts() + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + def make_meta(dataset: OggZipDataset): + return MetaDataset( + data_map=data_map, datasets={"zip_dataset": dataset}, seq_order_control_dataset="zip_dataset" + ) + + train_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=settings.partition_epoch, + seq_ordering=settings.seq_ordering, + additional_options=additional_opts, + ) + train_dataset = make_meta(train_zip_dataset) + + cv_zip_dataset = OggZipDataset( + files=[dev_clean_ogg, dev_other_ogg], + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + segment_file=get_mixed_cv_segments(), + seq_ordering="sorted_reverse", + ) + cv_dataset = make_meta(cv_zip_dataset) + + devtrain_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + random_subset=3000, + ) + devtrain_dataset = make_meta(devtrain_zip_dataset) + + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta(prior_zip_dataset) + + return TrainingDatasets( + train=train_dataset, + cv=cv_dataset, + devtrain=devtrain_dataset, + datastreams=datastreams, + prior=prior_dataset, + ) + + +@lru_cache() +def build_test_dataset( + dataset_key: str, + preemphasis: Optional[float] = None, +): + """ + + :param librispeech_key: e.g. train-clean-100h, used for basic BPE stream + :param dataset_key: e.g. 
dev-other, which test set to create + :param preemphasis: + :return: + """ + ogg_zip_dict = get_ogg_zip_dict("corpora", returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + bliss_dict = get_bliss_corpus_dict() + test_ogg = ogg_zip_dict[dataset_key] + + audio_datastream = get_audio_raw_datastream(preemphasis) + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[test_ogg], audio_options=audio_datastream.as_returnn_audio_opts(), seq_ordering="sorted_reverse" + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return test_dataset, bliss_dict[dataset_key] diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/phon.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/phon.py new file mode 100644 index 000000000..450f00fc6 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/phon.py @@ -0,0 +1,142 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk + +from dataclasses import dataclass +from functools import lru_cache +import os +from typing import Any, Dict, List, Optional, Tuple + +from i6_core.returnn.vocabulary import ReturnnVocabFromPhonemeInventory +from i6_core.corpus.transform import ApplyLexiconToCorpusJob +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob + +from i6_experiments.common.datasets.librispeech import ( + get_g2p_augmented_bliss_lexicon_dict, + get_bliss_corpus_dict, + get_ogg_zip_dict, + get_bliss_lexicon, +) + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + + +from .common import get_zip, DATA_PREFIX, build_training_datasets, TrainingDatasets, TrainingDatasetSettings + + +def get_eow_lexicon(librispeech_key: str, with_g2p=True) -> tk.Path: + + """ + get the g2p bliss lexicon with EOW tokens added + :return: + """ + if with_g2p: + lex = get_g2p_augmented_bliss_lexicon_dict( + use_stress_marker=False, add_silence=False, output_prefix="librispeech_g2p_datasets" + )[librispeech_key] + else: + lex = get_bliss_lexicon(use_stress_marker=False, add_silence=False, output_prefix="librispeech_datasets") + + return AddEowPhonemesToLexiconJob(lex).out_lexicon + + +def get_eow_bliss(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False) -> tk.Path: + """ + get an EOW modified corpus with optional unknown removed for cross validation + + :param corpus_key: train, dev, test + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="ogg")[librispeech_key] + if remove_unk_seqs: + from i6_core.corpus.filter import FilterCorpusRemoveUnknownWordSegmentsJob + + bliss = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=bliss, + bliss_lexicon=get_eow_lexicon( + librispeech_key=train_librispeech_key, with_g2p=True + ), # cv may include words from g2p + all_unknown=False, + ).out_corpus + + # default train lexicon + lexicon = get_eow_lexicon(librispeech_key=train_librispeech_key, with_g2p=True) + converted_bliss_corpus = ApplyLexiconToCorpusJob(bliss, lexicon, word_separation_orth=None).out_corpus + + return converted_bliss_corpus + + +def get_eow_bliss_and_zip(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False): + """ + :param corpus_key: 
e.g. "train", "dev", or "test, + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: tuple of bliss and zip + """ + + bliss_dataset = get_eow_bliss( + librispeech_key=librispeech_key, train_librispeech_key=train_librispeech_key, remove_unk_seqs=remove_unk_seqs + ) + zip_dataset = get_zip(f"{librispeech_key}_eow", bliss_dataset=bliss_dataset) + + return bliss_dataset, zip_dataset + + +def get_eow_vocab_datastream(librispeech_key: str) -> LabelDatastream: + """ + Phoneme with EOW LabelDatastream for Tedlium-2 + + :param with_blank: datastream for CTC training + """ + lexicon = get_eow_lexicon(librispeech_key=librispeech_key) + returnn_vocab_job = ReturnnVocabFromPhonemeInventory(lexicon) + returnn_vocab_job.add_alias(os.path.join(DATA_PREFIX, f"{librispeech_key}", "eow_returnn_vocab_job")) + + vocab_datastream = LabelDatastream( + available_for_inference=True, vocab=returnn_vocab_job.out_vocab, vocab_size=returnn_vocab_job.out_vocab_size + ) + + return vocab_datastream + + +def get_text_lexicon(librispeech_key: str) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_eow_lexicon(librispeech_key=librispeech_key, with_g2p=False) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_eow_phon_training_datasets( + librispeech_key: str, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_eow_vocab_datastream(librispeech_key=librispeech_key) + + _, train_ogg = get_eow_bliss_and_zip( + librispeech_key=librispeech_key, train_librispeech_key=librispeech_key, remove_unk_seqs=False + ) + _, dev_clean_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-clean", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + _, dev_other_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-other", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/default_tools.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/default_tools.py new file mode 100644 index 000000000..5c33c776a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/default_tools.py @@ -0,0 +1,20 @@ +from sisyphus import tk +from i6_core.tools.git import CloneGitRepositoryJob + + +# python from apptainer +RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") +MINI_RETURNN_ROOT = tk.Path("/u/hilmes/dev/MiniReturnn", hash_overwrite="LIBRISPEECH_DEFAULT_RETURNN_ROOT") + +from i6_experiments.common.tools.sctk import compile_sctk + +SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +# SCTK_BINARY_PATH = compile_sctk() # use most recent SCTK +SCTK_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_SCTK_BINARY_PATH" + +from i6_core.tools.git import CloneGitRepositoryJob +from i6_core.lm.kenlm import CompileKenLMJob, CreateBinaryLMJob + +kenlm_repo = CloneGitRepositoryJob("https://github.com/kpu/kenlm").out_repository +KENLM_BINARY_PATH = CompileKenLMJob(repository=kenlm_repo).out_binaries +KENLM_BINARY_PATH.hash_overwrite = 
"LIBRISPEECH_DEFAULT_KENLM_BINARY_PATH" diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/lm.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/lm.py new file mode 100644 index 000000000..42a910233 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/lm.py @@ -0,0 +1,18 @@ +from i6_core.lm.kenlm import CreateBinaryLMJob + +from i6_experiments.common.datasets.librispeech.language_model import get_arpa_lm_dict + +from .default_tools import KENLM_BINARY_PATH + + +def get_4gram_binary_lm(): + """ + + :param output_prefix: + :return: + """ + arpa_4gram_binary_lm_job = CreateBinaryLMJob( + arpa_lm=get_arpa_lm_dict()["4gram"], kenlm_binary_folder=KENLM_BINARY_PATH + ) + arpa_4gram_binary_lm_job.add_alias("experiments/librispeech/standalone_2023/lm/create_4gram_binary_lm") + return arpa_4gram_binary_lm_job.out_lm diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pipeline.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pipeline.py new file mode 100644 index 000000000..1327b42db --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pipeline.py @@ -0,0 +1,179 @@ +import copy +import os.path + +from sisyphus import tk + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import ReturnnTrainingJob +from i6_core.returnn.training import GetBestTFCheckpointJob +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from i6_core.returnn.search import SearchBPEtoWordsJob, ReturnnComputeWERJob +from i6_experiments.users.rossenbach.returnn.training import AverageCheckpointsJobV2 + +from .default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, SCTK_BINARY_PATH + + +@tk.block() +def training(prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs): + """ + + :param prefix_name: + :param returnn_config: + :param returnn_exe: + :param returnn_root: + :return: + """ + default_rqmt = { + "mem_rqmt": 15, + "time_rqmt": 168, + "cpu_rqmt": 4, + "log_verbosity": 5, + "returnn_python_exe": returnn_exe, + "returnn_root": returnn_root, + } + + train_job = ReturnnTrainingJob(returnn_config=returnn_config, num_epochs=num_epochs, **default_rqmt) + train_job.add_alias(prefix_name + "/training") + tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates) + + return train_job + + +@tk.block() +def search_single( + prefix_name, + returnn_config, + checkpoint, + recognition_dataset: GenericDataset, + recognition_bliss_corpus, + returnn_exe, + returnn_root, + mem_rqmt=8, + use_gpu=False, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param returnn_standalone.data.datasets.dataset.GenericDataset recognition_dataset: + :param Path recognition_reference: Path to a py-dict format reference file + :param Path returnn_exe: + :param Path returnn_root: + """ + returnn_config = copy.deepcopy(returnn_config) + returnn_config.config["forward"] = recognition_dataset.as_returnn_opts() + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=24, + device="gpu" if use_gpu else "cpu", + cpu_rqmt=2, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["search_out.py"], + ) + 
search_job.add_alias(prefix_name + "/search_job") + + from i6_core.returnn.search import SearchWordsToCTMJob + from i6_core.corpus.convert import CorpusToStmJob + from i6_core.recognition.scoring import ScliteJob + + search_ctm = SearchWordsToCTMJob( + recog_words_file=search_job.out_files["search_out.py"], + bliss_corpus=recognition_bliss_corpus, + ).out_ctm_file + + stm_file = CorpusToStmJob(bliss_corpus=recognition_bliss_corpus).out_stm_path + + sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH) + tk.register_output(prefix_name + "/sclite/wer", sclite_job.out_wer) + tk.register_output(prefix_name + "/sclite/report", sclite_job.out_report_dir) + + return sclite_job.out_wer, search_job + + +@tk.block() +def search(prefix_name, returnn_config, checkpoint, test_dataset_tuples, returnn_exe, returnn_root, use_gpu=False): + """ + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param test_dataset_tuples: + :param returnn_exe: + :param returnn_root: + :return: + """ + # use fixed last checkpoint for now, needs more fine-grained selection / average etc. here + wers = {} + search_jobs = [] + for key, (test_dataset, test_dataset_reference) in test_dataset_tuples.items(): + wers[key], search_job = search_single( + prefix_name + "/%s" % key, + returnn_config, + checkpoint, + test_dataset, + test_dataset_reference, + returnn_exe, + returnn_root, + use_gpu=use_gpu, + ) + search_jobs.append(search_job) + + from i6_core.report import GenerateReportStringJob, MailJob + + format_string_report = ",".join(["{%s_val}" % (prefix_name + key) for key in test_dataset_tuples.keys()]) + format_string = " - ".join( + ["{%s}: {%s_val}" % (prefix_name + key, prefix_name + key) for key in test_dataset_tuples.keys()] + ) + values = {} + values_report = {} + for key in test_dataset_tuples.keys(): + values[prefix_name + key] = key + values["%s_val" % (prefix_name + key)] = wers[key] + values_report["%s_val" % (prefix_name + key)] = wers[key] + + report = GenerateReportStringJob(report_values=values, report_template=format_string, compress=False).out_report + # mail = MailJob(result=report, subject=prefix_name, send_contents=True).out_status + # tk.register_output(os.path.join(prefix_name, "mail_status"), mail) + return format_string_report, values_report, search_jobs + + +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=1, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + search_job.add_alias(prefix_name + "/prior_job") + return search_job.out_files["prior.txt"] diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/__init__.py new file 
mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py new file mode 100644 index 000000000..bd5860dc5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py @@ -0,0 +1,184 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=self.cfg.feature_extraction_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + 
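# labels stay padded as [B, N]; target_lengths below tells ctc_loss how many entries per sequence are valid +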
input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py new file mode 100644 index 000000000..a57c949fd --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py @@ -0,0 +1,84 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1Config + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + 
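# despite the name, this is passed as freq_max_num_masks to specaugment_v1_by_length in the v6 network, i.e. the maximum number of frequency masks +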
num_repeat_feat: int + max_dim_feat: int + + +@dataclass +class ModelConfig: + feature_extraction_config: LogMelFeatureExtractionV1Config + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["feature_extraction_config"] = LogMelFeatureExtractionV1Config(**d["feature_extraction_config"]) + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig(**d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py new file mode 100644 index 000000000..3012eee33 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, 
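+ # RTF here is processing time divided by total audio duration in seconds,
+ # so values below 1.0 mean faster than real time.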
run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py new file mode 100644 index 000000000..39d942e9b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + 
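+ # The remaining arguments configure the lexicon beam search: nbest hypotheses to keep,
+ # beam_size / beam_size_token / beam_threshold for pruning (roughly: hypotheses kept per frame,
+ # tokens expanded per frame, and allowed score margin to the best hypothesis),
+ # plus optional silence and word insertion scores.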
nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py new file mode 100644 index 000000000..815b283ba --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py @@ -0,0 +1,115 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", 
"wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + # TODO: Check if "[" removal is unnecessary + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git 
a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py new file mode 100644 index 000000000..cabf6d47d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py @@ -0,0 +1,59 @@ +""" +Greedy CTC decoder without any extras +""" + +import time +import torch + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + batch_indices = [] + for lp, l in zip(logprobs, audio_features_len): + batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy()) + + am_time = time.time() - am_start + run_ctx.total_time += am_time + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + + tags = data["seq_tag"] + + for indices, tag in zip(batch_indices, tags): + print(indices) + sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)] + sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py new file mode 100644 index 000000000..c28566b92 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py @@ -0,0 +1,87 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act 
import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py new file mode 100644 index 000000000..8d45dab3f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py @@ -0,0 +1,369 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from .specaugment_fixed import returnn_specaugment_by_length +from .legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> 
torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformulated in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Masked tensor [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.1) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + self.label_target_size = self.cfg.label_target_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
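+ # Sketch of what the transparent encoder above computes, with w = softmax(transparent_scales + 0.001):
+ #   out = w[0] * posenc(frontend(x)) + sum_i w[i + 1] * block_i_output
+ # i.e. the final encoder output is a weighted mix of the frontend output and every conformer block output.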
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + bpe_labels = data["bpe_labels"] # [B, N] (sparse) + bpe_labels_len = data["bpe_labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + bpe_labels, + input_lengths=audio_features_len, + target_lengths=bpe_labels_len, + blank=model.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(bpe_labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + # tokens=labels + ["[blank]", "[SILENCE]"], + tokens=labels + ["[blank]"], + blank_token="[blank]", + # sil_token="[SILENCE]", + sil_token="[blank]", + unk_word="[UNKNOWN2]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + # write empty HDF until new ForwardJob exists + f = open("output.hdf", "wt") + f.write(" ") + f.close() + + +def search_step(*, model: 
Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + print(words) + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/serializer.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/serializer.py new file mode 100644 index 000000000..63171ae7c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/serializer.py @@ -0,0 +1,109 @@ +import copy +from sisyphus import tk +from typing import Any, Dict, Optional + +from i6_core.tools.git import CloneGitRepositoryJob + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import ExternalImport + +from . 
import PACKAGE + +from i6_experiments.common.setups.serialization import Import, PartialImport + + +def get_pytorch_serializer_v3( + network_module: str, + net_args: Dict[str, Any], + decoder: Optional[str] = None, + decoder_args: Optional[Dict[str, Any]] = None, + post_decoder_args: Optional[Dict[str, Any]] = None, + prior=False, + debug=False, + **kwargs +) -> TorchCollection: + """ + + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param decoder: path to the search decoder, if provided will link search functions + :param decoder_args: + :param post_decoder_args: + :param prior: build config for prior computation + :param debug: run training in debug mode (linking from recipe instead of copy) + :param kwargs: + :return: + """ + package = PACKAGE + ".pytorch_networks" + + pytorch_model_import = PartialImport( + code_object_path=package + ".%s.Model" % network_module, + unhashed_package_root=PACKAGE, + hashed_arguments=net_args, + unhashed_arguments={}, + import_as="get_model", + ) + pytorch_train_step = Import( + code_object_path=package + ".%s.train_step" % network_module, unhashed_package_root=PACKAGE + ) + + # TODO: add flag to switch and maybe move to default tools + # i6_models_repo = CloneGitRepositoryJob( + # url="https://github.com/rwth-i6/i6_models", + # commit="1e94a4d9d1aa48fe3ac7f60de2cd7bd3fea19c3e", + # checkout_folder_name="i6_models" + # ).out_repository + i6_models_repo = tk.Path("/u/hilmes/experiments/nick_asr/i6_models") + i6_models_repo.hash_overwrite = "LIBRISPEECH_DEFAULT_I6_MODELS" + i6_models = ExternalImport(import_path=i6_models_repo) + + serializer_objects = [ + i6_models, + pytorch_model_import, + pytorch_train_step, + ] + if decoder: + # Just a hack to test the phoneme-based recognition + forward_step = Import( + code_object_path=package + ".%s.forward_step" % decoder, + unhashed_package_root=PACKAGE, + ) + init_hook = PartialImport( + code_object_path=package + ".%s.forward_init_hook" % decoder, + unhashed_package_root=PACKAGE, + hashed_arguments=decoder_args or {}, + unhashed_arguments=post_decoder_args or {}, + ) + finish_hook = Import( + code_object_path=package + ".%s.forward_finish_hook" % decoder, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if prior: + forward_step = Import( + code_object_path=package + ".%s.prior_step" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.prior_init_hook" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + ".%s.prior_finish_hook" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + serializer = TorchCollection( + serializer_objects=serializer_objects, + make_local_package_copy=not debug, + packages={ + package, + }, + ) + + return serializer diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/__init__.py new file mode 100644 index 000000000..6ac5dd240 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/__init__.py @@ -0,0 +1 @@ +PACKAGE = __package__ diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/config.py b/users/hilmes/experiments/nick_setups/standalone_2023/config.py new file mode 
100644 index 000000000..c6536eb6b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/config.py @@ -0,0 +1,156 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from .data.common import TrainingDatasets +from .serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, +) -> ReturnnConfig: + """ + :param training_datasets: datasets for training + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param config: + :param debug: run training in debug mode (linking from recipe instead of copy) + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + + base_config = { + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + + # TODO: maybe make nice + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 500 * 16000, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) 
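+ # With prior=True the serializer exposes the network module's prior_step / prior_init_hook /
+ # prior_finish_hook as the RETURNN forward entry points, so this config estimates the label
+ # prior instead of running recognition.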
+ returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 240 * 16000, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/exp_ls100_1023_base.py new file mode 100644 index 000000000..75d1f3bc2 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/exp_ls100_1023_base.py @@ -0,0 +1,296 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..lm import get_4gram_binary_lm +from ..data.bpe import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, KENLM_BINARY_PATH + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def conformer_baseline(): + prefix_name = "experiments/librispeech/standalone_2023/ls100_ctc_bpe/" + + BPE_SIZE = 300 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + peak_normalization=True, # TODO: this is wrong compared to old setupsa and rescale, better test if it degrades + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + librispeech_key="train-clean-100", + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + 
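+ # Only dev-other is evaluated here; the commented-out line above is the hook for adding
+ # further evaluation sets.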
for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + preemphasis=train_settings.preemphasis, + peak_normalization=train_settings.peak_normalization, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100", bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + 
"beam_threshold": 14, + } + + # DIverged + # train_args = { + # **copy.deepcopy(train_args_adamw03_accum2_jjlr), + # "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + # "net_args": {"model_config_dict": asdict(model_config)}, + # } + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + model_config_start11 = copy.deepcopy(model_config) + model_config_start11.specauc_start_epoch = 11 + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_start11)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + model_config_smaller = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_smaller)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + model_config_smaller_start11 = copy.deepcopy(model_config_smaller) + model_config_smaller_start11.specauc_start_epoch = 11 + train_args_start11 = copy.deepcopy(train_args) + train_args_start11["net_args"]["model_config_dict"] = asdict(model_config_smaller_start11) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, 
prior_scale), + datasets=train_data, + train_args=train_args_start11, + search_args=search_args, + with_prior=True, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/exp_ls100_1023_base.py new file mode 100644 index 000000000..76ae5305a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/exp_ls100_1023_base.py @@ -0,0 +1,340 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..data.phon import build_eow_phon_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT +from ..lm import get_4gram_binary_lm + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def eow_phon_ls100_1023_base(): + prefix_name = "experiments/librispeech/standalone_2023/ls100_ctc_eow_phon/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + peak_normalization=True, # TODO: this is wrong compared to old setupsa and rescale, better test if it degrades + ) + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_eow_phon_training_datasets( + librispeech_key="train-clean-100", + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + preemphasis=train_settings.preemphasis, + peak_normalization=train_settings.peak_normalization, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + 
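+ # remaining search() arguments: the search config, the checkpoint of the last epoch,
+ # the test dataset tuples, and the RETURNN executable / root to run with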
returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 14, + } + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config)}, + } + # diverged with hiccup + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + train_args_gc1 = copy.deepcopy(train_args) + train_args_gc1["config"]["gradient_clip"] = 1.0 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_gc1/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_gc1, + search_args=search_args, + with_prior=True, + ) + + train_args_decay1e_2 = copy.deepcopy(train_args) + train_args_decay1e_2["config"]["optimizer"]["weight_decay"] = 1e-2 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 
0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + ) + + search_args = { + **default_search_args, + "lm_weight": 3.5, + "prior_scale": 0.3, + "sil_score": -1000.0, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_decay1e-2/lm_test1_bs1024_th14", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc_v2", + ) + + search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 16, + "lm_weight": 3.5, + "prior_scale": 0.3, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_decay1e-2/lm_test2_bs1024_th16", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ) + + ###### trying to reproduce 14.5% result from librispeech/librispeech_100_phon_ctc ######### + + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-8, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + }, + } + model_config_small_ff = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_small_ff)}, + } + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + # TODO: add num_epochs 300 + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_legacy_peaknorm_decay1e-2_FF384_accum1/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_small_ff)}, + } + train_args["config"]["accum_grad_multiple_step"] = 2 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_legacy_peaknorm_decay1e-2_FF384_accum2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + 
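# Batch sizes in these configs are counted in raw waveform samples, not sequences:
# 180 * 16000 resp. 200 * 16000 samples correspond to roughly 180 s resp. 200 s of
# 16 kHz audio per batch, and max_seq_length skips utterances longer than 35 s.
SAMPLE_RATE = 16000                         # LibriSpeech sampling rate
batch_size_samples = 200 * SAMPLE_RATE      # as in train_args_adamw_02 above
print(batch_size_samples / SAMPLE_RATE, "seconds of audio per batch")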
search_args=search_args, + with_prior=True, + num_epochs=300, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/bpe.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/bpe.py new file mode 100644 index 000000000..4deb3781e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/data/bpe.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_lexicon +from i6_experiments.common.datasets.librispeech.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from .common import TrainingDatasetSettings, TrainingDatasets, build_training_datasets, DATA_PREFIX +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +@lru_cache() +def get_bpe_datastream(librispeech_key: str, bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + """ + bpe_settings = get_subword_nmt_bpe_v2(corpus_key=librispeech_key, bpe_size=bpe_size, unk_label="") + + # TODO: Try without sequence postfix (seq_postfix=None) + # otherwise every sequence gets a at the end + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def get_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="librispeech_datasets" + ), + bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + + +def get_text_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(librispeech_key=librispeech_key, bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_bpe_training_datasets( + librispeech_key: str, + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = get_ogg_zip_dict("corpora", 
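# Hedged usage sketch for the BPE data helpers above; the concrete bpe_size value is
# only an example and not taken from this setup:
# train_data_bpe = build_bpe_training_datasets(
#     librispeech_key="train-clean-100",
#     bpe_size=300,          # example value, choose to match the intended BPE vocabulary
#     settings=train_settings,
# )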
returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + train_ogg = ogg_zip_dict[librispeech_key] + dev_clean_ogg = ogg_zip_dict["dev-clean"] + dev_other_ogg = ogg_zip_dict["dev-other"] + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/common.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/common.py new file mode 100644 index 000000000..d25d0e764 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/data/common.py @@ -0,0 +1,207 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper +from i6_core.returnn.oggzip import BlissToOggZipJob + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_corpus_dict + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import ( + AudioRawDatastream, + ReturnnAudioRawOptions, +) +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream +from i6_experiments.users.rossenbach.datasets.librispeech import get_mixed_cv_segments + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + +DATA_PREFIX = "experiments/librispeech/2023_standalone/data/" + +# -------------- Dataclasses for configuration and data passing ------------------- + +# here: ( , , ) +EpochWiseFilter = Tuple[int, int, int] + + +@dataclass(frozen=True) +class TrainingDatasets: + train: Dataset + cv: Dataset + devtrain: Dataset + datastreams: Dict[str, Datastream] + prior: Optional[Dataset] + + +@dataclass() +class TrainingDatasetSettings: + # features settings + custom_processing_function: Optional[str] + + # training settings + partition_epoch: int + epoch_wise_filters: List[EpochWiseFilter] + seq_ordering: str + preemphasis: float + peak_normalization: bool + + +# --------------------------- Helper functions ----------------------------------- + + +@lru_cache() +def get_audio_raw_datastream( + preemphasis: Optional[float] = None, peak_normalization: bool = False +) -> AudioRawDatastream: + """ + :param preemphasis: set the pre-emphasis filter factor + :param peak_normalization: normalize every utterance to peak amplitude 1 + """ + audio_datastream = AudioRawDatastream( + available_for_inference=True, + options=ReturnnAudioRawOptions(peak_normalization=peak_normalization, preemphasis=preemphasis), + ) + return audio_datastream + + +def get_zip(name: str, bliss_dataset: tk.Path): + """ + + :param name: + :param bliss_dataset: + :return: + """ + zip_dataset_job = BlissToOggZipJob( + bliss_corpus=bliss_dataset, + no_conversion=True, # for Librispeech we are already having ogg + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + zip_dataset_job.add_alias(DATA_PREFIX + name) + + return zip_dataset_job.out_ogg_zip + + +# --------------------------- Dataset functions ----------------------------------- + + +def build_training_datasets( + train_ogg: tk.Path, + dev_clean_ogg: tk.Path, + dev_other_ogg: tk.Path, + label_datastream: Datastream, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param train_ogg: + :param dev_clean_ogg: + 
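# TrainingDatasetSettings (defined above) is the single configuration object for this
# dataset pipeline; the eow-phon experiment above instantiates it as
#     TrainingDatasetSettings(
#         custom_processing_function=None,
#         partition_epoch=3,
#         epoch_wise_filters=[],
#         seq_ordering="laplace:.1000",
#         preemphasis=0.97,
#         peak_normalization=True,
#     )
# and each epoch_wise_filters entry is a (from_epoch, to_epoch, max_mean_len) tuple
# that is translated into RETURNN's epoch_wise_filter option further below.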
:param dev_other_ogg: + :param label_datastream: + :param settings: + """ + audio_datastream = get_audio_raw_datastream(settings.preemphasis, settings.peak_normalization) + + datastreams = { + "raw_audio": audio_datastream, + "labels": label_datastream, + } + + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} + + training_audio_opts = audio_datastream.as_returnn_audio_opts() + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + def make_meta(dataset: OggZipDataset): + return MetaDataset( + data_map=data_map, datasets={"zip_dataset": dataset}, seq_order_control_dataset="zip_dataset" + ) + + train_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=settings.partition_epoch, + seq_ordering=settings.seq_ordering, + additional_options=additional_opts, + ) + train_dataset = make_meta(train_zip_dataset) + + cv_zip_dataset = OggZipDataset( + files=[dev_clean_ogg, dev_other_ogg], + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + segment_file=get_mixed_cv_segments(), + seq_ordering="sorted_reverse", + ) + cv_dataset = make_meta(cv_zip_dataset) + + devtrain_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + random_subset=3000, + ) + devtrain_dataset = make_meta(devtrain_zip_dataset) + + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta(prior_zip_dataset) + + return TrainingDatasets( + train=train_dataset, + cv=cv_dataset, + devtrain=devtrain_dataset, + datastreams=datastreams, + prior=prior_dataset, + ) + + +@lru_cache() +def build_test_dataset( + dataset_key: str, + preemphasis: Optional[float] = None, + peak_normalization: bool = False, +): + """ + + :param dataset_key: e.g. 
dev-other, which test set to create + :param preemphasis: + :param peak_normalization: + :return: + """ + ogg_zip_dict = get_ogg_zip_dict("corpora", returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + bliss_dict = get_bliss_corpus_dict() + test_ogg = ogg_zip_dict[dataset_key] + + audio_datastream = get_audio_raw_datastream(preemphasis, peak_normalization) + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[test_ogg], audio_options=audio_datastream.as_returnn_audio_opts(), seq_ordering="sorted_reverse" + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return test_dataset, bliss_dict[dataset_key] diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/phon.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/phon.py new file mode 100644 index 000000000..c330f79a4 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/data/phon.py @@ -0,0 +1,142 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk + +from dataclasses import dataclass +from functools import lru_cache +import os +from typing import Any, Dict, List, Optional, Tuple + +from i6_core.returnn.vocabulary import ReturnnVocabFromPhonemeInventory +from i6_core.corpus.transform import ApplyLexiconToCorpusJob +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob + +from i6_experiments.common.datasets.librispeech import ( + get_g2p_augmented_bliss_lexicon_dict, + get_bliss_corpus_dict, + get_ogg_zip_dict, + get_bliss_lexicon, +) + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + + +from .common import get_zip, DATA_PREFIX, build_training_datasets, TrainingDatasets, TrainingDatasetSettings + + +def get_eow_lexicon(librispeech_key: str, with_g2p=True) -> tk.Path: + + """ + get the g2p bliss lexicon with EOW tokens added + :return: + """ + if with_g2p: + lex = get_g2p_augmented_bliss_lexicon_dict( + use_stress_marker=False, add_silence=False, output_prefix="librispeech_datasets" + )[librispeech_key] + else: + lex = get_bliss_lexicon(use_stress_marker=False, add_silence=False, output_prefix="librispeech_datasets") + + return AddEowPhonemesToLexiconJob(lex).out_lexicon + + +def get_eow_bliss(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False) -> tk.Path: + """ + get an EOW modified corpus with optional unknown removed for cross validation + + :param corpus_key: train, dev, test + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="ogg")[librispeech_key] + if remove_unk_seqs: + from i6_core.corpus.filter import FilterCorpusRemoveUnknownWordSegmentsJob + + bliss = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=bliss, + bliss_lexicon=get_eow_lexicon( + librispeech_key=train_librispeech_key, with_g2p=True + ), # cv may include words from g2p + all_unknown=False, + ).out_corpus + + # default train lexicon + lexicon = get_eow_lexicon(librispeech_key=train_librispeech_key, with_g2p=True) + converted_bliss_corpus = ApplyLexiconToCorpusJob(bliss, lexicon, word_separation_orth=None).out_corpus + + return converted_bliss_corpus + + +def get_eow_bliss_and_zip(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False): + """ + :param 
corpus_key: e.g. "train", "dev", or "test, + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: tuple of bliss and zip + """ + + bliss_dataset = get_eow_bliss( + librispeech_key=librispeech_key, train_librispeech_key=train_librispeech_key, remove_unk_seqs=remove_unk_seqs + ) + zip_dataset = get_zip(f"{librispeech_key}_eow", bliss_dataset=bliss_dataset) + + return bliss_dataset, zip_dataset + + +def get_eow_vocab_datastream(librispeech_key: str) -> LabelDatastream: + """ + Phoneme with EOW LabelDatastream for Tedlium-2 + + :param with_blank: datastream for CTC training + """ + lexicon = get_eow_lexicon(librispeech_key=librispeech_key) + returnn_vocab_job = ReturnnVocabFromPhonemeInventory(lexicon) + returnn_vocab_job.add_alias(os.path.join(DATA_PREFIX, f"{librispeech_key}", "eow_returnn_vocab_job")) + + vocab_datastream = LabelDatastream( + available_for_inference=True, vocab=returnn_vocab_job.out_vocab, vocab_size=returnn_vocab_job.out_vocab_size + ) + + return vocab_datastream + + +def get_text_lexicon(librispeech_key: str) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_eow_lexicon(librispeech_key=librispeech_key, with_g2p=False) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_eow_phon_training_datasets( + librispeech_key: str, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_eow_vocab_datastream(librispeech_key=librispeech_key) + + _, train_ogg = get_eow_bliss_and_zip( + librispeech_key=librispeech_key, train_librispeech_key=librispeech_key, remove_unk_seqs=False + ) + _, dev_clean_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-clean", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + _, dev_other_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-other", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/default_tools.py b/users/hilmes/experiments/nick_setups/standalone_2023/default_tools.py new file mode 100644 index 000000000..5c33c776a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/default_tools.py @@ -0,0 +1,20 @@ +from sisyphus import tk +from i6_core.tools.git import CloneGitRepositoryJob + + +# python from apptainer +RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") +MINI_RETURNN_ROOT = tk.Path("/u/hilmes/dev/MiniReturnn", hash_overwrite="LIBRISPEECH_DEFAULT_RETURNN_ROOT") + +from i6_experiments.common.tools.sctk import compile_sctk + +SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +# SCTK_BINARY_PATH = compile_sctk() # use most recent SCTK +SCTK_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_SCTK_BINARY_PATH" + +from i6_core.tools.git import CloneGitRepositoryJob +from i6_core.lm.kenlm import CompileKenLMJob, CreateBinaryLMJob + +kenlm_repo = CloneGitRepositoryJob("https://github.com/kpu/kenlm").out_repository +KENLM_BINARY_PATH = CompileKenLMJob(repository=kenlm_repo).out_binaries +KENLM_BINARY_PATH.hash_overwrite = 
"LIBRISPEECH_DEFAULT_KENLM_BINARY_PATH" diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/lm.py b/users/hilmes/experiments/nick_setups/standalone_2023/lm.py new file mode 100644 index 000000000..42a910233 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/lm.py @@ -0,0 +1,18 @@ +from i6_core.lm.kenlm import CreateBinaryLMJob + +from i6_experiments.common.datasets.librispeech.language_model import get_arpa_lm_dict + +from .default_tools import KENLM_BINARY_PATH + + +def get_4gram_binary_lm(): + """ + + :param output_prefix: + :return: + """ + arpa_4gram_binary_lm_job = CreateBinaryLMJob( + arpa_lm=get_arpa_lm_dict()["4gram"], kenlm_binary_folder=KENLM_BINARY_PATH + ) + arpa_4gram_binary_lm_job.add_alias("experiments/librispeech/standalone_2023/lm/create_4gram_binary_lm") + return arpa_4gram_binary_lm_job.out_lm diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pipeline.py b/users/hilmes/experiments/nick_setups/standalone_2023/pipeline.py new file mode 100644 index 000000000..1327b42db --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pipeline.py @@ -0,0 +1,179 @@ +import copy +import os.path + +from sisyphus import tk + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import ReturnnTrainingJob +from i6_core.returnn.training import GetBestTFCheckpointJob +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from i6_core.returnn.search import SearchBPEtoWordsJob, ReturnnComputeWERJob +from i6_experiments.users.rossenbach.returnn.training import AverageCheckpointsJobV2 + +from .default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, SCTK_BINARY_PATH + + +@tk.block() +def training(prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs): + """ + + :param prefix_name: + :param returnn_config: + :param returnn_exe: + :param returnn_root: + :return: + """ + default_rqmt = { + "mem_rqmt": 15, + "time_rqmt": 168, + "cpu_rqmt": 4, + "log_verbosity": 5, + "returnn_python_exe": returnn_exe, + "returnn_root": returnn_root, + } + + train_job = ReturnnTrainingJob(returnn_config=returnn_config, num_epochs=num_epochs, **default_rqmt) + train_job.add_alias(prefix_name + "/training") + tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates) + + return train_job + + +@tk.block() +def search_single( + prefix_name, + returnn_config, + checkpoint, + recognition_dataset: GenericDataset, + recognition_bliss_corpus, + returnn_exe, + returnn_root, + mem_rqmt=8, + use_gpu=False, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param returnn_standalone.data.datasets.dataset.GenericDataset recognition_dataset: + :param Path recognition_reference: Path to a py-dict format reference file + :param Path returnn_exe: + :param Path returnn_root: + """ + returnn_config = copy.deepcopy(returnn_config) + returnn_config.config["forward"] = recognition_dataset.as_returnn_opts() + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=24, + device="gpu" if use_gpu else "cpu", + cpu_rqmt=2, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["search_out.py"], + ) + search_job.add_alias(prefix_name + "/search_job") + + from i6_core.returnn.search import 
SearchWordsToCTMJob + from i6_core.corpus.convert import CorpusToStmJob + from i6_core.recognition.scoring import ScliteJob + + search_ctm = SearchWordsToCTMJob( + recog_words_file=search_job.out_files["search_out.py"], + bliss_corpus=recognition_bliss_corpus, + ).out_ctm_file + + stm_file = CorpusToStmJob(bliss_corpus=recognition_bliss_corpus).out_stm_path + + sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH) + tk.register_output(prefix_name + "/sclite/wer", sclite_job.out_wer) + tk.register_output(prefix_name + "/sclite/report", sclite_job.out_report_dir) + + return sclite_job.out_wer, search_job + + +@tk.block() +def search(prefix_name, returnn_config, checkpoint, test_dataset_tuples, returnn_exe, returnn_root, use_gpu=False): + """ + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param test_dataset_tuples: + :param returnn_exe: + :param returnn_root: + :return: + """ + # use fixed last checkpoint for now, needs more fine-grained selection / average etc. here + wers = {} + search_jobs = [] + for key, (test_dataset, test_dataset_reference) in test_dataset_tuples.items(): + wers[key], search_job = search_single( + prefix_name + "/%s" % key, + returnn_config, + checkpoint, + test_dataset, + test_dataset_reference, + returnn_exe, + returnn_root, + use_gpu=use_gpu, + ) + search_jobs.append(search_job) + + from i6_core.report import GenerateReportStringJob, MailJob + + format_string_report = ",".join(["{%s_val}" % (prefix_name + key) for key in test_dataset_tuples.keys()]) + format_string = " - ".join( + ["{%s}: {%s_val}" % (prefix_name + key, prefix_name + key) for key in test_dataset_tuples.keys()] + ) + values = {} + values_report = {} + for key in test_dataset_tuples.keys(): + values[prefix_name + key] = key + values["%s_val" % (prefix_name + key)] = wers[key] + values_report["%s_val" % (prefix_name + key)] = wers[key] + + report = GenerateReportStringJob(report_values=values, report_template=format_string, compress=False).out_report + # mail = MailJob(result=report, subject=prefix_name, send_contents=True).out_status + # tk.register_output(os.path.join(prefix_name, "mail_status"), mail) + return format_string_report, values_report, search_jobs + + +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=1, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + search_job.add_alias(prefix_name + "/prior_job") + return search_job.out_files["prior.txt"] diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py new file mode 100644 index 000000000..bd5860dc5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py @@ -0,0 +1,184 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=self.cfg.feature_extraction_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + 
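# torch.nn.functional.ctc_loss expects log-probs as [T, B, C] (hence the permute
# above), targets as [B, S], and the blank index; blank=label_target_size is the last
# output class because final_linear has label_target_size + 1 outputs. The summed loss
# is then normalized by the number of target phonemes via
# mark_as_loss(..., inv_norm_factor=num_phonemes) below.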
input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py new file mode 100644 index 000000000..a57c949fd --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py @@ -0,0 +1,84 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1Config + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + 
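# The prior hooks above estimate the label prior by averaging the per-frame softmax
# outputs over all forwarded frames and writing the result in +log space; schematically:
import numpy as np
summed_probs = np.array([3.0, 1.5, 0.5])       # running sum of softmax outputs per label
summed_frames = 5.0                            # total number of frames seen
log_prior = np.log(summed_probs / summed_frames)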
+@dataclass +class ModelConfig: + feature_extraction_config: LogMelFeatureExtractionV1Config + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["feature_extraction_config"] = LogMelFeatureExtractionV1Config(**d["feature_extraction_config"]) + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig(**d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py new file mode 100644 index 000000000..3012eee33 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" 
+ % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py new file mode 100644 index 000000000..39d942e9b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + 
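# Score correction used in forward_step above (and in the other flashlight decoders):
# an optional constant penalty is subtracted from the blank log-prob (assumed to be
# the last class, as noted in the code) and the label prior is subtracted with a
# scale, i.e. effectively
#     corrected_logprobs = logprobs - prior_scale * log_prior
# which turns the CTC posteriors into roughly likelihood-like scores before the
# lexicon/LM search.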
sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py new file mode 100644 index 000000000..815b283ba --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py @@ -0,0 +1,115 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + 
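# The RTF numbers printed by these decoders are processing time divided by the amount
# of audio processed; e.g. 0.5 s of compute for 2.0 s of audio (32000 samples at
# 16 kHz) gives an AM/search RTF of 0.25.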
else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + # TODO: Check if "[" removal is unnecessary + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py new file 
mode 100644 index 000000000..cabf6d47d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py @@ -0,0 +1,59 @@ +""" +Greedy CTC decoder without any extras +""" + +import time +import torch + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + batch_indices = [] + for lp, l in zip(logprobs, audio_features_len): + batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy()) + + am_time = time.time() - am_start + run_ctx.total_time += am_time + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + + tags = data["seq_tag"] + + for indices, tag in zip(batch_indices, tags): + print(indices) + sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)] + sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py new file mode 100644 index 000000000..c28566b92 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py @@ -0,0 +1,87 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = 
None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py new file mode 100644 index 000000000..8d45dab3f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py @@ -0,0 +1,369 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from .specaugment_fixed import returnn_specaugment_by_length +from .legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
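# Round-trip note for the config classes above: the experiment definitions pass
# dataclasses.asdict(ModelConfig(...)) as net_args["model_config_dict"], and the
# network modules rebuild it with ModelConfig.from_dict(), which re-instantiates the
# nested frontend/specaug (and, in the v6 variant, feature extraction) configs.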
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformulated in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Masked tensor [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.1) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + self.label_target_size = self.cfg.label_target_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
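    # Overview of the forward pass below: squeeze the raw waveform, compute
    # log-mel features without gradients, apply SpecAugment only during
    # training, build a [B, T] mask from the feature lengths, run the
    # transparent Conformer encoder, then final dropout and a linear projection
    # to label_target_size + 1 (CTC blank), followed by log_softmax. The second
    # return value is the number of valid output frames per sequence.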
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + bpe_labels = data["bpe_labels"] # [B, N] (sparse) + bpe_labels_len = data["bpe_labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + bpe_labels, + input_lengths=audio_features_len, + target_lengths=bpe_labels_len, + blank=model.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(bpe_labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + # tokens=labels + ["[blank]", "[SILENCE]"], + tokens=labels + ["[blank]"], + blank_token="[blank]", + # sil_token="[SILENCE]", + sil_token="[blank]", + unk_word="[UNKNOWN2]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + # write empty HDF until new ForwardJob exists + f = open("output.hdf", "wt") + f.write(" ") + f.close() + + +def search_step(*, model: 
Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + print(words) + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/serializer.py b/users/hilmes/experiments/nick_setups/standalone_2023/serializer.py new file mode 100644 index 000000000..63171ae7c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/serializer.py @@ -0,0 +1,109 @@ +import copy +from sisyphus import tk +from typing import Any, Dict, Optional + +from i6_core.tools.git import CloneGitRepositoryJob + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import ExternalImport + +from . 
import PACKAGE + +from i6_experiments.common.setups.serialization import Import, PartialImport + + +def get_pytorch_serializer_v3( + network_module: str, + net_args: Dict[str, Any], + decoder: Optional[str] = None, + decoder_args: Optional[Dict[str, Any]] = None, + post_decoder_args: Optional[Dict[str, Any]] = None, + prior=False, + debug=False, + **kwargs +) -> TorchCollection: + """ + + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param decoder: path to the search decoder, if provided will link search functions + :param decoder_args: + :param post_decoder_args: + :param prior: build config for prior computation + :param debug: run training in debug mode (linking from recipe instead of copy) + :param kwargs: + :return: + """ + package = PACKAGE + ".pytorch_networks" + + pytorch_model_import = PartialImport( + code_object_path=package + ".%s.Model" % network_module, + unhashed_package_root=PACKAGE, + hashed_arguments=net_args, + unhashed_arguments={}, + import_as="get_model", + ) + pytorch_train_step = Import( + code_object_path=package + ".%s.train_step" % network_module, unhashed_package_root=PACKAGE + ) + + # TODO: add flag to switch and maybe move to default tools + # i6_models_repo = CloneGitRepositoryJob( + # url="https://github.com/rwth-i6/i6_models", + # commit="1e94a4d9d1aa48fe3ac7f60de2cd7bd3fea19c3e", + # checkout_folder_name="i6_models" + # ).out_repository + i6_models_repo = tk.Path("/u/hilmes/experiments/nick_asr/i6_models") + i6_models_repo.hash_overwrite = "LIBRISPEECH_DEFAULT_I6_MODELS" + i6_models = ExternalImport(import_path=i6_models_repo) + + serializer_objects = [ + i6_models, + pytorch_model_import, + pytorch_train_step, + ] + if decoder: + # Just a hack to test the phoneme-based recognition + forward_step = Import( + code_object_path=package + ".%s.forward_step" % decoder, + unhashed_package_root=PACKAGE, + ) + init_hook = PartialImport( + code_object_path=package + ".%s.forward_init_hook" % decoder, + unhashed_package_root=PACKAGE, + hashed_arguments=decoder_args or {}, + unhashed_arguments=post_decoder_args or {}, + ) + finish_hook = Import( + code_object_path=package + ".%s.forward_finish_hook" % decoder, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if prior: + forward_step = Import( + code_object_path=package + ".%s.prior_step" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.prior_init_hook" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + ".%s.prior_finish_hook" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + serializer = TorchCollection( + serializer_objects=serializer_objects, + make_local_package_copy=not debug, + packages={ + package, + }, + ) + + return serializer diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/__init__.py new file mode 100644 index 000000000..6ac5dd240 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/__init__.py @@ -0,0 +1 @@ +PACKAGE = __package__ diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/data.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/data.py new file mode 100644 index 000000000..aa3fabdec --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/data.py @@ -0,0 +1,212 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups + +Here are (or rather, should be) the definitions for Tedlium-V2 data and RETURNN datasets that +are consistent across Phon/BPE as well as CTC/RNN-T/Attention systems +""" +from sisyphus import tk + +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple, Union + +from i6_core.returnn import CodeWrapper, BlissToOggZipJob + +from i6_experiments.common.datasets.tedlium2.corpus import get_bliss_corpus_dict + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream, BpeDatastream + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import ( + ReturnnAudioRawOptions, + AudioRawDatastream, +) +from i6_experiments.common.setups.returnn.datasets import Dataset, OggZipDataset, MetaDataset + +from .default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + +DATA_PREFIX = "rescale/tedlium2_standalone_2023/data/" + +# -------------- Dataclasses for configuration and data passing ------------------- + +# here: ( , , ) +EpochWiseFilter = Tuple[int, int, int] + + +@dataclass(frozen=True) +class TrainingDatasets: + train: Dataset + cv: Dataset + devtrain: Dataset + datastreams: Dict[str, Datastream] + prior: Optional[Dataset] + + +@dataclass() +class TrainingDatasetSettings: + # features settings + custom_processing_function: Optional[str] + + # training settings + partition_epoch: int + epoch_wise_filters: List[EpochWiseFilter] + seq_ordering: str + + +# --------------------------- Helper functions ----------------------------------- + + +def get_zip(name: str, bliss_dataset: tk.Path): + """ + + :param name: + :param bliss_dataset: + :return: + """ + zip_dataset_job = BlissToOggZipJob( + bliss_corpus=bliss_dataset, + no_conversion=False, + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + zip_dataset_job.add_alias(DATA_PREFIX + name) + + return zip_dataset_job.out_ogg_zip + + +def get_test_bliss_and_zip(corpus_key): + """ + for now just return the original ogg zip + + :param corpus_key: e.g. 
"train", "dev", "test" + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="wav")[corpus_key] + zip_dataset = BlissToOggZipJob( + bliss_corpus=bliss, + no_conversion=False, + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ).out_ogg_zip + return bliss, zip_dataset + + +@lru_cache() +def get_audio_raw_datastream(): + audio_datastream = AudioRawDatastream( + available_for_inference=True, options=ReturnnAudioRawOptions(peak_normalization=False, preemphasis=0.97) + ) + return audio_datastream + + +# --------------------------- Dataset functions ----------------------------------- + + +def build_training_datasets( + settings: TrainingDatasetSettings, + train_ogg: tk.Path, + dev_ogg: tk.Path, + label_datastream: Union[LabelDatastream, BpeDatastream], +): + """ + builds the training RETURNN datasets using raw audio input for arbitrary label type + + :param settings: configuration object for the dataset pipeline + :param train_ogg: ogg zip for training data + :param dev_ogg: ogg zip for dev data + :param label_datastream: phoneme or bpe datastream + :return: + """ + audio_datastream = get_audio_raw_datastream() + + datastreams = { + "raw_audio": audio_datastream, + "labels": label_datastream, + } + + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} + + training_audio_opts = audio_datastream.as_returnn_audio_opts() + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + def make_meta(dataset: OggZipDataset): + return MetaDataset( + data_map=data_map, datasets={"zip_dataset": dataset}, seq_order_control_dataset="zip_dataset" + ) + + train_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=settings.partition_epoch, + seq_ordering=settings.seq_ordering, + additional_options=additional_opts, + ) + train_dataset = make_meta(train_zip_dataset) + + cv_zip_dataset = OggZipDataset( + files=dev_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + ) + cv_dataset = make_meta(cv_zip_dataset) + + devtrain_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + random_subset=3000, + ) + devtrain_dataset = make_meta(devtrain_zip_dataset) + + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta(prior_zip_dataset) + + return TrainingDatasets( + train=train_dataset, + cv=cv_dataset, + devtrain=devtrain_dataset, + datastreams=datastreams, + prior=prior_dataset, + ) + + +@lru_cache() +def build_test_dataset(dataset_key: str): + """ + :param dataset_key: test dataset to generate ("eval" or "test") + """ + + _, test_ogg = get_test_bliss_and_zip(dataset_key) + bliss_dict = get_bliss_corpus_dict() # unmodified bliss + + audio_datastream = 
get_audio_raw_datastream() + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[test_ogg], + audio_options=audio_datastream.as_returnn_audio_opts(), + seq_ordering="sorted_reverse", + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return test_dataset, bliss_dict[dataset_key] diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/speed_perturbation.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/speed_perturbation.py new file mode 100644 index 000000000..63a564c5e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/speed_perturbation.py @@ -0,0 +1,20 @@ +""" +RETURNN Dataset compatible processing code snippets +""" + + +def legacy_speed_perturbation(audio, sample_rate, random_state): + """ + Use with the old TF setups Rossenbach/Zeineldeen + + :param audio: + :param sample_rate: + :param random_state: + :return: + """ + import librosa + + new_sample_rate = int(sample_rate * (1 + random_state.randint(-1, 2) * 0.1)) + if new_sample_rate != sample_rate: + audio = librosa.core.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate, res_type="kaiser_fast") + return audio diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/default_tools.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/default_tools.py new file mode 100644 index 000000000..6d49ab99d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/default_tools.py @@ -0,0 +1,20 @@ +from sisyphus import tk +from i6_core.tools.git import CloneGitRepositoryJob + + +# python from apptainer +RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") +MINI_RETURNN_ROOT = tk.Path("/u/hilmes/dev/MiniReturnn", hash_overwrite="TEDLIUM2_DEFAULT_RETURNN_ROOT") + +from i6_experiments.common.tools.sctk import compile_sctk + +SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +# SCTK_BINARY_PATH = compile_sctk() # use most recent SCTK +SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH" + +from i6_core.tools.git import CloneGitRepositoryJob +from i6_core.lm.kenlm import CompileKenLMJob, CreateBinaryLMJob + +kenlm_repo = CloneGitRepositoryJob("https://github.com/kpu/kenlm").out_repository +KENLM_BINARY_PATH = CompileKenLMJob(repository=kenlm_repo).out_binaries +KENLM_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_KENLM_BINARY_PATH" diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/config.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/config.py new file mode 100644 index 000000000..86049448b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/config.py @@ -0,0 +1,164 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict, 
Optional, List + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from ..data import TrainingDatasets +from ..flashlight_phon_ctc.serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, + keep_epochs: Optional[List] = None, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these do not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + if keep_epochs is not None: + post_config["cleanup_old_models"] = { + "keep_last_n": 2, + "keep_best_n": 4, + "keep": keep_epochs, + } + + base_config = { + "max_seqs": 60, + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + 
use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine:bool = False, + export:bool = False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 24000 * 160, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + export=export, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/data.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/data.py new file mode 100644 index 000000000..35d50f268 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/data.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper + +from i6_experiments.common.datasets.tedlium2.corpus import get_ogg_zip_dict +from i6_experiments.common.datasets.tedlium2.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.datasets.tedlium2.lexicon import get_bliss_lexicon +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..data import build_training_datasets, TrainingDatasetSettings, TrainingDatasets + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +from ..data import DATA_PREFIX + + +def get_lexicon(bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="tedliumv2_datasets" + ), + bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + 
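# Optional debugging helper (illustrative sketch, not used by the recipes):
# registers the two lexica produced above as Sisyphus outputs so they can be
# inspected in the output folder. The helper name, the output file names and
# the default bpe_size=1000 are assumptions made for this example; only
# get_lexicon / get_text_lexicon, DATA_PREFIX and tk.register_output come from
# this module and Sisyphus.
def _register_debug_lexica(bpe_size: int = 1000) -> None:
    # bliss-style BPE lexicon created from the TED-LIUM lexicon and BPE codes
    bliss_lexicon = get_lexicon(bpe_size=bpe_size)
    # flat word-to-BPE-token lexicon as consumed by the flashlight decoder
    word_lexicon = get_text_lexicon(bpe_size=bpe_size)
    tk.register_output(DATA_PREFIX + "debug/bpe_bliss_lexicon.xml.gz", bliss_lexicon)
    tk.register_output(DATA_PREFIX + "debug/bpe_word_lexicon.txt", word_lexicon)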
+ +def get_text_lexicon(bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def get_bpe_datastream(bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + :param use_v2: subword_nmt had a bug where it would not find python, use corrected version which changes hash + """ + bpe_settings = get_subword_nmt_bpe_v2(bpe_size=bpe_size, unk_label="") + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def build_bpe_training_datasets( + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = get_ogg_zip_dict(returnn_python_exe=RETURNN_EXE, returnn_root=MINI_RETURNN_ROOT) + train_ogg = ogg_zip_dict["train"] + dev_ogg = ogg_zip_dict["dev"] + + return build_training_datasets( + settings=settings, train_ogg=train_ogg, dev_ogg=dev_ogg, label_datastream=label_datastream + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_baseline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_baseline.py new file mode 100644 index 000000000..a02529db8 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_baseline.py @@ -0,0 +1,549 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, List, Optional + +from i6_core.report.report import _Report_Type + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + + +def flash_bpe_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def conformer_baseline(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_bpe_ctc/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + 
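    # See ..data.build_training_datasets: each epoch_wise_filters entry is a
    # (from_epoch, to_epoch, max_mean_len) tuple that is passed to RETURNN as
    # "epoch_wise_filter" (restricting those sub-epochs to shorter sequences),
    # e.g. [(1, 5, 1000)]; the empty list above disables the filter.
    # partition_epoch=5 splits the corpus into 5 RETURNN sub-epochs, and
    # seq_ordering="laplace:.1000" uses laplace length ordering with bins of
    # roughly 1000 sequences.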
train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + eval_epochs: Optional[List] = None, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, keep_epochs=eval_epochs, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if eval_epochs is None: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + for epoch in eval_epochs: + if with_prior: + prior_args = copy.deepcopy(train_args) + if "max_seqs" in prior_args["config"]: + del prior_args["config"]["max_seqs"] + returnn_config = get_prior_config(training_datasets=datasets, **prior_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[epoch], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + epoch=str(epoch) # just for alias generation + ) + tk.register_output(training_name + f"/prior/{epoch}.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + from i6_core.returnn import GetBestPtCheckpointJob + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, 
mail.out_status) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw03_accum2 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 125)) + list(np.linspace(1e-3, 1e-6, 125)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 14, + } + + #### New experiments with corrected FF-Dim + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v2", + "net_args": {"model_config_dict": asdict(model_config)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 97.9, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR" + ) + del results + + from ..pytorch_networks.ctc.conformer_0923 import i6modelsV1_VGG4LayerActFrontendV1_v4_cfg + + model_config_v4_start11 = i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + specauc_start_epoch=11, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + 
final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v4", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_specstart11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_specstart11" + ) + del results + # TODO: This here is the subsampling 4 baseline giving 8.0% with LM 1.6 and prior 0.5 + results = {} + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 130)) + list(np.linspace(7e-4, 7e-5, 230)) + list(np.linspace(7e-5, 1e-8, 140)) + ) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longer/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.7 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longer" + ) + del results + + results = {} + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + [7e-5]) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longerend/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.6 + results=results, + exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longerend" + ) + del results + + results = {} + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + 
search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.9, most likely better due to noise + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11" + ) + del results + # --------------------------------------------------------------------------------------------------------------- # + # SUB 6 from here + + model_config_v4_sub6_start11 = copy.deepcopy(model_config_v4_start11) + model_config_v4_sub6_start11.frontend_config.pool1_stride = (3, 1) + model_config_v4_sub6_start11.frontend_config.pool1_kernel_size = (3, 1) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v4", + "net_args": {"model_config_dict": asdict(model_config_v4_sub6_start11)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_specstart11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # did not converge 98.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_specstart11" + ) + del results + + model_config_sub6 = copy.deepcopy(model_config) + model_config_sub6.frontend_config.pool1_stride = (3, 1) + model_config_sub6.frontend_config.pool1_kernel_size = (3, 1) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3_transparent", + "net_args": {"model_config_dict": asdict(model_config_sub6)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_transparent_sub6/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 97.8 not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_transparent_sub6" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent", + "net_args": {"model_config_dict": asdict(model_config_sub6)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + 
"conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 99.2, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug", + "net_args": {"model_config_dict": asdict(model_config_sub6)}, + } + results = {} + for lm_weight in [1.4, 1.6, 1.8, 2.0]: + for prior_scale in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: + for beam_size in [512, 1024]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + "beam_size": beam_size, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6_latespecaug/lm%.1f_prior%.2f_bs%i_th14" + % (lm_weight, prior_scale, beam_size), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.4 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6_latespecaug" + ) + del results + + train_args_debug = copy.deepcopy(train_args) + train_args_debug["debug"] = True + # greedy + search_args = { + "returnn_vocab": label_datastream.vocab, + } + run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6_latespecaug/greedy", + datasets=train_data, + train_args=train_args_debug, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.greedy_bpe_ctc_v2", + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_pretrain.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_pretrain.py new file mode 100644 index 000000000..851735b38 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_pretrain.py @@ -0,0 +1,1143 @@ +import itertools + +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, List, Optional, Dict +from onnxruntime.quantization.quantize import QuantType, QuantFormat +from onnxruntime.quantization.calibrate import CalibrationMethod + +from i6_core.report.report import _Report_Type +from i6_core.returnn import GetBestPtCheckpointJob, TorchOnnxExportJob + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset, TrainingDatasets +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + + +def flash_bpe_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in 
extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def get_quant_str(num_seqs, quant_mode, activation_type, weight_type, average, sym, quant_ops, quant_format): + if quant_mode == CalibrationMethod.MinMax: + mode_str = "quant_min_max" + elif quant_mode == CalibrationMethod.Entropy: + mode_str = "quant_entropy" + else: + mode_str = "quant_percentile" + mode_str += f"_{num_seqs}" + for x in [activation_type, weight_type]: + if x == QuantType.QInt8: + mode_str += "_QInt8" + elif x == QuantType.QUInt8: + mode_str += "_QUint8" + if average: + mode_str += "_avg" + if sym: + mode_str += "_sym" + if quant_ops is not None: + mode_str += "_" + "_".join(quant_ops) + else: + mode_str += "_full" + if quant_format == QuantFormat.QDQ: + mode_str += "_QDQ" + else: + mode_str += "QOperator" + return mode_str + + +def pretrained_experiments(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_bpe_ctc/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets: TrainingDatasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + eval_epochs: Optional[List] = None, + quantize_args: Optional[Dict[str, str]] = None + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, keep_epochs=eval_epochs, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if eval_epochs is None or "onnx" in ft_name: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + for epoch in eval_epochs: + if with_prior: + prior_args = copy.deepcopy(train_args) + if "max_seqs" in prior_args["config"]: + prior_args["config"]["max_seqs"] = 15 + returnn_config = get_prior_config(training_datasets=datasets, **prior_args) + prior_file = 
compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[epoch], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + epoch=str(epoch) # just for alias generation + ) + tk.register_output(training_name + f"/prior/{epoch}.txt", prior_file) + search_args["prior_file"] = prior_file + if quantize_args is not None: + from i6_experiments.users.hilmes.tools.onnx import ModelQuantizeStaticJob + returnn_export_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder, export=True) + onnx_job = TorchOnnxExportJob( + returnn_config=returnn_export_config, + checkpoint=train_job.out_checkpoints[epoch], + returnn_root=MINI_RETURNN_ROOT, + returnn_python_exe=RETURNN_EXE, + ) + onnx_job.add_alias(ft_name + f"/onnx_export_{epoch}") + quant_job = ModelQuantizeStaticJob( + dataset=datasets.train.as_returnn_opts(), + model=onnx_job.out_onnx_model, + **quantize_args + ) + quant_job.add_alias(ft_name + f"/quantization_{epoch}") + decoder_args = copy.deepcopy(search_args) + decoder_args["quantized_model"] = quant_job.out_model + returnn_search_config = get_search_config(**train_args, decoder_args=decoder_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/quantized_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + #for search_job in search_jobs: + # search_job.add_input(quant_job.out_model) + search_job_ls += search_jobs + report.update(values_report) + else: + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + # from here on onwards, use default Adam with same OCLR + default_search_args = { + "lexicon": get_text_lexicon(bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 14, + } + + from ..pytorch_networks.ctc.conformer_0923 import whisper_pretrained_v2_cfg + + whisper_cfg_2 = whisper_pretrained_v2_cfg.WhisperConfig( + just_encoder=True, + finetune_layer=6, + split_seq=True, + name="base.en", + dropout=0, + ) + model_config_whisper_base_v1 = whisper_pretrained_v2_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + whisper_config=whisper_cfg_2, + ) + train_args_whisper_adam_accum50_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 
1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 50, + }, + "debug": True, + } + eval_epochs = [50, 75, 100, 150, 200, 250] + train_args = { + **copy.deepcopy(train_args_whisper_adam_accum50_jjlr), + "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + "net_args": {"model_config_dict": asdict(model_config_whisper_base_v1)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/whisper_base_pretrain_v5_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + ) + #train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "conformer_0923/whisper_base_pretrain_v5_jjlr" + ) + del results + + # whisper_cfg_1 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=1, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2 = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_1, + # ) + # train_args_whisper_adam_accum30_lr2e5 = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": [2e-5], + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 5, + # "accum_grad_multiple_step": 30, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr2e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_1e-5/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_1e-5" + # ) + # del results + # + # whisper_cfg_1 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=1, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2_later_spec = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=11, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_1, + # ) + # train_args_whisper_adam_accum30_lr1e5 = { + # "config": { + # 
"optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": [1e-5], + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 5, + # "accum_grad_multiple_step": 30, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2_later_spec)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_1e-5_specstart11/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_1e-5_specstart11" + # ) + # del results + # + # model_config_whisper_v2_no_spec = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=5000, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_1, + # ) + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2_no_spec)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_1e-5_nospec/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_1e-5_nospec" + # ) + # del results + # + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr2e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2_no_spec)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_2e-5_nospec/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values 
+ # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_2e-5_nospec" + # ) + # del results + # + # whisper_cfg_tune_2 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=2, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2 = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_tune_2, + # ) + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_2_1e-5/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_2_1e-5" + # ) + # del results + # + # whisper_cfg_tune_3 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=3, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2 = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_tune_3, + # ) + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_3_1e-5/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_3_1e-5" + # ) + # del results + + from ..pytorch_networks.ctc.conformer_0923 import hubert_pretrained_v1_cfg + + hubert_cfg_1 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=1, + name="base-ls960", + ) + model_config_hubert_v1 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_1, + ) + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # 
"learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 25, + # }, + # "debug": True, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_v1)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 8.2 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_jjlr" + # ) + # del results + # + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 10, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_v1)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_smallaccum_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 7.9 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_smallaccum_jjlr" + # ) + # del results + # + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 100, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": 
"ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_v1)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_largeaccum_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # + # if prior_scale == 0.5 and lm_weight == 1.6: + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_largeaccum_jjlr/lm%.1f_prior%.2f_bs1024_th14_onnx" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # decoder="ctc.decoder.flashlight_onnx_bpe_ctc" + # ) + # results.update(wer_values) + # del wer_values + # generate_report( # 8.1 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_largeaccum_jjlr" + # ) + # del results + + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="base-ls960", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_base_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + if lm_weight == 1.8 and prior_scale == 0.5: + epochs = [200] + #num_seqs_ls = [10, 100, 1000] + num_seqs_ls = [10] + quant_modes = [CalibrationMethod.MinMax] + activation_types = [QuantType.QInt8] + weight_types = [QuantType.QInt8] + #average_modes = [True, False] + average_modes = [True] + #sym_modes = [True, False] + sym_modes = [True] + #quant_ops_ls = [None, ["Conv"], ["Linear"], ["Conv", "Linear"]] + quant_ops_ls = [None] + #quant_formats = [QuantFormat.QDQ, QuantFormat.QOperator] + quant_formats = [QuantFormat.QDQ] + for num_seqs, quant_mode, activation_type, 
weight_type, average, sym, quant_ops, quant_format in ( + itertools.product( + num_seqs_ls, quant_modes, activation_types, weight_types, average_modes, + sym_modes, quant_ops_ls, quant_formats)): + quant_str = get_quant_str(num_seqs, quant_mode, activation_type, weight_type, average, sym, quant_ops, quant_format) + train_job, _, _, wer_values = run_exp( + prefix_name + + f"conformer_0923/hubert_pretrain_v3_base_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14_quant/{quant_str}" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=epochs, + decoder="ctc.decoder.flashlight_quantized_bpe_ctc", + quantize_args={ + "num_seqs": num_seqs, + "num_parallel_seqs": 10, + "calibrate_method": CalibrationMethod.MinMax, + "moving_average": average, + "symmetric": sym, + "activation_type": activation_type, + "weight_type": weight_type, + "ops_to_quant": quant_ops, + "quant_format": quant_format, + } + ) + results.update(wer_values) + del wer_values + generate_report( # 7.0 + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_tune2_jjlr" + ) + del results + + # hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + # finetune_layer=3, + # name="base-ls960", + # ) + # model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # hubert_cfg=hubert_cfg_2, + # ) + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 25, + # }, + # "debug": True, + # } + # eval_epochs = [100, 150, 200, 225, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_tune3_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 7.1 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_tune3_jjlr" + # ) + # del results + + # hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + # finetune_layer=2, + # name="base-ls960", + # ) + # model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # hubert_cfg=hubert_cfg_2, + # ) + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 220)) + # + list(np.linspace(7e-4, 7e-5, 220)) + # + list(np.linspace(7e-5, 1e-8, 
60)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 25, + # }, + # "debug": True, + # } + # eval_epochs = [100, 200, 250, 400, 500] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_tune2_longer_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # num_epochs=500 + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 7.2 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_tune2_longer_jjlr" + # ) + # del results + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="large-ls960-ft", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [200, 250] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 5.5 + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr" + ) + del results + + train_args_hubert_adam_accum25_jjlr_longflat = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 130)) + + list(np.linspace(7e-4, 7e-5, 230)) + + list(np.linspace(7e-5, 1e-8, 140)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": False, + } + eval_epochs = [250, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr_longflat), + 
"network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr_longflat/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # TODO 5.3 !! + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr_longflat" + ) + del results + + hubert_cfg_6 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=6, + name="large-ls960-ft", + ) + model_config_hubert_6 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_6, + ) + eval_epochs = [250, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr_longflat), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_6)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_large960_tune6_jjlr_longflat/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large960_tune6_jjlr_longflat" + ) + del results + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="large-ll60k", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 130)) + + list(np.linspace(7e-4, 7e-5, 230)) + + list(np.linspace(7e-5, 1e-8, 140)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [250, 300, 400, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + 
"conformer_0923/hubert_pretrain_v3_large60k_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large60k_tune2_jjlr" + ) + del results + diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/config.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/config.py new file mode 100644 index 000000000..d97f74b26 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/config.py @@ -0,0 +1,162 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict, Optional, List + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from ..data import TrainingDatasets +from .serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, + keep_epochs: Optional[List] = None, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + if keep_epochs is not None: + post_config["cleanup_old_models"] = { + "keep_last_n": 2, + "keep_best_n": 4, + "keep": keep_epochs, + } + + base_config = { + "max_seqs": 60, + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + 
returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 24000 * 160, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/data.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/data.py new file mode 100644 index 000000000..5af44370f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/data.py @@ -0,0 +1,116 @@ +""" + + +""" +import os +from sisyphus import tk + +from i6_core.corpus.transform import ApplyLexiconToCorpusJob +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob +from i6_core.returnn.vocabulary import ReturnnVocabFromPhonemeInventory + +from i6_experiments.common.datasets.tedlium2.corpus import get_bliss_corpus_dict +from i6_experiments.common.datasets.tedlium2.lexicon import get_g2p_augmented_bliss_lexicon, get_bliss_lexicon +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..data 
import TrainingDatasetSettings, TrainingDatasets, build_training_datasets, get_zip +from ..data import DATA_PREFIX + + +def get_eow_lexicon(with_g2p=True) -> tk.Path: + """ + Standard bliss lexicon modified with EOW + :return: + """ + if with_g2p: + lex = get_g2p_augmented_bliss_lexicon(output_prefix="tedliumv2_g2p_datasets") + else: + lex = get_bliss_lexicon(output_prefix="tedliumv2_eow_datasets") + + return AddEowPhonemesToLexiconJob(lex).out_lexicon + + +def get_eow_text_lexicon() -> tk.Path: + """ + + :return: + """ + bliss_lex = get_eow_lexicon(with_g2p=False) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def get_eow_bliss(corpus_key, remove_unk_seqs=False) -> tk.Path: + """ + get an EOW modified corpus with optional unknown removed for cross validation + + :param corpus_key: train, dev, test + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="wav")[corpus_key] + if remove_unk_seqs: + from i6_core.corpus.filter import FilterCorpusRemoveUnknownWordSegmentsJob + + bliss = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=bliss, + bliss_lexicon=get_eow_lexicon(), # assume no g2p when removing unknown for test sets + all_unknown=False, + ).out_corpus + + # default train lexicon + lexicon = get_eow_lexicon(with_g2p=True) + converted_bliss_corpus = ApplyLexiconToCorpusJob(bliss, lexicon, word_separation_orth=None).out_corpus + + return converted_bliss_corpus + + +def get_eow_bliss_and_zip(corpus_key, remove_unk_seqs=False): + """ + :param corpus_key: e.g. "train", "dev", or "test, + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: tuple of bliss and zip + """ + + bliss_dataset = get_eow_bliss(corpus_key=corpus_key, remove_unk_seqs=remove_unk_seqs) + zip_dataset = get_zip(f"{corpus_key}_eow", bliss_dataset=bliss_dataset) + + return bliss_dataset, zip_dataset + + +def get_eow_vocab_datastream() -> LabelDatastream: + """ + Phoneme with EOW LabelDatastream for Tedlium-2 + + :param with_blank: datastream for CTC training + """ + lexicon = get_eow_lexicon() + blacklist = {"[SILENCE]"} + returnn_vocab_job = ReturnnVocabFromPhonemeInventory(lexicon, blacklist=blacklist) + returnn_vocab_job.add_alias(os.path.join(DATA_PREFIX, "eow_returnn_vocab_job")) + + vocab_datastream = LabelDatastream( + available_for_inference=True, vocab=returnn_vocab_job.out_vocab, vocab_size=returnn_vocab_job.out_vocab_size + ) + + return vocab_datastream + + +def build_phon_training_datasets( + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_eow_vocab_datastream() + + _, train_ogg = get_eow_bliss_and_zip("train") + _, dev_ogg = get_eow_bliss_and_zip("dev", remove_unk_seqs=True) + + return build_training_datasets( + settings=settings, train_ogg=train_ogg, dev_ogg=dev_ogg, label_datastream=label_datastream + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_baseline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_baseline.py new file mode 100644 index 000000000..c9eb200e9 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_baseline.py @@ -0,0 +1,1703 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, Optional, List + +from i6_core.report.report import _Report_Type + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_phon_training_datasets, TrainingDatasetSettings, get_eow_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + +def flash_phon_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def conformer_baseline(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_phon_ctc/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_phon_training_datasets(settings=train_settings) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + # TODO: Add binary conversion job + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + eval_epochs: Optional[List] = None, + eval_best: bool = True, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + 
returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + if eval_epochs is None: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + for epoch in eval_epochs: + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + from i6_core.returnn import GetBestPtCheckpointJob + if eval_best: + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_phon_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + from ..pytorch_networks.ctc.conformer_0923.transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg import ( + SpecaugConfig, + TwoLayer1DFrontendConfig, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = TwoLayer1DFrontendConfig( + in_features=80, + conv1_channels=256, + conv2_channels=384, + conv1_kernel_size=5, + conv2_kernel_size=5, + conv1_stride=2, + conv2_stride=2, + dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw03_accum2 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 125)) + list(np.linspace(1e-3, 1e-6, 125)), + ############# + "batch_size": 300 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + default_search_args = { + "lexicon": get_eow_text_lexicon(), + "returnn_vocab": label_datastream.vocab, + "beam_size": 64, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 50, + } + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.transparent_i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + + results = {} + for lm_weight in [1.5, 2.0, 2.5]: + for prior_scale in [0.3, 0.5, 0.75, 1.0]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + 
prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + for pruning in [10, 20, 30, 40, 50]: + search_args = { + **default_search_args, + "lm_weight": 2.0, + "prior_scale": 0.5, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = pruning + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm2.0_prior0.5_bs256_prune%i" % pruning, + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + for pruning in [10, 12, 14, 16, 18, 20]: + # 10 = 10.0 + # 12 = 9.9 + # 14 = 9.9 + # 16 = 9.8 + search_args = { + **default_search_args, + "lm_weight": 2.0, + "prior_scale": 0.5, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = pruning + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm2.0_prior0.5_bs1024_prune%i" % pruning, + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + generate_report( # 9.8 + results=results, exp_name=prefix_name + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit" + ) + del results + + results = {} + # re-tune prior and lm-weight using beampruning 16 + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.0, 0.3, 0.4, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.1f_bs1024_prune16" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.8 + results=results, exp_name=prefix_name + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_bs1024_prune16" + ) + del results + +# Ted-Lium can be larger + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = TwoLayer1DFrontendConfig( + in_features=80, + conv1_channels=512, + conv2_channels=512, + conv1_kernel_size=5, + conv2_kernel_size=5, + conv1_stride=2, + conv2_stride=2, + dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=512, + num_layers=12, + num_heads=8, + ff_dim=2048, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw03_accum2 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 125)) + list(np.linspace(1e-3, 1e-6, 125)), + ############# + "batch_size": 300 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + default_search_args = { + "lexicon": 
get_eow_text_lexicon(), + "returnn_vocab": label_datastream.vocab, + "beam_size": 64, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 50, + } + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.transparent_i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + + results = {} + for lm_weight in [1.5, 2.0, 2.5]: + for prior_scale in [0.3, 0.5, 0.75, 1.0]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_12x512_i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, # 10.2 + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.2 + results=results, exp_name=prefix_name + "conformer_0923/transparent_12x512_i6modelsV1_2x1D_frontend_xavierinit" + ) + del results + + # TODO: not converging same with AMP + # train_args_amp = copy.deepcopy(train_args) + # train_args_amp["config"]["torch_amp_options"] = {"dtype": "float16"} # Pascal / 1080 GPUs can only do float16 + # for lm_weight in [1.5, 2.0, 2.5]: + # for prior_scale in [0.3, 0.5, 0.75, 1.0]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp(prefix_name + "conformer_0923/transparent_12x512_i6modelsV1_2x1D_frontend_xavierinit_amp/lm%.1f_prior%.2f" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args_amp, search_args=search_args, with_prior=True) + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_2x1D_frontend_xavierinit_cfg import ( + SpecaugConfig, + TwoLayer1DFrontendConfig, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = TwoLayer1DFrontendConfig( + in_features=80, + conv1_channels=256, + conv2_channels=384, + conv1_kernel_size=5, + conv2_kernel_size=5, + conv1_stride=2, + conv2_stride=2, + dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + 
"network_module": "ctc.conformer_0923.i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["optimizer"] = {"class": "adam", "epsilon": 1e-16} + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_adam/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.4 + results=results, + exp_name=prefix_name + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_adam" + ) + del results + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=2048, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1/lm%.1f_prior%.2f_bs1024" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.1 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": 
"ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_posenc", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.1 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_convfirst", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.4 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + 
eval_best=False + ) + results.update(wer_values) + del wer_values + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + generate_report( # 7.9 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + + generate_report( # 7.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR" + ) + del results + ###################################################### + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["optimizer"] = {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2} + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + 
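+                # default_search_args presumably holds the shared flashlight decoder settings (lexicon, vocab, ARPA LM, beam defaults); only lm_weight and prior_scale are varied in this sweep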
"lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay-2/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.9 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay-2" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["optimizer"] = {"class": "adamw", "epsilon": 1e-16, "weight_decay": 5e-3} + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay5-3/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay5-3" + ) + del results + + ############################################# + + # Train long basic + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = list(np.linspace(1e-5, 1e-3, 250)) + list(np.linspace(1e-3, 1e-6, 250)) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.6 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500" + ) + del results + + # Train long skewed + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = list(np.linspace(1e-5, 1e-3, 200)) + list(np.linspace(1e-3, 1e-7, 300)) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + 
search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500skewed/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.7 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500skewed" + ) + del results + + bene_model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=6, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(bene_model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_bene_param/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 92.4, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_bene_param" + ) + del results + + # No Subsampling + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config_nosub = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(1, 1), + pool1_stride=(1, 1), + pool1_padding=None, + pool2_kernel_size=(1, 1), + pool2_stride=(1, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config_nosub = ModelConfig( + frontend_config=frontend_config_nosub, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=2048, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_nosub), + }, + } + train_args["config"]["batch_size"] = 150 * 12000 + train_args["config"]["accum_grad_multiple_step"] = 5 + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + 
list(np.linspace(7e-5, 1e-8, 30)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_nosub/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 10.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_nosub" + ) + del results + + #### New experiments with corrected FF-Dim + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v2", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 
2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + + # beam search token + if lm_weight == 2.0 and prior_scale == 0.5: + for bst in [10, 20, 30, 40, 50]: + search_args = copy.deepcopy(search_args) + search_args["beam_size_token"] = bst + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/lm%.1f_prior%.2f_bs1024_th14_bst_%i" + % (lm_weight, prior_scale, bst), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + # if bst == 20: # Does currently not work since SFTF cannot be onnx exported + # _, search_jobs, _, _ = run_exp( + # prefix_name + # + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/lm%.1f_prior%.2f_bs1024_th14_bst_%i_exp1" + # % (lm_weight, prior_scale, bst), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # decoder="ctc.decoder.flashlight_experimental_phoneme_ctc", + # ) + + # Search GRID + for lm_weight in [1.6, 1.8, 2.0, 2.2, 2.4]: # 5 + for prior_scale in [0.0, 0.3, 0.4, 0.5, 0.6, 0.7]: # 5 + for beam_threshold in [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]: # 12 + # for beam_size in [256, 1024, 4096, 8192]: # 4 + for beam_size in [256, 1024]: # 4 + search_args = { + **copy.deepcopy(default_search_args), + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = beam_size + search_args["beam_threshold"] = beam_threshold + search_args["node"] = "intel" + _, search_jobs, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/search_grid_intel_full/lm%.1f_prior%.2f_bs%i_th%i" + % (lm_weight, prior_scale, beam_size, beam_threshold), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + for search_job in search_jobs: + search_job.rqmt["sbatch_args"] = "-p rescale_intel -A rescale_speed" + if beam_size > 1024: + search_job.rqmt["mem"] = 12 + elif beam_size > 4096: + search_job.rqmt["mem"] = 16 + + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR" + ) + del results + + # with speed perturbation + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + "use_speed_perturbation": True, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + 
+ "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_speed/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.4 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_speed" + ) + del results + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ( + ModelConfig as ModelConfigV4, + ) + + model_config_v4 = ModelConfigV4( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_v4), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR" + ) + del results + # TODO: this here above is the best baseline, use as starting point, giving 7.2% with LM 2.2 and Prior 0.7 + + train_args = copy.deepcopy(train_args) + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 220)) + list(np.linspace(7e-4, 7e-5, 220)) + list(np.linspace(7e-5, 1e-8, 60)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_longerJJLR_500ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500 + ) + results.update(wer_values) + del wer_values + generate_report( # 7.3 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_longerJJLR_500ep" + ) + del results + + train_args = copy.deepcopy(train_args) + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 130)) + list(np.linspace(7e-4, 7e-5, 230)) + list(np.linspace(7e-5, 1e-8, 140)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": 
prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_flatterJJLR_500ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500 + ) + results.update(wer_values) + del wer_values + generate_report( # 6.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_flatterJJLR_500ep" + ) + del results + + train_args = copy.deepcopy(train_args) + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 31) + [7e-5]) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_endJJLR_500ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500 + ) + results.update(wer_values) + del wer_values + generate_report( # 6.9 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_endJJLR_500ep" + ) + del results + + model_config_v4_start11 = copy.deepcopy(model_config_v4) + model_config_v4_start11.specauc_start_epoch = 11 + train_args = copy.deepcopy(train_args) + train_args["net_args"]["model_config_dict"] = asdict(model_config_v4_start11) + train_args["config"]["learning_rates"] = list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-5, 150)) + train_args["config"]["batch_size"] = 500 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 1 + train_args["config"]["optimizer"]["weight_decay"] = 1e-2 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_24gb_bs500/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 7.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_24gb_bs500" + ) + del results + + frontend_config_large = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=512, + activation=None, + ) + model_config_large = ModelConfig( + frontend_config=frontend_config_large, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=512, + num_layers=12, + num_heads=4, + ff_dim=2048, + att_weights_dropout=0.2, + 
conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_large), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 100 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum2" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_large), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 100 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum3/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 94.4, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum3" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_large), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 135)) + list(np.linspace(7e-4, 7e-5, 135)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 100 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 4 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum4_300ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + 
train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=300, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum4_300ep" + ) + del results diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_pretrain.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_pretrain.py new file mode 100644 index 000000000..faa97ccf5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_pretrain.py @@ -0,0 +1,320 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, Optional, List + +from i6_core.report.report import _Report_Type + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_phon_training_datasets, TrainingDatasetSettings, get_eow_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + +def flash_phon_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def pretrained_experiments(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_phon_ctc/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_phon_training_datasets(settings=train_settings) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + # TODO: Add binary conversion job + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + eval_epochs: Optional[List] = None, + eval_best: bool = 
True, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, keep_epochs=eval_epochs, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + if eval_epochs is None: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + for epoch in eval_epochs: + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + from i6_core.returnn import GetBestPtCheckpointJob + if eval_best: + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_phon_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + default_search_args = { + "lexicon": get_eow_text_lexicon(), + "returnn_vocab": label_datastream.vocab, + "beam_size": 64, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 50, + } + from ..pytorch_networks.ctc.conformer_0923 import hubert_pretrained_v1_cfg + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="base-ls960", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [250] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + 
**default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "hubert/pretrain_v3_base_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + eval_best=True + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 6.6 + results=results, exp_name=prefix_name + "hubert/pretrain_v3_base_tune2_jjlr" + ) + del results + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="large-ls960-ft", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)) + [7e-5], + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": False, + } + eval_epochs = [250, 300, 350, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "hubert/pretrain_v3_large960_tune2_jjlr_longer/lm%.1f_prior%.2f_bs1024_th14" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + eval_best=True, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "hubert/pretrain_v3_large960_tune2_jjlr_longer" + ) + del results + + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 130)) + + list(np.linspace(7e-4, 7e-5, 230)) + + list(np.linspace(7e-5, 1e-8, 140)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": False, + } + eval_epochs = [250, 300, 350, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "hubert/pretrain_v3_large960_tune2_jjlr_longflat/lm%.1f_prior%.2f_bs1024_th14" % ( + 
lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + eval_best=True, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "hubert/pretrain_v3_large960_tune2_jjlr_longflat" + ) + del results + + diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/serializer.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/serializer.py new file mode 100644 index 000000000..3f154399c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/serializer.py @@ -0,0 +1,114 @@ +import copy +from sisyphus import tk +from typing import Any, Dict, Optional + +from i6_core.tools.git import CloneGitRepositoryJob + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import ExternalImport + +from .. import PACKAGE + +from i6_experiments.common.setups.serialization import Import, PartialImport + + +def get_pytorch_serializer_v3( + network_module: str, + net_args: Dict[str, Any], + decoder: Optional[str] = None, + decoder_args: Optional[Dict[str, Any]] = None, + post_decoder_args: Optional[Dict[str, Any]] = None, + prior: bool = False, + debug: bool = False, + export:bool = False, + **kwargs +) -> TorchCollection: + """ + + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param decoder: path to the search decoder, if provided will link search functions + :param decoder_args: + :param post_decoder_args: + :param prior: build config for prior computation + :param debug: run training in debug mode (linking from recipe instead of copy) + :param kwargs: + :return: + """ + package = PACKAGE + ".pytorch_networks" + + pytorch_model_import = PartialImport( + code_object_path=package + ".%s.Model" % network_module, + unhashed_package_root=PACKAGE, + hashed_arguments=net_args, + unhashed_arguments={}, + import_as="get_model", + ) + pytorch_train_step = Import( + code_object_path=package + ".%s.train_step" % network_module, unhashed_package_root=PACKAGE + ) + # i6_models_repo = CloneGitRepositoryJob( + # url="https://github.com/rwth-i6/i6_models", + # commit="1e94a4d9d1aa48fe3ac7f60de2cd7bd3fea19c3e", + # checkout_folder_name="i6_models" + # ).out_repository + i6_models_repo = tk.Path("/u/hilmes/experiments/nick_asr/i6_models") + i6_models_repo.hash_overwrite = "LIBRISPEECH_DEFAULT_I6_MODELS" + i6_models = ExternalImport(import_path=i6_models_repo) + + serializer_objects = [ + i6_models, + pytorch_model_import, + pytorch_train_step, + ] + if decoder: + # Just a hack to test the phoneme-based recognition + forward_step = Import( + code_object_path=package + ".%s.forward_step" % decoder, + unhashed_package_root=PACKAGE, + ) + init_hook = PartialImport( + code_object_path=package + ".%s.forward_init_hook" % decoder, + unhashed_package_root=PACKAGE, + hashed_arguments=decoder_args or {}, + unhashed_arguments=post_decoder_args or {}, + ) + finish_hook = Import( + code_object_path=package + ".%s.forward_finish_hook" % decoder, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if prior: + forward_step = Import( + code_object_path=package + ".%s.prior_step" % network_module, + 
unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.prior_init_hook" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + ".%s.prior_finish_hook" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if export: + export_step = Import( + code_object_path=package + ".%s.export" % network_module, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([export_step]) + serializer = TorchCollection( + serializer_objects=serializer_objects, + make_local_package_copy=not debug, + packages={ + package, + }, + ) + + return serializer diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pipeline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pipeline.py new file mode 100644 index 000000000..6ec37b9b5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pipeline.py @@ -0,0 +1,188 @@ +import copy +import os.path + +from sisyphus import tk + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import ReturnnTrainingJob +from i6_core.returnn.training import GetBestTFCheckpointJob +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from i6_core.returnn.search import SearchBPEtoWordsJob, ReturnnComputeWERJob +from i6_experiments.users.rossenbach.returnn.training import AverageCheckpointsJobV2 + +from .default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, SCTK_BINARY_PATH + + +@tk.block() +def training(prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs): + """ + + :param prefix_name: + :param returnn_config: + :param returnn_exe: + :param returnn_root: + :return: + """ + default_rqmt = { + "mem_rqmt": 15, + "time_rqmt": 168, + "cpu_rqmt": 4, + "log_verbosity": 5, + "returnn_python_exe": returnn_exe, + "returnn_root": returnn_root, + } + + train_job = ReturnnTrainingJob(returnn_config=returnn_config, num_epochs=num_epochs, **default_rqmt) + train_job.add_alias(prefix_name + "/training") + tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates) + + return train_job + + +@tk.block() +def search_single( + prefix_name, + returnn_config, + checkpoint, + recognition_dataset: GenericDataset, + recognition_bliss_corpus, + returnn_exe, + returnn_root, + mem_rqmt=8, + use_gpu=False, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param returnn_standalone.data.datasets.dataset.GenericDataset recognition_dataset: + :param Path recognition_reference: Path to a py-dict format reference file + :param Path returnn_exe: + :param Path returnn_root: + """ + returnn_config = copy.deepcopy(returnn_config) + returnn_config.config["forward"] = recognition_dataset.as_returnn_opts() + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=24, + device="gpu" if use_gpu else "cpu", + cpu_rqmt=2, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["search_out.py"], + ) + search_job.add_alias(prefix_name + "/search_job") + + search_words = 
SearchBPEtoWordsJob(search_job.out_files["search_out.py"]).out_word_search_results + + from i6_core.returnn.search import SearchWordsToCTMJob + from i6_core.corpus.convert import CorpusToStmJob + from i6_core.recognition.scoring import ScliteJob + + search_ctm = SearchWordsToCTMJob( + recog_words_file=search_words, + bliss_corpus=recognition_bliss_corpus, + ).out_ctm_file + + stm_file = CorpusToStmJob(bliss_corpus=recognition_bliss_corpus).out_stm_path + + sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH) + tk.register_output(prefix_name + "/sclite/wer", sclite_job.out_wer) + tk.register_output(prefix_name + "/sclite/report", sclite_job.out_report_dir) + + return sclite_job.out_wer, search_job + + +@tk.block() +def search(prefix_name, returnn_config, checkpoint, test_dataset_tuples, returnn_exe, returnn_root, use_gpu=False): + """ + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param test_dataset_tuples: + :param returnn_exe: + :param returnn_root: + :return: + """ + # use fixed last checkpoint for now, needs more fine-grained selection / average etc. here + wers = {} + search_jobs = [] + for key, (test_dataset, test_dataset_reference) in test_dataset_tuples.items(): + wers[key], search_job = search_single( + prefix_name + "/%s" % key, + returnn_config, + checkpoint, + test_dataset, + test_dataset_reference, + returnn_exe, + returnn_root, + mem_rqmt=16 if not "whisper" in prefix_name else 64, + use_gpu=use_gpu, + ) + search_jobs.append(search_job) + + from i6_core.report import GenerateReportStringJob, MailJob + + format_string_report = ",".join(["{%s_val}" % (prefix_name + key) for key in test_dataset_tuples.keys()]) + format_string = " - ".join( + ["{%s}: {%s_val}" % (prefix_name + key, prefix_name + key) for key in test_dataset_tuples.keys()] + ) + values = {} + values_report = {} + for key in test_dataset_tuples.keys(): + values[prefix_name + key] = key + values["%s_val" % (prefix_name + key)] = wers[key] + values_report["%s_val" % (prefix_name + key)] = wers[key] + + report = GenerateReportStringJob(report_values=values, report_template=format_string, compress=False).out_report + # mail = MailJob(result=report, subject=prefix_name, send_contents=True).out_status + # tk.register_output(os.path.join(prefix_name, "mail_status"), mail) + return format_string_report, values_report, search_jobs + + +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, + epoch=None +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + :param Optional[str] epoch: alias generation + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=2 if not "whisper" in prefix_name else 4, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + if epoch is None: + epoch = "" + else: + epoch = "/" + epoch + search_job.add_alias(prefix_name + "/prior" + epoch) + return search_job.out_files["prior.txt"] diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/__init__.py new file mode 100644 index 000000000..e69de29bb 
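The *_cfg.py files added below all rely on the same serialization pattern: the experiment scripts pass asdict(model_config) as net_args["model_config_dict"], and the network module rebuilds the typed config with from_dict inside Model.__init__. The following minimal, self-contained sketch illustrates that round trip; the class names and the label_target_size value are simplified stand-ins for illustration, not the real i6_models configurations.

# Standalone sketch of the asdict()/from_dict() round trip used by the configs below.
from dataclasses import asdict, dataclass


@dataclass
class HubertConfigSketch:
    # stand-in for the nested sub-config (e.g. HubertConfig)
    name: str
    finetune_layer: int

    @classmethod
    def from_dict(cls, d):
        return cls(**d)


@dataclass
class ModelConfigSketch:
    # stand-in for ModelConfig; field values below are illustrative only
    specauc_start_epoch: int
    label_target_size: int
    final_dropout: float
    hubert_cfg: HubertConfigSketch

    @classmethod
    def from_dict(cls, d):
        d = d.copy()
        # asdict() flattens nested dataclasses into plain dicts,
        # so nested configs have to be reconstructed explicitly
        d["hubert_cfg"] = HubertConfigSketch.from_dict(d["hubert_cfg"])
        return cls(**d)


cfg = ModelConfigSketch(
    specauc_start_epoch=0,
    label_target_size=79,  # arbitrary example value
    final_dropout=0.2,
    hubert_cfg=HubertConfigSketch(name="base-ls960", finetune_layer=2),
)
serialized = asdict(cfg)                      # what ends up in net_args["model_config_dict"]
restored = ModelConfigSketch.from_dict(serialized)  # what Model.__init__ does with it
assert restored == cfg
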
diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v1_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v1_cfg.py new file mode 100644 index 000000000..6278f0a95 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v1_cfg.py @@ -0,0 +1,70 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class HubertConfig(ModelConfiguration): + name: str + finetune_layer: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return HubertConfig(**d) + + +@dataclass +class ModelConfig: + specauc_start_epoch: int + label_target_size: int + final_dropout: float + hubert_cfg: HubertConfig + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["hubert_cfg"] = HubertConfig.from_dict(d["hubert_cfg"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v3.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v3.py new file mode 100644 index 000000000..285eb6a7a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v3.py @@ -0,0 +1,160 @@ +""" +Same as v1 with fix to finetune layer numbers (range +1) +with additional fix to loading +""" + +import numpy as np +import torch +from torch import nn + +from transformers import HubertModel, HubertConfig +from returnn.torch.context import get_run_ctx +from .hubert_pretrained_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + 
""" + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.hubert_cfg = self.cfg.hubert_cfg + run_ctx = get_run_ctx() + print("TEST", run_ctx.global_step, run_ctx.epoch) + if not run_ctx.global_step and run_ctx.epoch == 1: + print("Load Hubert model parameters") + self.hubert: HubertModel = HubertModel.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.hubert: HubertModel = HubertModel(HubertConfig.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + if self.training: + for param in self.hubert.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.hubert_cfg.finetune_layer + 1): + for name, param in self.hubert.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.hubert.encoder.named_parameters(): + if param.requires_grad: + print(name) + self.final_linear = nn.Linear(self.hubert.config.hidden_size, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + assert any(param.requires_grad for param in self.hubert.parameters()) or self.hubert_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio, dim=-1) + hubert_outputs = self.hubert(input_values=squeezed_features) + encoder_output = hubert_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + return log_probs, self.hubert._get_feat_extract_output_lengths(raw_audio_len) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = 
run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) + + +def export(*, model: Model, f: str, **kwargs): + from torch.onnx import export + model.export_mode = True + dummy_data = torch.randn(1, 30000, 1) + dummy_data_len = torch.IntTensor([30000]) + export( + model, + (dummy_data, dummy_data_len), + f=f, + verbose=True, + input_names=["data", "data_len"], + output_names=["classes"], + dynamic_axes={ + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "classes": {0: "batch", 1: "time"}, + }, + opset_version=17, + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit.py new file mode 100644 index 000000000..af6e468d3 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit.py @@ -0,0 +1,330 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.common import mask_pool + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +from .i6modelsV1_2x1D_frontend_xavierinit_cfg import TwoLayer1DFrontendConfig, ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TwoLayer1DFrontend(nn.Module): + """ + Convolutional Front-End using two 1-D Convolutions + + + - Contains Batch-Norm, but no activation functions. + - Applies absolute positional encoding on the output. + - With additional linear mapping + """ + + def __init__(self, model_cfg: TwoLayer1DFrontendConfig): + """ + :param model_cfg: model configuration for this module + """ + super().__init__() + + model_cfg.check_valid() + + self.cfg = model_cfg + + self.conv1 = nn.Conv1d( + in_channels=model_cfg.in_features, + out_channels=model_cfg.conv1_channels, + kernel_size=model_cfg.conv1_kernel_size, + stride=model_cfg.conv1_stride, + ) + self.conv2 = nn.Conv1d( + in_channels=model_cfg.conv1_channels, + out_channels=model_cfg.conv2_channels, + kernel_size=model_cfg.conv2_kernel_size, + stride=model_cfg.conv2_stride, + ) + + self.bn1 = nn.BatchNorm1d(num_features=model_cfg.conv1_channels) + self.bn2 = nn.BatchNorm1d(num_features=model_cfg.conv2_channels) + self.pos_encoding = ESPNetPositionalEncoding(model_cfg.conv2_channels, model_cfg.dropout) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + T might be reduced to T' or T'' depending on stride of the layers + + stride is only allowed for the pool1 and pool2 operation. 
+ other ops do not have stride configurable -> no update of mask sequence required but added anyway + + :param tensor: input tensor of shape [B,T,F] + :param sequence_mask: the sequence mask for the tensor + :return: torch.Tensor of shape [B,T",F'] and the shape of the sequence mask + """ + tensor = tensor.permute(0, 2, 1) # [B,T,F] -> [B,C,T] + + tensor = self.conv1(tensor) + tensor = self.bn1(tensor) + sequence_mask = mask_pool( + seq_mask=sequence_mask, + kernel_size=self.conv1.kernel_size[0], + stride=self.conv1.stride[0], + padding=self.conv1.padding[0], + ) + + tensor = self.conv2(tensor) + tensor = self.bn2(tensor) + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.conv2.kernel_size[0], + stride=self.conv2.stride[0], + padding=self.conv2.padding[0], + ) + + tensor = tensor.permute(0, 2, 1) # [B,C,T] -> [B, T, hidden] + tensor = self.pos_encoding(tensor) + + return tensor, sequence_mask + + def _calculate_dim(self) -> int: + return self.conv2.out_channels + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=TwoLayer1DFrontend, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + 
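+        # mask_tensor below builds a boolean [B, T] mask that is True for real frames and
+        # False for padding, e.g. audio_features_len = [2, 3] with T = 3 gives
+        # [[True, True, False], [True, True, True]]; the Conformer ignores the masked frames.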
conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_cfg.py new file mode 100644 index 000000000..f65ac2482 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_cfg.py @@ -0,0 +1,95 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass +class TwoLayer1DFrontendConfig(ModelConfiguration): + """ + Attributes: + in_features: number of input features to module + conv1_channels: number of channels for first conv layer + conv2_channels: number of channels for second conv layer + """ + + in_features: int + conv1_channels: int + conv2_channels: int + conv1_kernel_size: int + conv1_stride: int + conv2_kernel_size: int + conv2_stride: int + 
dropout: float + + def check_valid(self): + pass + + def __post__init__(self): + super().__post_init__() + self.check_valid() + + @classmethod + def from_dict(cls, d): + d = d.copy() + return TwoLayer1DFrontendConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: TwoLayer1DFrontendConfig + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = TwoLayer1DFrontendConfig.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1.py new file mode 100644 index 000000000..89762f151 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1.py @@ -0,0 +1,195 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
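+        # All weights keep PyTorch's default initialization here; the *_xavierinit variants in
+        # this directory additionally apply nn.init.xavier_uniform_ to Conv1d and Linear weights.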
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py new file mode 100644 index 000000000..f120d4c5f --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py @@ -0,0 +1,85 @@ +""" +Config objects for the base CTC models v1 till v3 +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst.py new file mode 100644 index 000000000..83002a44d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst.py @@ -0,0 +1,263 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import 
VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ConformerBlockV1ConvFirst(nn.Module): + """ + Conformer block module with convolution first + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class ConformerEncoderV1ConvFirst(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList( + [ConformerBlockV1ConvFirst(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + for module in self.module_list: + x = module(x, sequence_mask) # [B, T, F'] + + return x, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1ConvFirst(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
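+        # self.conformer stacks ConformerBlockV1ConvFirst modules (defined above), so within each
+        # block the convolution is applied before self-attention: FF -> Conv -> MHSA -> FF,
+        # followed by a final LayerNorm.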
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc.py new file mode 100644 index 000000000..480b7f952 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc.py @@ -0,0 +1,255 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. 
+ + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class VGG4LayerActFrontendV1PosEnc(VGG4LayerActFrontendV1): + def __init__(self, cfg: VGG4LayerActFrontendV1Config): + super().__init__(cfg) + self.posenc = ESPNetPositionalEncoding(self.cfg.out_features, 0.1) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + tensor, sequence_mask = super().forward(tensor, sequence_mask) + tensor = self.posenc(tensor) + return tensor, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1PosEnc, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
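+        # VGG4LayerActFrontendV1PosEnc (used as the frontend here) adds ESPNet-style absolute
+        # sinusoidal positional encodings (dropout 0.1) to the frontend output before it enters
+        # the Conformer blocks.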
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst.py new file mode 100644 index 000000000..25aa19a3d --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst.py @@ -0,0 +1,315 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. 
+ + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class ConformerBlockV1ConvFirst(nn.Module): + """ + Conformer block module with convolution first + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class ConformerEncoderV1ConvFirst(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(cfg.frontend.cfg.out_features, 0.1) + self.module_list = torch.nn.ModuleList( + [ConformerBlockV1ConvFirst(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + for module in self.module_list: + x = module(x, sequence_mask) # [B, T, F'] + + return x, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + 
activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1ConvFirst(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit.py new file mode 100644 index 000000000..74c17b41a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit.py @@ -0,0 +1,262 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class VGG4LayerActFrontendV1PosEnc(VGG4LayerActFrontendV1): + def __init__(self, cfg: VGG4LayerActFrontendV1Config): + super().__init__(cfg) + self.posenc = ESPNetPositionalEncoding(self.cfg.out_features, 0.1) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + tensor, sequence_mask = super().forward(tensor, sequence_mask) + tensor = self.posenc(tensor) + return tensor, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1PosEnc, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if 
self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg.py new file mode 100644 index 000000000..39ed46b44 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg.py @@ -0,0 +1,89 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import 
ConformerBlockV1Config, ConformerBlockV1
+from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config
+from i6_models.config import ModuleFactoryV1, ModelConfiguration
+
+
+@dataclass(kw_only=True)
+class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config):
+    activation_str: str = ""
+    activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None
+
+    @classmethod
+    def from_dict(cls, d):
+        d = d.copy()
+        activation_str = d.pop("activation_str")
+        if activation_str == "ReLU":
+            from torch.nn import ReLU
+
+            activation = ReLU()
+        else:
+            assert False, "Unsupported activation %s" % activation_str
+        d["activation"] = activation
+        return VGG4LayerActFrontendV1Config(**d)
+
+
+@dataclass
+class TransparentConformerEncoderV2Config(ModelConfiguration):
+    """
+    Attributes:
+        num_layers: Number of conformer layers in the conformer encoder
+        frontend: A pair of ConformerFrontend and corresponding config
+        block_cfg: Configuration for ConformerBlockV1
+        transparent_weights: initial combination weight per tapped output
+            (key 0 = frontend output, key i = output of conformer block i)
+    """
+
+    num_layers: int
+    transparent_weights: dict[int, float]
+
+    # nested configurations
+    frontend: ModuleFactoryV1
+    block_cfg: ConformerBlockV1Config
+
+
+@dataclass
+class SpecaugConfig(ModelConfiguration):
+    repeat_per_n_frames: int
+    max_dim_time: int
+    num_repeat_feat: int
+    max_dim_feat: int
+
+    @classmethod
+    def from_dict(cls, d):
+        d = d.copy()
+        return SpecaugConfig(**d)
+
+
+@dataclass
+class ModelConfig:
+    frontend_config: VGG4LayerActFrontendV1Config
+    specaug_config: SpecaugConfig
+    label_target_size: int
+    conformer_size: int
+    num_layers: int
+    num_heads: int
+    ff_dim: int
+    att_weights_dropout: float
+    conv_dropout: float
+    ff_dropout: float
+    mhsa_dropout: float
+    conv_kernel_size: int
+    final_dropout: float
+    transparent_weights: dict[int, float]
+
+    @classmethod
+    def from_dict(cls, d):
+        d = d.copy()
+        d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"])
+        d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"])
+        return ModelConfig(**d)
diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2.py
new file mode 100644
index 000000000..2f02ac93b
--- /dev/null
+++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2.py
@@ -0,0 +1,193 @@
+"""
+Like the initial version, but with correctly set FF_dim
+"""
+
+import numpy as np
+import torch
+from torch import nn
+from typing import Tuple
+import math
+
+from i6_models.parts.conformer.norm import LayerNormNC
+from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config
+from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1
+from i6_models.config import ModuleFactoryV1
+from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1
+
+from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config
+from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config
+from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config
+
+from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig
+from ...specaugment import (
+    returnn_specaugment_by_length,
+)
+
+from ...legacy_feature_extraction import 
LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
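+        # Hedged usage sketch (comment only; the example values below are assumptions,
+        # not part of this setup): the module takes a plain config dict and maps raw
+        # audio to CTC log-probs plus down-sampled frame lengths:
+        #   model = Model(model_config_dict=cfg_dict)
+        #   wave = torch.randn(2, 16000, 1)          # [B, T, 1] raw samples
+        #   wave_len = torch.tensor([16000, 12000])  # [B]
+        #   log_probs, frame_len = model(raw_audio=wave, raw_audio_len=wave_len)
+        #   # log_probs: [B, frames, label_target_size + 1]; frame_len is used as
+        #   # input_lengths for ctc_loss in train_step below.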
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3.py new file mode 100644 index 000000000..25b5bb663 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3.py @@ -0,0 +1,189 @@ +""" +Like v2, but with i6_models specaugment +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! 
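+        # This variant uses i6_models' specaugment_v1_by_length instead of the
+        # returnn_specaugment_by_length used in v2. The SpecaugConfig fields map onto
+        # its arguments as called in forward() below:
+        #   repeat_per_n_frames -> time_max_mask_per_n_frames
+        #   max_dim_time        -> time_mask_max_size
+        #   num_repeat_feat     -> freq_max_num_masks
+        #   max_dim_feat        -> freq_mask_max_size
+        # (time_min_num_masks and freq_min_num_masks are fixed to 2 here.)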
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_transparent_v2.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_transparent_v2.py new file mode 100644 index 000000000..97b723e66 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_transparent_v2.py @@ -0,0 +1,348 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class ConvFirstConformerBlockV1(nn.Module): + """ + Conformer block module + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList( + [ConvFirstConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, 
self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + if isinstance(module, (torch.nn.MultiheadAttention)): + if module._qkv_same_embed_dim: + print("apply 1/sqrt(2) scaled xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.in_proj_weight, gain=1 / np.sqrt(2)) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = 
model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_v2_transparent_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_v2_transparent_v2.py new file mode 100644 index 000000000..0fb3cb41c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_v2_transparent_v2.py @@ -0,0 +1,352 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class ConvFirstConformerBlockV1(nn.Module): + """ + Conformer block module + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList( + [ConvFirstConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, 
self.cfg.label_target_size + 1) # + CTC blank
+        self.final_dropout = nn.Dropout(p=self.cfg.final_dropout)
+
+        # initialize weights (Xavier uniform; attention projections scaled by 1/sqrt(2))
+        self.apply(self._weight_init)
+
+    @staticmethod
+    def _weight_init(module: torch.nn.Module):
+        from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
+
+        if isinstance(module, NonDynamicallyQuantizableLinear):
+            nn.init.xavier_uniform_(module.weight, gain=1 / np.sqrt(2))
+        elif isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)):
+            print("apply xavier uniform weight init for %s" % str(module))
+            nn.init.xavier_uniform_(module.weight)
+        elif isinstance(module, (torch.nn.MultiheadAttention)):
+            if module._qkv_same_embed_dim:
+                print("apply 1/sqrt(2) scaled xavier uniform weight init for %s" % str(module))
+                nn.init.xavier_uniform_(module.in_proj_weight, gain=1 / np.sqrt(2))
+
+    def forward(
+        self,
+        raw_audio: torch.Tensor,
+        raw_audio_len: torch.Tensor,
+    ):
+        """
+        :param raw_audio: Audio samples as [B, T, 1]
+        :param raw_audio_len: length of T as [B]
+        :return: logprobs [B, T, #labels + blank]
+        """
+
+        squeezed_features = torch.squeeze(raw_audio)
+        with torch.no_grad():
+            audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len)
+
+        if self.training:
+            audio_features_masked_2 = specaugment_v1_by_length(
+                audio_features,
+                time_min_num_masks=2,
+                time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames,
+                time_mask_max_size=self.cfg.specaug_config.max_dim_time,
+                freq_min_num_masks=2,
+                freq_mask_max_size=self.cfg.specaug_config.max_dim_feat,
+                freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat,
+            )
+        else:
+            audio_features_masked_2 = audio_features
+
+        conformer_in = audio_features_masked_2
+        # create the mask for the conformer input
+        mask = mask_tensor(conformer_in, audio_features_len)
+
+        conformer_out, out_mask = self.conformer(conformer_in, mask)
+        conformer_out = self.final_dropout(conformer_out)
+        logits = self.final_linear(conformer_out)
+
+        log_probs = torch.log_softmax(logits, dim=2)
+
+        return log_probs, torch.sum(out_mask, dim=1)
+
+
+def train_step(*, model: Model, data, run_ctx, **kwargs):
+
+    raw_audio = data["raw_audio"]  # [B, T', F]
+    raw_audio_len = data["raw_audio:size1"]  # [B]
+
+    labels = data["labels"]  # [B, N] (sparse)
+    labels_len = data["labels:size1"]  # [B, N]
+
+    logprobs, audio_features_len = model(
+        raw_audio=raw_audio,
+        raw_audio_len=raw_audio_len,
+    )
+    transposed_logprobs = torch.permute(logprobs, (1, 0, 2))  # CTC needs [T, B, F]
+    ctc_loss = nn.functional.ctc_loss(
+        transposed_logprobs,
+        labels,
+        input_lengths=audio_features_len,
+        target_lengths=labels_len,
+        blank=model.cfg.label_target_size,
+        reduction="sum",
+        zero_infinity=True,
+    )
+    num_phonemes = torch.sum(labels_len)
+    run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes)
+
+
+def prior_init_hook(run_ctx, **kwargs):
+    # accumulate summed label posteriors and frame counts over the forward data
+    # to estimate the CTC label prior (written to prior.txt in prior_finish_hook)
+    run_ctx.sum_probs = None
+    run_ctx.sum_frames = 0
+
+
+def prior_finish_hook(run_ctx, **kwargs):
+    all_frames = run_ctx.sum_frames.detach().cpu().numpy()
+    all_probs = run_ctx.sum_probs.detach().cpu().numpy()
+    average_probs = all_probs / all_frames
+    log_average_probs = np.log(average_probs)
+    print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs))
+    with open("prior.txt", "w") as f:
+        np.savetxt(f, log_average_probs, delimiter=" ")
+    print("Saved prior in prior.txt in +log space.")
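+
+
+# Illustrative sketch: how the prior written above is typically consumed at search
+# time. This helper is hypothetical (it is not referenced anywhere in this setup);
+# only the file format (one natural-log probability per label, whitespace separated)
+# follows from prior_finish_hook. The "prior_scale" weight is an assumed decoder
+# parameter.
+def _apply_label_prior_sketch(logprobs: torch.Tensor, prior_file: str, prior_scale: float) -> torch.Tensor:
+    """
+    :param logprobs: CTC log-probs [B, T, #labels + blank] as returned by Model.forward
+    :param prior_file: path to a "prior.txt" written by prior_finish_hook
+    :param prior_scale: interpolation weight for the prior subtraction
+    :return: prior-corrected scores [B, T, #labels + blank] for beam search
+    """
+    log_prior = np.loadtxt(prior_file)  # [#labels + blank], natural log space
+    log_prior_t = torch.tensor(log_prior, device=logprobs.device, dtype=logprobs.dtype)
+    return logprobs - prior_scale * log_prior_t[None, None, :]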
+ + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent.py new file mode 100644 index 000000000..09b73d036 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent.py @@ -0,0 +1,286 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + 
), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + 
raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug.py new file mode 100644 index 000000000..ea8c7ab4e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug.py @@ -0,0 +1,289 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + 
), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch > 10: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + 
raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_v2.py new file mode 100644 index 000000000..35f4adf1f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_v2.py @@ -0,0 +1,301 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = 
self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = 
run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_xavierinit_transparent_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_xavierinit_transparent_v2.py new file mode 100644 index 000000000..185de8ca9 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_xavierinit_transparent_v2.py @@ -0,0 +1,313 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. 
+ """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + 
+ return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + if isinstance(module, (torch.nn.MultiheadAttention)): + if module._qkv_same_embed_dim: + print("apply 1/sqrt(2) scaled xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.in_proj_weight, gain=1 / np.sqrt(2)) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + 
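    # RETURNN passes the dynamic sequence lengths of a tensor under "<data_key>:size1"
    # (the sizes along axis 1, here the time axis), so the entry read below is the
    # per-sequence number of audio samples belonging to data["raw_audio"].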
raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_transparent.py new file mode 100644 index 000000000..9bc7c8c77 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_transparent.py @@ -0,0 +1,233 @@ +""" +Like v2, but with i6_models specaugment +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function 
is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No 
particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4.py new file mode 100644 index 
000000000..2926a5a1e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4.py @@ -0,0 +1,194 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! 
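What v4 adds over v3 is only the controllable start epoch for SpecAugment: the forward pass below applies specaugment_v1_by_length once the RETURNN epoch has reached specauc_start_epoch. A minimal sketch of that schedule, with a hypothetical start_epoch of 11 (the real check uses run_ctx.epoch and self.specaug_start_epoch):

def specaug_active(training: bool, epoch: int, start_epoch: int) -> bool:
    # mirrors "self.training and run_ctx.epoch >= self.specaug_start_epoch" in forward() below
    return training and epoch >= start_epoch

assert specaug_active(training=True, epoch=1, start_epoch=11) is False    # warm-up, no masking yet
assert specaug_active(training=True, epoch=11, start_epoch=11) is True    # masking from start_epoch on
assert specaug_active(training=False, epoch=99, start_epoch=11) is False  # never outside training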
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py new file mode 100644 index 000000000..c5ff0e77c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py @@ -0,0 +1,86 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5.py new file mode 100644 index 000000000..6f29f7364 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5.py @@ -0,0 +1,194 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models (v5) +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 
import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! 
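mask_tensor above, together with the torch.sum(out_mask, dim=1) in forward further down, is the whole length-handling convention of these networks: a boolean [B, T] mask is carried through the encoder and its row sums give back the (possibly down-sampled) sequence lengths. A small self-contained check of that convention, using hypothetical toy shapes rather than anything from this diff:

import torch

feats = torch.zeros(2, 5, 80)                     # [B, T, F], toy batch
lens = torch.tensor([5, 3])                       # [B]
r = torch.arange(feats.shape[1])                  # same construction as mask_tensor above
mask = torch.less(r[None, :], lens[:, None])      # [B, T] boolean, True = real frame
assert mask.tolist() == [[True, True, True, True, True],
                         [True, True, True, False, False]]
assert torch.sum(mask, dim=1).tolist() == [5, 3]  # lengths recovered, as done for out_mask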
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit.py new file mode 100644 index 000000000..bf3dbf043 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit.py @@ -0,0 +1,201 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False 
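The self.apply(self._weight_init) call just below hands every submodule to _weight_init, so the xavier-uniform rule only has to name the module types it cares about. A toy illustration of that pattern, with hypothetical layer sizes and the same init rule as in this file:

import torch
from torch import nn

def _weight_init(module: torch.nn.Module):
    # same rule as Model._weight_init below: only Conv1d and Linear weights are re-initialized
    if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)):
        nn.init.xavier_uniform_(module.weight)

toy = nn.Sequential(nn.Conv1d(80, 256, kernel_size=3), nn.ReLU(), nn.Linear(256, 100))
toy.apply(_weight_init)  # nn.Module.apply() visits every submodule recursively, then the module itself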
+ + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/__init__.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/hubert_pretrained_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/hubert_pretrained_v1.py new file mode 100644 index 000000000..ed320c7e3 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/hubert_pretrained_v1.py @@ -0,0 +1,136 @@ +""" +Based on i6modelsV1_VGG4LayerActFrontendV1_v5, modified to include Hubert pretraining. +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from transformers import HubertModel + +from returnn.torch.context import get_run_ctx + +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.hubert_pretrained_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.hubert_cfg = self.cfg.hubert_cfg + self.hubert: HubertModel = HubertModel.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.hubert.config.hidden_size, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + for param in self.hubert.parameters(): + param.requires_grad_(False) + for layer_num in range(self.hubert_cfg.finetune_layer): + print(self.hubert.encoder.layers[-layer_num]) + print(layer_num) + for name, param in self.hubert.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + self.final_linear = nn.Linear(self.hubert.config.hidden_size, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
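+ # Fine-tuning note: the loop above freezes all Hubert parameters and then re-enables gradients
+ # for `finetune_layer` encoder layers. Since `range(finetune_layer)` starts at 0 and
+ # `layers[-0]` is `layers[0]`, the first iteration unfreezes the bottom-most layer rather than
+ # the top one; the whisper_pretrained_v4 variant later in this diff switches to
+ # `range(1, finetune_layer + 1)` for exactly this reason. A minimal sketch of the
+ # "unfreeze the last N layers" form (illustrative placeholder names, not applied here):
+ #     for layer_num in range(1, n_finetune_layers + 1):
+ #         for param in encoder.layers[-layer_num].parameters():
+ #             param.requires_grad_(True)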
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + squeezed_features = torch.squeeze(raw_audio, dim=-1) + hubert_outputs = self.hubert(input_values=squeezed_features) + encoder_output = hubert_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + #encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + #encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + return log_probs, self.hubert._get_feat_extract_output_lengths(raw_audio_len) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_modules_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_modules_v1.py new file mode 100644 index 000000000..fb140340d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_modules_v1.py @@ -0,0 +1,89 @@ +from typing import Optional, Iterable +import torch +from torch import nn +import torch.nn.functional as F +import whisper +from whisper.model import MultiHeadAttention, LayerNorm, Linear, Tensor, Conv1d, sinusoids + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, n_state: int, 
n_head: int, cross_attention: bool = False, dropout: float = 0.0): + super().__init__() + + self.attn = MultiHeadAttention(n_state, n_head) + self.attn_ln = LayerNorm(n_state) + + self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + + n_mlp = n_state * 4 + self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)) + self.dropout = nn.Dropout(p=dropout) if dropout != 0.0 else None + self.mlp_ln = LayerNorm(n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0] + if self.cross_attn: + x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0] + y = self.mlp_ln(x) + y = self.mlp[0](y) + y = self.mlp[1](y) + if self.dropout: + y = self.dropout(y) + y = self.mlp[2](y) + x = x + y + return x + + +class Whisper(nn.Module): + def __init__(self, dims: whisper.ModelDimensions, dropout: float): + super().__init__() + self.dims = dims + self.encoder = AudioEncoder( + self.dims.n_mels, + self.dims.n_audio_ctx, + self.dims.n_audio_state, + self.dims.n_audio_head, + self.dims.n_audio_layer, + dropout=dropout, + ) + + def forward(self, mel: torch.Tensor) -> torch.Tensor: + return self.encoder(mel) + + +class AudioEncoder(nn.Module): + def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int, dropout: float): + super().__init__() + self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1) + self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head, dropout=dropout) for _ in range(n_layer)] + ) + self.ln_post = LayerNorm(n_state) + + def forward(self, x: Tensor): + """ + x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = x.permute(0, 2, 1) + + assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape" + x = (x + self.positional_embedding).to(x.dtype) + + for block in self.blocks: + x = block(x) + + x = self.ln_post(x) + return x diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1.py new file mode 100644 index 000000000..33e4d167d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1.py @@ -0,0 +1,196 @@ +""" +Based on i6modelsV1_VGG4LayerActFrontendV1_v5, modified to include whisper pretraining. 
+""" + +import numpy as np +import torch +from torch import nn + +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +import whisper +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.old_unusued.whisper_modules_v1 import \ + Whisper +from whisper.audio import N_FRAMES, pad_or_trim + +from returnn.torch.context import get_run_ctx + +from .whisper_pretrained_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + if self.whisper_cfg.just_encoder: + with open(f"/work/asr4/hilmes/debug/whisper/{self.whisper_cfg.name}.pt", "rb") as f: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.whisper_checkpoint = torch.load(f, map_location=device) + self.whisper_dims = whisper.ModelDimensions(**self.whisper_checkpoint["dims"]) + self.whisper = Whisper(self.whisper_dims, self.whisper_cfg.dropout) + else: + raise NotImplementedError + + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.dims.n_audio_state, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + self.model_dict = self.whisper.state_dict() + print(self.model_dict.keys()) + pretrained_dict = {k: v for k, v in self.whisper_checkpoint["model_state_dict"].items() if k in self.model_dict} + print(pretrained_dict.keys()) + self.whisper.load_state_dict(pretrained_dict) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(self.whisper_cfg.finetune_layer): + for param in self.whisper.encoder.blocks[-layer_num].parameters(): + param.requires_grad_(True) + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) + if audio_features_masked_2.shape[-1] > N_FRAMES: + audio_features_masked_2 = pad_or_trim(audio_features_masked_2, 2 * N_FRAMES, axis=-1) + trans_audio_mel_features_1: torch.Tensor = audio_features_masked_2.index_select( + dim=-1, index=torch.arange(end=N_FRAMES, device=audio_features_masked_2.device) + ) + trans_audio_mel_features_2: torch.Tensor = audio_features_masked_2.index_select( + dim=-1, index=torch.arange(start=N_FRAMES, end=2 * N_FRAMES, device=audio_features_masked_2.device) + ) + x_1: torch.Tensor = self.whisper.encoder(trans_audio_mel_features_1) + x_1 = self.upsampling_layer(x_1.transpose(1, 2)).transpose(1, 2) + x_2: torch.Tensor = self.whisper.encoder(trans_audio_mel_features_2) + x_2 = self.upsampling_layer(x_2.transpose(1, 2)).transpose(1, 2) + x = torch.cat((x_1, x_2), dim=1) + else: + audio_features_masked_2 = pad_or_trim(audio_features_masked_2, N_FRAMES) + x: torch.Tensor = self.whisper.encoder(audio_features_masked_2) + x = self.upsampling_layer(x.transpose(1, 2)).transpose(1, 2) + # create the mask for the conformer input + out_mask = mask_tensor(x, audio_features_len) + conformer_out = self.final_dropout(x) + conformer_out = conformer_out[:, :audio_features_len.max(), :] + logits = self.final_linear(conformer_out) + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + 
zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1_cfg.py new file mode 100644 index 000000000..1a611d3fa --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1_cfg.py @@ -0,0 +1,77 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class WhisperConfig(ModelConfiguration): + name: str + just_encoder: bool + finetune_layer: int + split_seq: bool + dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return WhisperConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + final_dropout: float + whisper_config: WhisperConfig + + @classmethod + def from_dict(cls, d): 
+ d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + d["whisper_config"] = WhisperConfig.from_dict(d["whisper_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v2.py new file mode 100644 index 000000000..272b44c4e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v2.py @@ -0,0 +1,158 @@ +""" +Based on i6modelsV1_VGG4LayerActFrontendV1_v5, modified to include whisper pretraining. +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(self.whisper_cfg.finetune_layer): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + print(name) + print(param) + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
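+ # Loading note: the WhisperModel built here from the config alone starts with random weights;
+ # the pretrained parameters are only pulled in lazily inside forward() at global_step 0 /
+ # epoch 1. Rebinding self.whisper there discards the requires_grad flags set in this __init__,
+ # and the guarding assert checks `param.require_grad` (missing "s"), which would most likely
+ # raise an AttributeError; the v3 file further down spells it `requires_grad`. This variant
+ # sits under old_unusued/ and is kept only as a reference point for v3/v4.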
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + assert any(param.require_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio) + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + # TODO: try to remove specagument for now + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # 
[B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v3.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v3.py new file mode 100644 index 000000000..95415805d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v3.py @@ -0,0 +1,159 @@ +""" +Same as v2 with fix to finetune layer numbers (range +1) +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(self.whisper_cfg.finetune_layer): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + print(name) + print(param) + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + assert any(param.requires_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio) + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + #audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) # B, F, T + # TODO: try to remove specagument for now + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + 
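+ # Prior-estimation note: this step accumulates the per-frame softmax posteriors over the whole
+ # forward dataset and prior_finish_hook above divides by the total number of frames, so the
+ # value written to prior.txt is
+ #     log_prior[k] = log( (1 / sum_b T_b) * sum_{b,t} p(k | x_{b,t}) ).
+ # Such a label prior is typically subtracted with some scale from the CTC log-posteriors
+ # during search; how it is applied depends on the decoder configuration used in this setup.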
raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v4.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v4.py new file mode 100644 index 000000000..501409628 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v4.py @@ -0,0 +1,164 @@ +""" +v3: with fix to finetune layer numbers (range +1) +v4: change loading of whisper +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from .whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + print("Load Whisper model parameters") + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.whisper_cfg.finetune_layer + 1): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.whisper.encoder.named_parameters(): + if param.requires_grad: + print(name) + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
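+ # v4 loading note: compared to v2/v3, the pretrained Whisper weights are fetched here in
+ # __init__, and only at global_step 0 / epoch 1; on resumed runs the randomly initialized
+ # skeleton is kept, presumably so that the training checkpoint restores the weights instead.
+ # The unfreeze loop now runs over range(1, finetune_layer + 1), i.e. the last `finetune_layer`
+ # encoder layers (the "range +1" fix referred to in the docstrings).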
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + assert any(param.requires_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio) + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + #audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) # B, F, T + # TODO: try to remove specagument for now + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + assert input_features.shape[2] == whisper_outputs.shape[1], (input_features.shape, whisper_outputs.shape) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + 
run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit.py new file mode 100644 index 000000000..5224bad9a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit.py @@ -0,0 +1,371 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.common import mask_pool + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg import TwoLayer1DFrontendConfig, ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TwoLayer1DFrontend(nn.Module): + """ + Convolutional Front-End using two 1-D Convolutions + + + - Contains Batch-Norm, but no activation functions. + - Applies absolute positional encoding on the output. + - With additional linear mapping + """ + + def __init__(self, model_cfg: TwoLayer1DFrontendConfig): + """ + :param model_cfg: model configuration for this module + """ + super().__init__() + + model_cfg.check_valid() + + self.cfg = model_cfg + + self.conv1 = nn.Conv1d( + in_channels=model_cfg.in_features, + out_channels=model_cfg.conv1_channels, + kernel_size=model_cfg.conv1_kernel_size, + stride=model_cfg.conv1_stride, + ) + self.conv2 = nn.Conv1d( + in_channels=model_cfg.conv1_channels, + out_channels=model_cfg.conv2_channels, + kernel_size=model_cfg.conv2_kernel_size, + stride=model_cfg.conv2_stride, + ) + + self.bn1 = nn.BatchNorm1d(num_features=model_cfg.conv1_channels) + self.bn2 = nn.BatchNorm1d(num_features=model_cfg.conv2_channels) + self.pos_encoding = ESPNetPositionalEncoding(model_cfg.conv2_channels, model_cfg.dropout) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + T might be reduced to T' or T'' depending on stride of the layers + + stride is only allowed for the pool1 and pool2 operation. + other ops do not have stride configurable -> no update of mask sequence required but added anyway + + :param tensor: input tensor of shape [B,T,F] + :param sequence_mask: the sequence mask for the tensor + :return: torch.Tensor of shape [B,T",F'] and the shape of the sequence mask + """ + tensor = tensor.permute(0, 2, 1) # [B,T,F] -> [B,C,T] + + tensor = self.conv1(tensor) + tensor = self.bn1(tensor) + sequence_mask = mask_pool( + seq_mask=sequence_mask, + kernel_size=self.conv1.kernel_size[0], + stride=self.conv1.stride[0], + padding=self.conv1.padding[0], + ) + + tensor = self.conv2(tensor) + tensor = self.bn2(tensor) + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.conv2.kernel_size[0], + stride=self.conv2.stride[0], + padding=self.conv2.padding[0], + ) + + tensor = tensor.permute(0, 2, 1) # [B,C,T] -> [B, T, hidden] + tensor = self.pos_encoding(tensor) + + return tensor, sequence_mask + + def _calculate_dim(self) -> int: + return self.conv2.out_channels + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=TwoLayer1DFrontend, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with 
torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg.py new file mode 100644 index 000000000..f65ac2482 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg.py @@ -0,0 +1,95 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + + +from i6_models.assemblies.conformer.conformer_v1 
import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass +class TwoLayer1DFrontendConfig(ModelConfiguration): + """ + Attributes: + in_features: number of input features to module + conv1_channels: number of channels for first conv layer + conv2_channels: number of channels for second conv layer + """ + + in_features: int + conv1_channels: int + conv2_channels: int + conv1_kernel_size: int + conv1_stride: int + conv2_kernel_size: int + conv2_stride: int + dropout: float + + def check_valid(self): + pass + + def __post__init__(self): + super().__post_init__() + self.check_valid() + + @classmethod + def from_dict(cls, d): + d = d.copy() + return TwoLayer1DFrontendConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: TwoLayer1DFrontendConfig + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = TwoLayer1DFrontendConfig.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v2_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v2_cfg.py new file mode 100644 index 000000000..95ddc7f40 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v2_cfg.py @@ -0,0 +1,34 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +from i6_models.config import ModelConfiguration + +@dataclass +class WhisperConfig(ModelConfiguration): + name: str + just_encoder: bool + finetune_layer: int + split_seq: bool + dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return WhisperConfig(**d) + + +@dataclass +class ModelConfig: + specauc_start_epoch: int + label_target_size: int + final_dropout: float + whisper_config: WhisperConfig + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["whisper_config"] = WhisperConfig.from_dict(d["whisper_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v5.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v5.py new file mode 100644 index 000000000..83755541d --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v5.py @@ -0,0 +1,183 @@ +""" +v3: with fix to finetune layer numbers (range +1) +v4: change loading of whisper +v5: add checks for dimensions +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from .whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + print("Load Whisper model parameters") + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.whisper_cfg.finetune_layer + 1): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.whisper.encoder.named_parameters(): + if param.requires_grad: + print(name) + #self.upsampling_layer = torch.nn.ConvTranspose1d( + # in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + #) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
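# --- Editorial sketch, not part of the committed setup ---
# A minimal, self-contained illustration of the freeze-then-unfreeze pattern used in the
# __init__ above: all Whisper parameters are frozen first, then only the last
# `finetune_layer` encoder blocks are re-enabled for training (the `range(1, n + 1)`
# indexing matches the "v3" fix mentioned in the module docstring). The toy encoder
# below is an assumption for illustration only; the real model uses the Hugging Face
# WhisperModel encoder loaded above.
from torch import nn

class _ToyEncoder(nn.Module):
    def __init__(self, num_layers: int = 4, dim: int = 8):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(dim, dim) for _ in range(num_layers)])

def _freeze_all_but_last(encoder: _ToyEncoder, finetune_layer: int) -> None:
    for param in encoder.parameters():
        param.requires_grad_(False)
    # unfreeze the last `finetune_layer` blocks, counting from the top
    for layer_num in range(1, finetune_layer + 1):
        for param in encoder.layers[-layer_num].parameters():
            param.requires_grad_(True)

if __name__ == "__main__":
    enc = _ToyEncoder(num_layers=4)
    _freeze_all_but_last(enc, finetune_layer=2)
    # expect only parameters of layers.2 and layers.3 to remain trainable
    print([name for name, p in enc.named_parameters() if p.requires_grad])
# --- end of editorial sketch ---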
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + assert any(param.requires_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio, dim=-1) + if squeezed_features.shape[1] > 160 * 3000: + squeezed_features2 = squeezed_features[:, 160 * 3000:] + squeezed_features2 = squeezed_features2.cpu().numpy() + features2 = self.whisper_feature_extractor(raw_speech=squeezed_features2, return_tensors="pt", + return_attention_mask=True, sampling_rate=16000) + features2 = features2.to(device="cuda" if torch.cuda.is_available() else "cpu") + audio_features2 = features2["input_features"] + attention_mask2 = features2["attention_mask"] + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features2 = self.whisper._mask_input_features(audio_features2, attention_mask=attention_mask2) + else: + input_features2 = audio_features2 + whisper_outputs2 = self.whisper.encoder(input_features=input_features2) + encoder_output2 = whisper_outputs2.last_hidden_state + encoder_output2 = self.final_dropout(encoder_output2) + logits2 = self.final_linear(encoder_output2) + + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda" if torch.cuda.is_available() else "cpu") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + #audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) # B, F, T + # TODO: try to remove specagument for now + # TODO: fix dev set problems + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + logits = self.final_linear(encoder_output) + if squeezed_features.shape[1] > 160 * 3000: + logits = torch.cat((logits, logits2), dim=1) + attention_mask = torch.cat((attention_mask, attention_mask2), dim=1) + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) // 2 + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the 
ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py new file mode 100644 index 000000000..3012eee33 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + 
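# --- Editorial sketch, not part of the committed setup ---
# Illustration of how the "prior_file" loaded in this init hook is consumed in the
# forward_step below: the file (written by prior_finish_hook with np.savetxt) holds
# log-space label priors, one value per output label including blank, and the decoder
# input is corrected by a scaled subtraction in log space. Function name, shapes, and
# the example scale are assumptions for illustration.
import numpy as np
import torch

def apply_log_prior(logprobs: torch.Tensor, prior_file: str, prior_scale: float) -> torch.Tensor:
    """
    :param logprobs: [B, T, #labels + blank] log-softmax output of the acoustic model
    :param prior_file: text file with #labels + blank log-prior values
    :param prior_scale: weight of the prior correction, e.g. 0.3
    :return: prior-corrected log scores, same shape as logprobs
    """
    log_prior = torch.from_numpy(np.loadtxt(prior_file, dtype="float32"))  # [#labels + blank]
    return logprobs - prior_scale * log_prior  # broadcasts over [B, T]
# --- end of editorial sketch ---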
+ +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_experimental_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_experimental_phoneme_ctc.py new file mode 100644 index 000000000..ced6dd241 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_experimental_phoneme_ctc.py @@ -0,0 +1,137 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of 
the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + run_ctx.graph_model = None + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + if run_ctx.graph_model is None: + from torch.onnx import export + + dummy_data = torch.randn(3, 30000) + dummy_data_len = torch.IntTensor([30000, 20000, 15000]) + export( + model, + (dummy_data, dummy_data_len), + f="/var/tmp/some_model.onnx", + verbose=True, + input_names=["data", "data_len"], + output_names=["classes"], + dynamic_axes={ + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "classes": {0: "batch", 1: "time"}, + }, + opset_version=17, + ) + import onnxruntime as ort + + run_ctx.ort_session = ort.InferenceSession("/var/tmp/some_model.onnx", providers=["CPUExecutionProvider"]) + + am_start = time.time() + logprobs, audio_features_len = run_ctx.ort_session.run( + None, {"data": raw_audio.cpu().numpy(), "data_len": raw_audio_len.cpu().numpy()} + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: 
%s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_onnx_bpe_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_onnx_bpe_ctc.py new file mode 100644 index 000000000..b72af5fff --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_onnx_bpe_ctc.py @@ -0,0 +1,152 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +import os +from torch import nn +from typing import Union + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + run_ctx.ort_session = None + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + if run_ctx.ort_session is None: + from torch.onnx import export + + dummy_data = torch.randn(3, 30000, 1) + dummy_data_len = torch.IntTensor([30000, 20000, 15000]) + export( + model, + (dummy_data, dummy_data_len), + f="/var/tmp/some_model.onnx", + verbose=True, + input_names=["data", "data_len"], + 
output_names=["classes"], + dynamic_axes={ + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "classes": {0: "batch", 1: "time"}, + }, + opset_version=17, + ) + import onnxruntime as ort + sess_options = ort.SessionOptions() + import logging + logging.info(f"Session CPUS: {os.getenv('SLURM_CPUS_PER_TASK')}") + print("Compiled Onnx model") + if os.getenv("SLURM_CPUS_PER_TASK") is not None: + sess_options.intra_op_num_threads = int(os.getenv("SLURM_CPUS_PER_TASK")) + run_ctx.ort_session = ort.InferenceSession("/var/tmp/some_model.onnx", providers=["CPUExecutionProvider"], sess_options=sess_options) + else: + print("Taking existing model.") + + am_start = time.time() + logprobs, audio_features_len = run_ctx.ort_session.run( + None, {"data": raw_audio.cpu().numpy(), "data_len": raw_audio_len.cpu().numpy().astype(np.int32)} + ) + + tags = data["seq_tag"] + + if isinstance(logprobs, torch.Tensor): + logprobs_cpu = logprobs.cpu() + else: + logprobs_cpu = torch.from_numpy(logprobs) + if isinstance(audio_features_len, torch.Tensor): + audio_features_len_cpu = audio_features_len.cpu() + else: + audio_features_len_cpu = torch.from_numpy(audio_features_len) + + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len_cpu) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py new file mode 100644 index 000000000..39d942e9b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + 
["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_quantized_bpe_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_quantized_bpe_ctc.py new file mode 100644 index 000000000..8db276c0e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_quantized_bpe_ctc.py @@ -0,0 +1,144 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import 
numpy as np +import torch +import os +from torch import nn +from typing import Union + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + import logging + print(kwargs) + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + run_ctx.ort_session = None + run_ctx.quantized_model = kwargs.get("quantized_model", None) + print(f"Quantized model path: {run_ctx.quantized_model}") + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + if run_ctx.quantized_model: + import onnxruntime as ort + sess_options = ort.SessionOptions() + import logging + logging.info(f"Session CPUS: {os.getenv('SLURM_CPUS_PER_TASK')}") + print("Compiled Onnx model") + if os.getenv("SLURM_CPUS_PER_TASK") is not None: + sess_options.intra_op_num_threads = int(os.getenv("SLURM_CPUS_PER_TASK")) + run_ctx.ort_session = ort.InferenceSession( + run_ctx.quantized_model, + providers=["CPUExecutionProvider"], + sess_options=sess_options + ) + else: + assert False, "Need quantized Model for this." 
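# --- Editorial sketch, not part of the committed setup ---
# Minimal sketch of the CPU onnxruntime session setup used in this forward_step.
# Pinning intra_op_num_threads to the SLURM allocation keeps onnxruntime from
# oversubscribing the node when several recognition jobs share a machine. The model
# path and the input names/shapes in the commented usage are assumptions that follow
# the export calls shown in the other decoder files ("data" [B, T, 1], "data_len" [B]).
import os
import onnxruntime as ort

def make_cpu_session(onnx_path: str) -> ort.InferenceSession:
    sess_options = ort.SessionOptions()
    slurm_cpus = os.getenv("SLURM_CPUS_PER_TASK")
    if slurm_cpus is not None:
        sess_options.intra_op_num_threads = int(slurm_cpus)
    return ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"], sess_options=sess_options)

# usage (hypothetical paths/shapes):
# session = make_cpu_session("model.onnx")
# logprobs, out_len = session.run(
#     None,
#     {"data": np.zeros((1, 16000, 1), dtype=np.float32), "data_len": np.array([16000], dtype=np.int32)},
# )
# --- end of editorial sketch ---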
+ + am_start = time.time() + logprobs, audio_features_len = run_ctx.ort_session.run( + None, {"data": raw_audio.cpu().numpy(), "data_len": raw_audio_len.cpu().numpy().astype(np.int32)} + ) + + tags = data["seq_tag"] + + if isinstance(logprobs, torch.Tensor): + logprobs_cpu = logprobs.cpu() + else: + logprobs_cpu = torch.from_numpy(logprobs) + if isinstance(audio_features_len, torch.Tensor): + audio_features_len_cpu = audio_features_len.cpu() + else: + audio_features_len_cpu = torch.from_numpy(audio_features_len) + + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len_cpu) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py new file mode 100644 index 000000000..e4f795ee3 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py @@ -0,0 +1,61 @@ +""" +Greedy CTC decoder without any extras +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + batch_indices = [] + for lp, l in zip(logprobs, audio_features_len): + batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy()) + + am_time = time.time() - am_start + run_ctx.total_time += am_time + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / 
audio_len_batch)) + + tags = data["seq_tag"] + + for indices, tag in zip(batch_indices, tags): + print(indices) + sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)] + sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/legacy_feature_extraction.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/legacy_feature_extraction.py new file mode 100644 index 000000000..a6eb0bf1e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/legacy_feature_extraction.py @@ -0,0 +1,110 @@ +__all__ = ["LogMelFeatureExtractionV1", "LogMelFeatureExtractionV1Config"] + +from dataclasses import dataclass +from typing import Optional, Tuple + +from librosa import filters +import torch +from torch import nn + +from i6_models.config import ModelConfiguration + + +@dataclass +class LogMelFeatureExtractionV1Config(ModelConfiguration): + """ + Attributes: + sample_rate: audio sample rate in Hz + win_size: window size in seconds + hop_size: window shift in seconds + f_min: minimum filter frequency in Hz + f_max: maximum filter frequency in Hz + min_amp: minimum amplitude for safe log + num_filters: number of mel windows + center: centered STFT with automatic padding + """ + + sample_rate: int + win_size: float + hop_size: float + f_min: int + f_max: int + min_amp: float + num_filters: int + center: bool + n_fft: Optional[int] = None + + def __post_init__(self) -> None: + super().__post_init__() + assert self.f_max <= self.sample_rate // 2, "f_max can not be larger than half of the sample rate" + assert self.f_min > 0 and self.f_max > 0 and self.sample_rate > 0, "frequencies need to be positive" + assert self.win_size > 0 and self.hop_size > 0, "window settings need to be positive" + assert self.num_filters > 0, "number of filters needs to be positive" + assert self.hop_size <= self.win_size, "using a larger hop size than window size does not make sense" + if self.n_fft is None: + # if n_fft is not given, set n_fft to the window size (in samples) + self.n_fft = int(self.win_size * self.sample_rate) + else: + assert self.n_fft >= self.win_size * self.sample_rate, "n_fft cannot to be smaller than the window size" + + +class LogMelFeatureExtractionV1(nn.Module): + """ + Librosa-compatible log-mel feature extraction using log10. Does not use torchaudio. 
+ + Using it wrapped with torch.no_grad() is recommended if no gradient is needed + """ + + def __init__(self, cfg: LogMelFeatureExtractionV1Config): + super().__init__() + self.register_buffer("n_fft", torch.tensor(cfg.n_fft)) + self.register_buffer("window", torch.hann_window(int(cfg.win_size * cfg.sample_rate))) + self.register_buffer("hop_length", torch.tensor(int(cfg.hop_size * cfg.sample_rate))) + self.register_buffer("min_amp", torch.tensor(cfg.min_amp)) + self.center = cfg.center + self.register_buffer( + "mel_basis", + torch.tensor( + filters.mel( + sr=cfg.sample_rate, + n_fft=int(cfg.sample_rate * cfg.win_size), + n_mels=cfg.num_filters, + fmin=cfg.f_min, + fmax=cfg.f_max, + ) + ), + ) + + def forward(self, raw_audio, length) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param raw_audio: [B, T] + :param length in samples: [B] + :return features as [B,T,F] and length in frames [B] + """ + power_spectrum = ( + torch.abs( + torch.stft( + raw_audio, + n_fft=self.n_fft, + hop_length=self.hop_length, + window=self.window, + center=self.center, + pad_mode="constant", + return_complex=True, + ) + ) + ** 2 + ) + if len(power_spectrum.size()) == 2: + # For some reason torch.stft removes the batch axis for batch sizes of 1, so we need to add it again + power_spectrum = torch.unsqueeze(power_spectrum, 0) + melspec = torch.einsum("...ft,mf->...mt", power_spectrum, self.mel_basis) + log_melspec = torch.log10(torch.max(self.min_amp, melspec)) + feature_data = torch.transpose(log_melspec, 1, 2) + + if self.center: + length = (length // self.hop_length) + 1 + else: + length = ((length - self.n_fft) // self.hop_length) + 1 + + return feature_data, length.int() diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1.py new file mode 100644 index 000000000..521e8519f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1.py @@ -0,0 +1,330 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Sets joiner dropout correctly +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import 
ConformerMHSAV1Config +from transformers import HubertModel, HubertConfig +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .hubert_pretrain_v1_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. 
+ """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths.to("cuda"), target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.hubert_cfg = self.cfg.hubert_cfg + run_ctx = get_run_ctx() + print("TEST", run_ctx.global_step, run_ctx.epoch) + if not run_ctx.global_step and run_ctx.epoch == 1: + print("Load Hubert model parameters") + self.hubert: HubertModel = HubertModel.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.hubert: HubertModel = HubertModel( + HubertConfig.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.hubert.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.hubert_cfg.finetune_layer + 1): + for name, param in self.hubert.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.hubert.encoder.named_parameters(): + if param.requires_grad: + print(name) + + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + dropout=self.cfg.joiner_dropout, + ) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.encoder_out_linear = nn.Linear(self.hubert.config.hidden_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + assert any(param.requires_grad for param in self.hubert.parameters()) or self.hubert_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio, dim=-1) + hubert_outputs = self.hubert(input_values=squeezed_features) + encoder_output = hubert_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.encoder_out_linear(encoder_output) + + encoder_out_lengths = self.hubert._get_feat_extract_output_lengths(raw_audio_len) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=encoder_output, + source_lengths=encoder_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1_cfg.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1_cfg.py new file mode 100644 index 000000000..bf1a3b04d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1_cfg.py @@ -0,0 +1,57 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass +class PredictorConfig(ModelConfiguration): + symbol_embedding_dim: int + emebdding_dropout: float + num_lstm_layers: int + lstm_hidden_dim: int + lstm_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return PredictorConfig(**d) + + +@dataclass +class HubertConfig(ModelConfiguration): + name: str + finetune_layer: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return HubertConfig(**d) + + +@dataclass +class ModelConfig: + predictor_config: PredictorConfig + specauc_start_epoch: int + label_target_size: int + final_dropout: float + joiner_dim: int + joiner_activation: str + joiner_dropout: float + hubert_cfg: HubertConfig + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["predictor_config"] = PredictorConfig.from_dict(d["predictor_config"]) + d["hubert_cfg"] = HubertConfig.from_dict(d["hubert_cfg"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4.py new file mode 100644 index 000000000..aa0e8bc07 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4.py @@ -0,0 +1,370 @@ +""" + +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Args: + num_symbols (int): size of target token lexicon. + output_dim (int): feature dimension of each output sequence element. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_hidden_dim (int): output dimension of each LSTM layer. + lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0) + + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + hidden_size=lstm_hidden_dim, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. 
+ output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + JOINER_DIM = 512 + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + num_symbols=self.cfg.label_target_size + 1, + output_dim=JOINER_DIM, + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + lstm_dropout=0.0, + ) + self.joiner = Joiner( + input_dim=JOINER_DIM, + output_dim=self.cfg.label_target_size + 1, + ) + self.encoder_out_linear = nn.Linear(conformer_size, JOINER_DIM) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + 
raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py new file mode 100644 index 000000000..c5ff0e77c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py @@ -0,0 +1,86 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent.py new file mode 100644 index 000000000..e76513f95 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent.py @@ -0,0 +1,413 @@ +""" + +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Args: + num_symbols (int): size of target token lexicon. + output_dim (int): feature dimension of each output sequence element. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_hidden_dim (int): output dimension of each LSTM layer. + lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0) + + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + hidden_size=lstm_hidden_dim, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. 
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + JOINER_DIM = 512 + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + num_symbols=self.cfg.label_target_size + 1, + output_dim=JOINER_DIM, + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + 
lstm_dropout=0.0, + ) + self.joiner = Joiner( + input_dim=JOINER_DIM, + output_dim=self.cfg.label_target_size + 1, + ) + self.encoder_out_linear = nn.Linear(conformer_size, JOINER_DIM) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! + + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with 
open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor.py new file mode 100644 index 000000000..bfc183d22 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor.py @@ -0,0 +1,414 @@ +""" + +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + hidden_size=lstm_hidden_dim, + batch_first=True, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + # input_tb = input.permute(1, 0) + input_tb = input + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out, lengths, state_out + # return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + JOINER_DIM = 512 + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + num_symbols=self.cfg.label_target_size + 1, + output_dim=JOINER_DIM, + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + lstm_dropout=0.0, + ) + self.joiner = Joiner( + input_dim=JOINER_DIM, + output_dim=self.cfg.label_target_size + 1, + ) + self.encoder_out_linear = nn.Linear(conformer_size, JOINER_DIM) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
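
As a standalone sketch of the transparent combination computed in TransparentConformerEncoderV1.forward above: a softmax over learned per-layer scales mixes the frontend output with the output of every Conformer block. The tensor sizes below are made up for illustration only.

import torch
from torch import nn

num_layers, B, T, F = 3, 2, 10, 8                                 # illustrative sizes only
scales = nn.Parameter(torch.full((num_layers + 1,), 1.0 / (num_layers + 1)))
outputs = [torch.randn(B, T, F) for _ in range(num_layers + 1)]   # frontend output + one output per block

weights = torch.softmax(scales + 0.001, dim=0)                    # [num_layers + 1], sums to 1
combined = sum(w * x for w, x in zip(weights, outputs))           # weighted sum, still [B, T, F]
assert combined.shape == (B, T, F)
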
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + run_ctx = get_run_ctx() + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + if self.training and run_ctx.epoch < self.specaug_start_epoch: + predict_out *= 0 + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = 
data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5.py new file mode 100644 index 000000000..3af72a796 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5.py @@ -0,0 +1,357 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v5_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_cfg.py new file mode 100644 index 000000000..fe9b127e4 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_cfg.py @@ -0,0 +1,103 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class PredictorConfig(ModelConfiguration): + symbol_embedding_dim: int + num_lstm_layers: int + lstm_hidden_dim: int + lstm_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return PredictorConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + predictor_config: PredictorConfig + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + joiner_dim: int + joiner_activation: str + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + d["predictor_config"] = PredictorConfig.from_dict(d["predictor_config"]) + return ModelConfig(**d) diff --git 
a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_transparent.py new file mode 100644 index 000000000..6ebd64d03 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_transparent.py @@ -0,0 +1,402 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v5_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. 
+ + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. 
+ torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + 
cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! + + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = 
run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py new file mode 100644 index 000000000..9d44ffb0c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py @@ -0,0 +1,105 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class PredictorConfig(ModelConfiguration): + symbol_embedding_dim: int + emebdding_dropout: float + num_lstm_layers: int + lstm_hidden_dim: int + lstm_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return PredictorConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + predictor_config: PredictorConfig + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + 
att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + joiner_dim: int + joiner_activation: str + joiner_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + d["predictor_config"] = PredictorConfig.from_dict(d["predictor_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_transparent.py new file mode 100644 index 000000000..cc1a499ca --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_transparent.py @@ -0,0 +1,408 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Has a bug where joiner dropout is not set +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
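+    # Note: the v6 config defines joiner_dropout, but it is not passed to Joiner()
+    # above, so the joiner runs without dropout. This is the bug called out in the
+    # module docstring ("Has a bug where joiner dropout is not set"); the v7 networks
+    # forward dropout=self.cfg.joiner_dropout explicitly.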
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7.py new file mode 100644 index 000000000..d18869944 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7.py @@ -0,0 +1,366 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Sets joiner dropout correctly +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + dropout=self.cfg.joiner_dropout, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent.py new file mode 100644 index 000000000..26a8c8d94 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent.py @@ -0,0 +1,409 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Sets joiner dropout correctly +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + dropout=self.cfg.joiner_dropout, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
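+    # Shape flow for forward(): raw_audio [B, T, 1] -> log-mel features [B, T', 80]
+    # -> transparent conformer encoder + encoder_out_linear -> [B, T'', joiner_dim];
+    # the predictor maps the blank-prepended labels [B, N+1] to [B, N+1, joiner_dim];
+    # the joiner broadcasts both to logits of shape [B, T'', N+1, label_target_size + 1].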
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py new file mode 100644 index 000000000..386d5dfb0 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py @@ -0,0 +1,153 @@ +""" +Experimental RNNT decoder +""" + +from typing import Callable, Dict, List, Optional, Tuple +import time +import numpy as np +import torch +from torch import nn + +from torchaudio.models import RNNT +from .rnnt_beam_search import ModifiedRNNTBeamSearch + +import torch + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Transcriber(nn.Module): + def __init__(self, feature_extraction: nn.Module, encoder: nn.Module, mapping: nn.Module): + super().__init__() + self.feature_extraction = feature_extraction + self.encoder = encoder + self.mapping = mapping + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + + :param input: + :param lengths: + :return: + """ + + squeezed_features = torch.squeeze(input) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, lengths) + + mask = mask_tensor(audio_features, audio_features_len) + + encoder_out, out_mask = self.encoder(audio_features, mask) + encoder_out = self.mapping(encoder_out) + encoder_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + return encoder_out, encoder_out_lengths + + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + output, out_lengths = self.forward(input, lengths) + return output, out_lengths, [[]] + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.rnnt_decoder = None + run_ctx.beam_size = kwargs["beam_size"] + + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + run_ctx.batched_encoder_decoding 
= kwargs.get("batched_encoder_decoding", False) + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + + if run_ctx.rnnt_decoder is None: + print("create RNNT model...") + rnnt_model = RNNT( + transcriber=Transcriber( + feature_extraction=model.feature_extraction, encoder=model.conformer, mapping=model.encoder_out_linear + ), + predictor=model.predictor, + joiner=model.joiner, + ) + run_ctx.rnnt_decoder = ModifiedRNNTBeamSearch( + model=rnnt_model, + blank=model.cfg.label_target_size, + blank_penalty=run_ctx.blank_log_penalty, + ) + print("done!") + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + start = time.time() + tags = data["seq_tag"] + + hyps = [] + + if run_ctx.batched_encoder_decoding: + batched_hypotheses = run_ctx.rnnt_decoder.forward_semi_batched( + input=raw_audio, + length=raw_audio_len, + beam_width=run_ctx.beam_size, + ) + hyps = [hypothesis[0][0][:-1] for hypothesis in batched_hypotheses] # exclude last sentence end token + else: + for i in range(raw_audio.shape[0]): + hypothesis, states = run_ctx.rnnt_decoder.infer( + input=raw_audio[[i]], + length=raw_audio_len[[i]], + beam_width=run_ctx.beam_size, + ) + hyps.append(hypothesis[0][0][:-1]) # exclude last sentence end token + + total_time = time.time() - start + run_ctx.total_time += total_time + + print("Batch-time: %.2f, Batch-RTF: %.3f" % (total_time, total_time / audio_len_batch)) + + for hyp, tag in zip(hyps, tags): + sequence = [run_ctx.labels[idx] for idx in hyp if idx < len(run_ctx.labels)] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/rnnt_beam_search.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/rnnt_beam_search.py new file mode 100644 index 000000000..64fe7ffd1 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/rnnt_beam_search.py @@ -0,0 +1,93 @@ +import torch +from typing import Optional, Callable, List +from torchaudio.models.rnnt_decoder import RNNTBeamSearch, RNNT, Hypothesis, _get_hypo_predictor_out + + +class ModifiedRNNTBeamSearch(RNNTBeamSearch): + r"""Beam search decoder for RNN-T model. + + Modified with blank penalty + + See Also: + * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model. + + Args: + model (RNNT): RNN-T model to use. + blank (int): index of blank token in vocabulary. + temperature (float, optional): temperature to apply to joint network output. + Larger values yield more uniform samples. (Default: 1.0) + hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score + for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns + hypothesis score normalized by token sequence length. (Default: None) + step_max_tokens (int, optional): maximum number of tokens to emit per input time step. 
(Default: 100) + blank_penalty: blank penalty in log space + """ + + def __init__( + self, + model: RNNT, + blank: int, + temperature: float = 1.0, + hypo_sort_key: Optional[Callable[[Hypothesis], float]] = None, + step_max_tokens: int = 100, + blank_penalty: Optional[float] = None, + ) -> None: + super().__init__( + model=model, + blank=blank, + temperature=temperature, + hypo_sort_key=hypo_sort_key, + step_max_tokens=step_max_tokens, + ) + self.blank_penalty = blank_penalty + + def _gen_next_token_probs( + self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device + ) -> torch.Tensor: + one_tensor = torch.tensor([1], device=device) + predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) + joined_out, _, _ = self.model.join( + enc_out, + one_tensor, + predictor_out, + torch.tensor([1] * len(hypos), device=device), + ) # [beam_width, 1, 1, num_tokens] + joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3) + + if self.blank_penalty is not None: + # assumes blank is last + # joined_out[:, :, :, self.blank] -= self.blank_penalty.to(device=joined_out.device) + joined_out[:, :, :, self.blank] -= self.blank_penalty + + return joined_out[:, 0, 0] + + def forward_semi_batched( + self, input: torch.Tensor, length: torch.Tensor, beam_width: int + ) -> List[List[Hypothesis]]: + r"""Performs beam search for the given input sequence. + + T: number of frames; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): sequence of input frames, with shape (B, T, D). + length (torch.Tensor): number of valid frames in input + sequence, (B,). + beam_width (int): beam size to use during search. + + Returns: + List[Hypothesis]: top-``beam_width`` hypotheses found by beam search. + """ + if input.dim() != 3: + raise ValueError("input must be of shape (B, T, D)") + + if length.dim() != 1: + raise ValueError("length must be of shape (B,)") + + enc_out_batched, _ = self.model.transcribe(input, length) + + search_outputs = [] + for enc_out in enc_out_batched: + search_outputs.append(self._search(enc_out.unsqueeze(0), None, beam_width)) + + return search_outputs diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/specaugment.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/specaugment.py new file mode 100644 index 000000000..bff395505 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/specaugment.py @@ -0,0 +1,81 @@ +import torch + + +def _mask(tensor, batch_axis, axis, pos, max_amount): + batch_dim = tensor.shape[batch_axis] + dim = tensor.shape[axis] + amount = torch.randint(low=1, high=max_amount + 1, size=(batch_dim,), dtype=torch.int32).to(device=tensor.device) + pos2 = torch.min(pos + amount, torch.tensor([dim] * batch_dim).to(device=tensor.device)) + idxs = torch.arange(0, dim).to(device=tensor.device).unsqueeze(0) # [1,dim] + pos_bc = pos.unsqueeze(1) # [B,1] + pos2_bc = pos2.unsqueeze(1) # [B,1] + cond = torch.logical_and(torch.greater_equal(idxs, pos_bc), torch.less(idxs, pos2_bc)) # [B,dim] + if batch_axis > axis: + cond = cond.transpose(0, 1) # [dim,B] + cond = torch.reshape( + cond, shape=[tensor.shape[i] if i in (batch_axis, axis) else 1 for i in range(len(tensor.shape))] + ) + tensor = torch.where(cond, 0.0, tensor) + return tensor + + +def _random_mask(tensor, batch_axis, axis, min_num, max_num, max_dims): + batch_dim = tensor.shape[batch_axis] + if min_num >= max_num: + num_masks = 
torch.ones((batch_dim,), dtype=torch.int64) * min_num + else: + num_masks = torch.randint(min_num, max_num, size=(batch_dim,)) # [B] + max_num_masks = num_masks.max().item() + z = -torch.log(-torch.log(torch.rand((batch_dim, tensor.shape[axis])).to(device=tensor.device))) # [B,dim] + _, indices = torch.topk(z, max_num_masks, dim=1) + + # Make num_masks broadcastable to shape of tensor for torch.where. + for i in range(tensor.dim() - 1): + if i < batch_axis: + num_masks = num_masks.unsqueeze(0) + else: + num_masks = num_masks.unsqueeze(-1) + + num_masks = num_masks.to(device=tensor.device) + + for i in range(max_num_masks): + tensor = torch.where(i < num_masks, _mask(tensor, batch_axis, axis, indices[:, i], max_dims), tensor) + + return tensor + + +def returnn_specaugment(tensor: torch.Tensor, time_num_masks, time_mask_max_size, freq_num_masks, freq_mask_max_size): + """ + Returnn like specaugment from legacy rossenbach/zeineldeen attention setups (usually called specaugment_v2 or so) + + :param tensor: + :param time_num_masks: + :param time_mask_max_size: + :param freq_num_masks: + :param freq_mask_max_size: + :return: + """ + assert len(tensor.shape) == 3 + tensor = _random_mask(tensor, 0, 1, 2, time_num_masks, time_mask_max_size) # time masking + tensor = _random_mask(tensor, 0, 2, 2, freq_num_masks, freq_mask_max_size) # freq masking + return tensor + + +def returnn_specaugment_by_length(audio_features, repeat_per_n_frames, max_dim_time, num_repeat_feat, max_dim_feat): + """ + like returnn_specaugment, but with length adaptive num of time masks + + :param audio_features: + :param repeat_per_n_frames: + :param max_dim_time: + :param num_repeat_feat: + :param max_dim_feat: + :return: + """ + return returnn_specaugment( + audio_features, + time_num_masks=audio_features.size(1) // repeat_per_n_frames, + time_mask_max_size=max_dim_time, + freq_num_masks=num_repeat_feat, + freq_mask_max_size=max_dim_feat, + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/config.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/config.py new file mode 100644 index 000000000..8eda0fd9e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/config.py @@ -0,0 +1,151 @@ +import copy +from typing import Any, Dict + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from ..data import TrainingDatasets +from ..flashlight_phon_ctc.serializer import get_pytorch_serializer_v3, PACKAGE + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + use_speed_perturbation=False, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: 
RETURNN training config + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + + base_config = { + "max_seqs": 60, + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 24000 * 160, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + 
use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/data.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/data.py new file mode 100644 index 000000000..35d50f268 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/data.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper + +from i6_experiments.common.datasets.tedlium2.corpus import get_ogg_zip_dict +from i6_experiments.common.datasets.tedlium2.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.datasets.tedlium2.lexicon import get_bliss_lexicon +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..data import build_training_datasets, TrainingDatasetSettings, TrainingDatasets + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +from ..data import DATA_PREFIX + + +def get_lexicon(bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="tedliumv2_datasets" + ), + bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + + +def get_text_lexicon(bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def get_bpe_datastream(bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + :param use_v2: subword_nmt had a bug where it would not find python, use corrected version which changes hash + """ + bpe_settings = get_subword_nmt_bpe_v2(bpe_size=bpe_size, unk_label="") + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def build_bpe_training_datasets( + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = 
get_ogg_zip_dict(returnn_python_exe=RETURNN_EXE, returnn_root=MINI_RETURNN_ROOT) + train_ogg = ogg_zip_dict["train"] + dev_ogg = ogg_zip_dict["dev"] + + return build_training_datasets( + settings=settings, train_ogg=train_ogg, dev_ogg=dev_ogg, label_datastream=label_datastream + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_baseline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_baseline.py new file mode 100644 index 000000000..9177609e8 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_baseline.py @@ -0,0 +1,706 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream +from i6_core.report.report import _Report_Type + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + +def flash_bpe_rnnt_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + +def conformer_rnnt_baseline(): + prefix_name = "experiments/rescale/tedliumv2/torchaudio_bpe_rnnt/baseline/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + num_epochs=250, + decoder="rnnt.decoder.experimental_rnnt_decoder", + with_prior=False, + evaluate_epoch=None, + eval_best=True, + ): + training_name = 
"/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if not evaluate_epoch: + evaluate_epoch = num_epochs + search_job_ls = [] + report = {} + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % evaluate_epoch, + returnn_search_config, + train_job.out_checkpoints[evaluate_epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + use_gpu=search_args.get("use_gpu", False), + ) + search_job_ls += search_jobs + report.update(values_report) + + from i6_core.returnn import GetBestPtCheckpointJob + if eval_best: + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_rnnt") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_rnnt_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": True, + } + + default_search_args = { + "lexicon": get_text_lexicon(bpe_size=BPE_SIZE), # TODO: cleanup + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 14, + } + + #### New experiments with corrected FF-Dim + + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + 
final_dropout=0.2, + specauc_start_epoch=10, + ) + + model_config_sub6 = copy.deepcopy(model_config) + model_config_sub6.frontend_config.pool1_stride = (3, 1) + model_config_sub6.frontend_config.pool1_kernel_size = (3, 1) + + model_config_sub6_later = copy.deepcopy(model_config_sub6) + model_config_sub6_later.specauc_start_epoch = 40 + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor", + "net_args": {"model_config_dict": asdict(model_config_sub6_later)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_transparent_latepredictor/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 14.9 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_transparent_latepredictor/bs12" + ) + del results + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + PredictorConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(3, 1), + pool1_stride=(3, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + lstm_dropout=0.3, + ) + model_config_v5_sub6 = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12", + datasets=train_data, + train_args=train_args, + 
search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 11.3 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12" + ) + del results + + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12_ep134", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + evaluate_epoch=134, + ) + results.update(wer_values) + del wer_values + generate_report( # 13.5 + results=results, exp_name=prefix_name +"conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12_ep134", + ) + del results + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(3, 1), + pool1_stride=(3, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=512, + lstm_dropout=0.3, + ) + model_config_v5_sub6_512lstm = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.4 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_lstm512_transparent/bs12" + ) + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + PredictorConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(3, 1), + pool1_stride=(3, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + 
activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + emebdding_dropout=0.1, + num_lstm_layers=1, + lstm_hidden_dim=512, + lstm_dropout=0.3, + ) + model_config_v5_sub6_512lstm = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + joiner_dropout=0.1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12" + ) + del results + + results = {} + for beam_size in [1, 2, 4, 8, 12, 16, 20, 24, 32, 64, 128]: + search_args_gpu = { + "beam_size": beam_size, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs%u_gpu" + % beam_size, + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/base" + ) + del results + + search_args_gpu = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + "batched_encoder_decoding": True, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12_gpu_batched", + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12_gpu_batched" + ) + del results + + results = {} + for blank_log_penalty in [0.1, 0.2, 0.3]: + search_args_gpu = { + "beam_size": 16, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + "blank_log_penalty": blank_log_penalty, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs16_bp%.1f_gpu" + % blank_log_penalty, + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + results.update(wer_values) + del wer_values + 
generate_report( # 10.0 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs_16_penalty" + ) + del results + + train_args_const20 = copy.deepcopy(train_args) + train_args_const20["config"]["learning_rates"] = ( + list(np.linspace(1e-4, 1e-4, 20)) + + list(np.linspace(1e-4, 7e-4, 90)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)) + ) + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_const20_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args_const20, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_const20_sub6_start20_lstm512_transparent/bs12" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.8 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_transparent/bs12" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + train_job, _, _, wer_values= run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 9.6 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512/bs12" + ) + del results + # TODO: This here above is the best baseline with 9.3%, with the accum step 3 setting also runnable on 11GB GPU + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 220)) + + list(np.linspace(7e-4, 7e-5, 220)) + + list(np.linspace(7e-5, 1e-8, 60))) + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": 
label_datastream.vocab, + } + results = {} + train_job, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_longer/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 9.6 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_longer/bs12" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 180 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 2 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + train_job, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_r2/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 9.5 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_r2/bs12" + ) + del results diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_espnet_like.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_espnet_like.py new file mode 100644 index 000000000..9b7cb61d0 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_espnet_like.py @@ -0,0 +1,252 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + + +def conformer_rnnt_espnet_like(): + """ + + ESPNet like means BPE 500 and subsampling 4 + + :return: + """ + + prefix_name = "experiments/rescale/tedliumv2/torchaudio_bpe_rnnt/espnet_like" + + BPE_SIZE = 500 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + 
lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + num_epochs=250, + decoder="rnnt.decoder.experimental_rnnt_decoder", + with_prior=False, + evaluate_epoch=None, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if not evaluate_epoch: + evaluate_epoch = num_epochs + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + _, _, search_jobs = search( + ft_name + "/default_%i" % evaluate_epoch, + returnn_search_config, + train_job.out_checkpoints[evaluate_epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + use_gpu=search_args.get("use_gpu", False), + ) + + return train_job, search_jobs + + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + PredictorConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=256, + activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + emebdding_dropout=0.2, + num_lstm_layers=1, + lstm_hidden_dim=256, + lstm_dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=256, + num_layers=12, + num_heads=4, + ff_dim=1024, + att_weights_dropout=0.1, + conv_dropout=0.1, + ff_dropout=0.1, + mhsa_dropout=0.1, + conv_kernel_size=31, + final_dropout=0.1, + specauc_start_epoch=10, + joiner_dim=320, + joiner_activation="tanh", + joiner_dropout=0.1, + ) + + train_args_adamw03_24gb_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": True, + } + + train_args = { + **copy.deepcopy(train_args_adamw03_24gb_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config)}, + } + + search_args_gpu = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_small_bs200ac2/bs12_gpu", + datasets=train_data, + train_args=train_args, 
+ search_args=search_args_gpu, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + + model_config_ff2048 = copy.deepcopy(model_config) + model_config_ff2048.ff_dim = 2048 + train_args_ff2048 = { + **copy.deepcopy(train_args_adamw03_24gb_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config_ff2048)}, + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_small_bs200ac2_ff2048/bs12_gpu", + datasets=train_data, + train_args=train_args_ff2048, + search_args=search_args_gpu, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + + # TODO: Does not fit + # train_args_bs300ac1 = copy.deepcopy(train_args) + # train_args_bs300ac1["config"]["batch_size"] = 300 * 16000 + # train_args_bs300ac1["config"]["accum_grad_multiple_step"] = 1 + # train_job, _ = run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_small_bs300ac1/bs12_gpu", + # datasets=train_data, train_args=train_args_bs300ac1, search_args=search_args_gpu, with_prior=False) + # train_job.rqmt["gpu_mem"] = 24 + + # Do it large instead + model_config_v5_enc384_dec512 = ModelConfig( + frontend_config=copy.deepcopy(frontend_config), + specaug_config=specaug_config, + predictor_config=copy.deepcopy(predictor_config), + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + joiner_dropout=0.1, + ) + model_config_v5_enc384_dec512.predictor_config.lstm_hidden_dim = 512 + model_config_v5_enc384_dec512.predictor_config.lstm_dropout = 0.3 + model_config_v5_enc384_dec512.predictor_config.emebdding_dropout = 0.1 + model_config_v5_enc384_dec512.frontend_config.out_features = 384 + + train_args = { + **copy.deepcopy(train_args_adamw03_24gb_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_enc384_dec512)}, + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_enc384_dec512/bs12_gpu", + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_pretrained.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_pretrained.py new file mode 100644 index 000000000..edb789284 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_pretrained.py @@ -0,0 +1,180 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream +from i6_core.report.report import _Report_Type + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + 
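+# Fine-tuning of a pretrained HuBERT model (HubertConfig name="base-ls960",
+# presumably the 960h-LibriSpeech base checkpoint) within the BPE RNN-T setup,
+# via the rnnt.conformer_1023.hubert_pretrain_v1 network module.
+# flash_bpe_rnnt_report_format below is duplicated from exp_baseline.py.
+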
+def flash_bpe_rnnt_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def pretrained_rnnt(): + prefix_name = "experiments/rescale/tedliumv2/torchaudio_bpe_rnnt/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + num_epochs=250, + decoder="rnnt.decoder.experimental_rnnt_decoder", + with_prior=False, + evaluate_epoch=None, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if not evaluate_epoch: + evaluate_epoch = num_epochs + search_job_ls = [] + report = {} + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % evaluate_epoch, + returnn_search_config, + train_job.out_checkpoints[evaluate_epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + use_gpu=search_args.get("use_gpu", False), + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_rnnt_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + 
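+    # Setup below: finetune_layer=2 presumably limits training to the top two
+    # transformer layers of the pretrained HuBERT encoder (the exact semantics
+    # are defined in the rnnt.conformer_1023.hubert_pretrain_v1 network module),
+    # and specauc_start_epoch=0 applies SpecAugment from the first epoch on.
+    # With max_seqs=3 and accum_grad_multiple_step=25, each update covers at
+    # most 3 * 25 = 75 sequences.
+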
+    from ..pytorch_networks.rnnt.conformer_1023 import hubert_pretrain_v1_cfg
+
+    predictor_config = hubert_pretrain_v1_cfg.PredictorConfig(
+        symbol_embedding_dim=256,
+        emebdding_dropout=0.1,
+        num_lstm_layers=1,
+        lstm_hidden_dim=512,
+        lstm_dropout=0.3,
+    )
+
+    hubert_cfg_2 = hubert_pretrain_v1_cfg.HubertConfig(
+        finetune_layer=2,
+        name="base-ls960",
+    )
+    model_config_hubert_2 = hubert_pretrain_v1_cfg.ModelConfig(
+        specauc_start_epoch=0,
+        label_target_size=vocab_size_without_blank,
+        final_dropout=0.2,
+        hubert_cfg=hubert_cfg_2,
+        predictor_config=predictor_config,
+        joiner_dim=512,
+        joiner_activation="relu",
+        joiner_dropout=0.1,
+    )
+
+    train_args_hubert_adam_accum25_jjlr = {
+        "config": {
+            "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)},
+            "learning_rates": list(np.linspace(7e-6, 7e-4, 110))
+            + list(np.linspace(7e-4, 7e-5, 110))
+            + list(np.linspace(7e-5, 1e-8, 30)),
+            #############
+            "batch_size": 180 * 16000,
+            "max_seq_length": {"audio_features": 35 * 16000},
+            "max_seqs": 3,
+            "accum_grad_multiple_step": 25,
+        },
+        "debug": False,
+    }
+    eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250]
+    train_args = {
+        **copy.deepcopy(train_args_hubert_adam_accum25_jjlr),
+        "network_module": "rnnt.conformer_1023.hubert_pretrain_v1",
+        "net_args": {"model_config_dict": asdict(model_config_hubert_2)},
+    }
+    search_args = {
+        "beam_size": 12,
+        "returnn_vocab": label_datastream.vocab,
+    }
+    results = {}
+    train_job, _, _, wer_values = run_exp(
+        prefix_name + "conformer_1023/hubert_pretrain_v3_base_tune2_jjlr/bs12",
+        datasets=train_data,
+        train_args=train_args,
+        search_args=search_args,
+        with_prior=False,
+    )
+    train_job.rqmt["gpu_mem"] = 24
+    results.update(wer_values)
+    del wer_values
+    generate_report(
+        results=results, exp_name=prefix_name + "conformer_1023/hubert_pretrain_v3_base_tune2_jjlr"
+    )
+    del results
diff --git a/users/hilmes/tools/onnx.py b/users/hilmes/tools/onnx.py
new file mode 100644
index 000000000..50a8de527
--- /dev/null
+++ b/users/hilmes/tools/onnx.py
@@ -0,0 +1,237 @@
+import sys
+import os
+from sisyphus import Job, Task, tk
+from typing import Any, Dict, Optional, Tuple, List, Union
+import logging
+
+from i6_core.returnn.config import ReturnnConfig
+from i6_core.returnn.training import PtCheckpoint
+from onnxruntime.quantization import quant_pre_process, quantize_static, CalibrationDataReader, CalibrationMethod, QuantType, QuantFormat
+from onnxruntime import InferenceSession, SessionOptions
+from returnn.datasets import Dataset, init_dataset
+from returnn.datasets.meta import MetaDataset
+import numpy as np
+
+
+class ExportPyTorchModelToOnnxJob(Job):
+    """
+    Experimental exporter job
+
+    JUST FOR DEBUGGING, THIS FUNCTIONALITY SHOULD BE IN RETURNN ITSELF
+    """
+
+    __sis_hash_exclude__ = {"quantize_dynamic": False, "quantize_static": False}
+
+    def __init__(self, pytorch_checkpoint: PtCheckpoint, returnn_config: ReturnnConfig, returnn_root: tk.Path, quantize_dynamic: bool = False):
+
+        self.pytorch_checkpoint = pytorch_checkpoint
+        self.returnn_config = returnn_config
+        self.returnn_root = returnn_root
+        self.quantize_dynamic = quantize_dynamic
+
+        self.out_onnx_model = self.output_path("model.onnx")
+        self.rqmt = {"time": 2, "cpu": 4, "mem": 16}
+
+    def tasks(self):
+        yield Task("run", rqmt=self.rqmt)
+
+    def run(self):
+        sys.path.insert(0, self.returnn_root.get())
+        import torch
+        from returnn.config import Config
+
+        config = Config()
+        self.returnn_config.write("returnn.config")
+        config.load_file("returnn.config")
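+
+        # The checkpoint may either be a plain state dict or, as written by the RETURNN
+        # PyTorch engine, a dict holding "epoch", "step" and "model"; the "get_model" and
+        # "export" callables have to be provided by the given ReturnnConfig.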
+        model_state = torch.load(str(self.pytorch_checkpoint), map_location=torch.device("cpu"))
+        if isinstance(model_state, dict):
+            epoch = model_state["epoch"]
+            step = model_state["step"]
+            model_state = model_state["model"]
+        else:
+            epoch = 1
+            step = 0
+
+        get_model_func = config.typed_value("get_model")
+        assert get_model_func, "get_model not defined"
+        model = get_model_func(epoch=epoch, step=step)
+        assert isinstance(model, torch.nn.Module)
+
+        model.load_state_dict(model_state)
+
+        export_func = config.typed_value("export")
+        assert export_func
+        if self.quantize_dynamic:
+            import onnx
+            from onnxruntime.quantization import quantize_dynamic
+
+            model_fp32 = 'tmp_model.onnx'
+            export_func(model=model, model_filename=model_fp32)
+            quantized_model = quantize_dynamic(model_fp32, self.out_onnx_model.get())
+        else:
+            export_func(model=model, model_filename=self.out_onnx_model.get())
+
+
+class ModelQuantizeStaticJob(Job):
+
+    __sis_hash_exclude__ = {
+        "moving_average": False,
+        "smoothing_factor": 0.0,
+        "symmetric": False,
+        "activation_type": QuantType.QInt8,
+        "quant_format": QuantFormat.QDQ,
+        "weight_type": QuantType.QInt8,
+        "final_skip": (None, None),
+        "ops_to_quant": None,
+        "smooth_quant": False,
+    }
+
+    def __init__(self,
+        model: tk.Path,
+        dataset: Dict[str, Any],
+        num_seqs: int = 10,
+        num_parallel_seqs: int = 25,
+        calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
+        moving_average: bool = False,
+        smoothing_factor: float = 0.0,
+        symmetric: bool = False,
+        activation_type=QuantType.QInt8,
+        quant_format=QuantFormat.QDQ,
+        weight_type=QuantType.QInt8,
+        final_skip: Tuple[Optional[int], Optional[int]] = (None, None),
+        ops_to_quant: Optional[List[str]] = None,
+        smooth_quant: bool = False,
+    ):
+        """
+        :param model: ONNX model to be quantized
+        :param dataset: RETURNN dataset dict from which the calibration sequences are drawn
+        :param num_seqs: number of sequences used for calibration
+        :param num_parallel_seqs: passed as CalibMaxIntermediateOutputs to the calibrator
+        :param moving_average: whether to use moving average for MinMax or Symmetry for Entropy
+        """
+        self.model = model
+        self.dataset = dataset
+        self.num_seqs = num_seqs
+        self.num_parallel_seqs = num_parallel_seqs
+        self.moving_average = moving_average
+        self.activation_type = activation_type
+        self.quant_format = quant_format
+        self.weight_type = weight_type
+
+        self.out_model = self.output_path("model.onnx")
+        if num_seqs >= 5000:
+            time = 12
+        elif num_seqs >= 2500:
+            time = 6
+        elif num_seqs >= 1000:
+            time = 4
+        else:
+            time = 1
+        if not calibrate_method == CalibrationMethod.MinMax:
+            time *= 2
+
+        self.rqmt = {
+            "cpu": 8 if num_seqs > 100 else 4,
+            "mem": 16.0 if calibrate_method == CalibrationMethod.MinMax else 48,
+            "time": time,
+        }
+        self.calibration_method = calibrate_method
+        self.smoothing_factor = smoothing_factor
+        self.symmetric = symmetric
+        self.final_skip = final_skip
+        self.smooth_quant = smooth_quant
+        self.ops_to_quant = ops_to_quant
+        self.out_dev_log = self.output_path("dev_log")
+
+    def tasks(self):
+        yield Task("run", rqmt=self.rqmt)
+
+    def convert_to_str(self, dataset: Dict):
+        res = {}
+        for x in dataset:
+            if isinstance(dataset[x], dict):
+                res[x] = self.convert_to_str(dataset[x])
+            elif isinstance(dataset[x], tk.Path):
+                res[x] = str(dataset[x])
+            else:
+                res[x] = dataset[x]
+        return res
+
+    def run(self):
+        print("Start")
+        quant_pre_process(
+            input_model_path=self.model.get_path(),
+            output_model_path="model_prep.onnx",
+        )
+
+        class DummyDataReader(CalibrationDataReader):
+
+            def __init__(self, model_str: str, data: Union[Dataset, MetaDataset], max_seqs: int, final_skip: Optional[Tuple[int, int]] = (None, None)):
+
+                self.max_seqs = max_seqs
+                self.data = data
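+                # The InferenceSession below is only opened to read off the two input names
+                # of the prepped model (features and their lengths); the calibration batches
+                # are later fed under exactly these names, one sequence at a time.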
+                self.idx: int = 0
+                sess_option = SessionOptions()
+                logging.info(f"Data Loading {os.getenv('SLURM_CPUS_PER_TASK')}")
+                # assumes execution under SLURM: SLURM_CPUS_PER_TASK has to be set
+                sess_option.intra_op_num_threads = int(os.getenv('SLURM_CPUS_PER_TASK'))
+                session = InferenceSession(model_str, sess_option)
+                self.input_name_1 = session.get_inputs()[0].name
+                self.input_name_2 = session.get_inputs()[1].name
+                self.final_skip_step = final_skip[0]
+                self.final_skip_count = final_skip[1]
+
+            def get_next(self):
+                init_dataset(self.data)
+                key = "data" if "data" in self.data.data_keys else "raw_audio"  # hack to make it compatible with both setups for now
+                if not self.data.is_less_than_num_seqs(self.idx) or self.idx >= self.max_seqs:
+                    if self.final_skip_step is not None and self.idx < self.max_seqs + self.final_skip_step * self.final_skip_count:
+                        self.idx += self.final_skip_step
+                        logging.info(f"Skipping to Seq {self.idx}")
+                        self.data.load_seqs(self.idx, self.idx + 1)
+                        seq_len: np.ndarray = self.data.get_seq_length(self.idx)[key]
+                        data: np.ndarray = self.data.get_data(self.idx, key)
+                        seq_len = np.array([seq_len], dtype=np.int32)
+                        data = np.expand_dims(data, axis=0)
+                        return {self.input_name_1: data, self.input_name_2: seq_len}
+                    else:
+                        return None
+                self.data.load_seqs(self.idx, self.idx + 1)
+                seq_len: np.ndarray = self.data.get_seq_length(self.idx)[key]
+                data: np.ndarray = self.data.get_data(self.idx, key)
+                if self.idx % 10 == 0:
+                    logging.info(f"{self.idx} seqs seen")
+                seq_len = np.array([seq_len], dtype=np.int32)
+                data = np.expand_dims(data, axis=0)
+                self.idx += 1
+                return {self.input_name_1: data, self.input_name_2: seq_len}
+
+            def __iter__(self):
+                # materialize all calibration batches once, then yield them
+                data = []
+                x = self.get_next()
+                while x is not None:
+                    data.append(x)
+                    x = self.get_next()
+                for x in data:
+                    yield x
+
+        self.dataset = self.convert_to_str(self.dataset)
+        dataset: Dataset = init_dataset(self.dataset)
+        dataset.init_seq_order(1)
+        y = DummyDataReader(model_str="model_prep.onnx", data=dataset, max_seqs=self.num_seqs, final_skip=self.final_skip)
+        quant_options = {
+            "CalibMaxIntermediateOutputs": self.num_parallel_seqs,
+            "CalibMovingAverage": self.moving_average,
+            "CalibTensorRangeSymmetric": self.symmetric,
+        }
+        if self.smoothing_factor > 0.0:
+            quant_options["CalibSmoothRange"] = self.smoothing_factor
+        if self.smooth_quant:
+            quant_options["SmoothQuant"] = True
+        quantize_static(
+            model_input="model_prep.onnx",
+            model_output=self.out_model.get_path(),
+            calibration_data_reader=y,
+            calibrate_method=self.calibration_method,
+            extra_options=quant_options,
+            quant_format=self.quant_format,
+            activation_type=self.activation_type,
+            weight_type=self.weight_type,
+            op_types_to_quantize=self.ops_to_quant,
+        )
+
+        import shutil
+
+        if self.final_skip[0] or self.final_skip[1]:
+            shutil.move("calibrate_tensors_dev", self.out_dev_log)
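+
+
+# Example of chaining the two jobs above in a recipe (sketch only; the checkpoint path,
+# the forward ReturnnConfig and the calibration dataset dict are placeholders):
+#
+#   onnx_job = ExportPyTorchModelToOnnxJob(
+#       pytorch_checkpoint=PtCheckpoint(tk.Path("/path/to/epoch.250.pt")),
+#       returnn_config=forward_returnn_config,
+#       returnn_root=tk.Path("/path/to/returnn"),
+#   )
+#   quant_job = ModelQuantizeStaticJob(
+#       model=onnx_job.out_onnx_model,
+#       dataset=calibration_dataset_dict,
+#       num_seqs=100,
+#   )
+#   tk.register_output("onnx/model_int8.onnx", quant_job.out_model)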