diff --git a/users/hilmes/experiments/__init__.py b/users/hilmes/experiments/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/__init__.py b/users/hilmes/experiments/nick_setups/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/__init__.py new file mode 100644 index 000000000..6ac5dd240 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/__init__.py @@ -0,0 +1 @@ +PACKAGE = __package__ diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/config.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/config.py new file mode 100644 index 000000000..c6536eb6b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/config.py @@ -0,0 +1,156 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from .data.common import TrainingDatasets +from .serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, +) -> ReturnnConfig: + """ + :param training_datasets: datasets for training + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param config: + :param debug: run training in debug mode (linking from recipe instead of copy) + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + + base_config = { + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + + # TODO: maybe make nice + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the 
RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 500 * 16000, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 240 * 16000, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/exp_ls100_1023_base.py new file mode 100644 index 000000000..ce66e544f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_bpe/exp_ls100_1023_base.py @@ -0,0 +1,339 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..lm import get_4gram_binary_lm +from ..data.bpe import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, KENLM_BINARY_PATH + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def conformer_baseline(): + prefix_name = 
"experiments/librispeech/standalone_2023/ls100_ctc_bpe/" + + BPE_SIZE = 300 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + librispeech_key="train-clean-100", + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + 
conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100", bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 14, + } + + # DIverged + # train_args = { + # **copy.deepcopy(train_args_adamw03_accum2_jjlr), + # "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + # "net_args": {"model_config_dict": asdict(model_config)}, + # } + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + model_config_start11 = copy.deepcopy(model_config) + model_config_start11.specauc_start_epoch = 11 + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_start11)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + model_config_smaller = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_smaller)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_smaller_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + 
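# with_prior=True below also runs the prior forward job and passes the resulting prior.txt to the decoder args as prior_file +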
datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + model_config_smaller_start11 = copy.deepcopy(model_config_smaller) + model_config_smaller_start11.specauc_start_epoch = 11 + train_args_start11 = copy.deepcopy(train_args) + train_args_start11["net_args"]["model_config_dict"] = asdict(model_config_smaller_start11) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_smaller_decay1e-2_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_start11, + search_args=search_args, + with_prior=True, + ) + + from ..pytorch_networks.ctc_conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/exp_ls100_1023_base.py new file mode 100644 index 000000000..6d10fabf2 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/ctc_phon/exp_ls100_1023_base.py @@ -0,0 +1,313 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..data.phon import build_eow_phon_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT +from ..lm import get_4gram_binary_lm + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def eow_phon_ls100_1023_base(): + prefix_name = "experiments/librispeech/standalone_2023/ls100_ctc_eow_phon/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object 
containing train, cv, dev-train and the extern_data dict + train_data = build_eow_phon_training_datasets( + librispeech_key="train-clean-100", + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 
16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 14, + } + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config)}, + } + # diverged with hiccup + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + train_args_gc1 = copy.deepcopy(train_args) + train_args_gc1["config"]["gradient_clip"] = 1.0 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_gc1/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_gc1, + search_args=search_args, + with_prior=True, + ) + + train_args_decay1e_2 = copy.deepcopy(train_args) + train_args_decay1e_2["config"]["optimizer"]["weight_decay"] = 1e-2 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + ) + + search_args = { + **default_search_args, + "lm_weight": 3.5, + "prior_scale": 0.3, + "sil_score": -1000.0, + } + run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_decay1e-2/lm_test1_bs1024_th14", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc_v2", + ) + + search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 16, + "lm_weight": 3.5, + "prior_scale": 0.3, + } + run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_decay1e-2/lm_test2_bs1024_th16", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ) + + ###### trying to reproduce 14.5% result from librispeech/librispeech_100_phon_ctc ######### + + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-8, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + }, + } + model_config_small_ff = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + 
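# vocabulary size without the blank label; the network's final linear layer adds one extra output for the CTC blank +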
label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_small_ff)}, + } + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_legacy_decay1e-2_FF384/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/bpe.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/bpe.py new file mode 100644 index 000000000..4deb3781e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/bpe.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_lexicon +from i6_experiments.common.datasets.librispeech.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from .common import TrainingDatasetSettings, TrainingDatasets, build_training_datasets, DATA_PREFIX +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +@lru_cache() +def get_bpe_datastream(librispeech_key: str, bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + """ + bpe_settings = get_subword_nmt_bpe_v2(corpus_key=librispeech_key, bpe_size=bpe_size, unk_label="") + + # TODO: Try without sequence postfix (seq_postfix=None) + # otherwise every sequence gets a at the end + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def get_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="librispeech_datasets" + ), 
+ bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + + +def get_text_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(librispeech_key=librispeech_key, bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_bpe_training_datasets( + librispeech_key: str, + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = get_ogg_zip_dict("corpora", returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + train_ogg = ogg_zip_dict[librispeech_key] + dev_clean_ogg = ogg_zip_dict["dev-clean"] + dev_other_ogg = ogg_zip_dict["dev-other"] + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/common.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/common.py new file mode 100644 index 000000000..0ca95abad --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/common.py @@ -0,0 +1,201 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper +from i6_core.returnn.oggzip import BlissToOggZipJob + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_corpus_dict + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import ( + AudioRawDatastream, + ReturnnAudioRawOptions, +) +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream +from i6_experiments.users.rossenbach.datasets.librispeech import get_mixed_cv_segments + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + +DATA_PREFIX = "experiments/librispeech/2023_standalone/data/" + +# -------------- Dataclasses for configuration and data passing ------------------- + +# here: ( , , ) +EpochWiseFilter = Tuple[int, int, int] + + +@dataclass(frozen=True) +class TrainingDatasets: + train: Dataset + cv: Dataset + devtrain: Dataset + datastreams: Dict[str, Datastream] + prior: Optional[Dataset] + + +@dataclass() +class TrainingDatasetSettings: + # features settings + custom_processing_function: Optional[str] + + # training settings + partition_epoch: int + epoch_wise_filters: List[EpochWiseFilter] + seq_ordering: str + preemphasis: float + + +# --------------------------- Helper functions ----------------------------------- + + +@lru_cache() +def get_audio_raw_datastream(preemphasis: Optional[float] = None) -> AudioRawDatastream: + """ + :param preemphasis: set the pre-emphasis filter factor + """ + audio_datastream = AudioRawDatastream( + available_for_inference=True, options=ReturnnAudioRawOptions(peak_normalization=True, preemphasis=preemphasis) + ) + return 
audio_datastream + + +def get_zip(name: str, bliss_dataset: tk.Path): + """ + + :param name: + :param bliss_dataset: + :return: + """ + zip_dataset_job = BlissToOggZipJob( + bliss_corpus=bliss_dataset, + no_conversion=True, # for Librispeech we are already having ogg + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + zip_dataset_job.add_alias(DATA_PREFIX + name) + + return zip_dataset_job.out_ogg_zip + + +# --------------------------- Dataset functions ----------------------------------- + + +def build_training_datasets( + train_ogg: tk.Path, + dev_clean_ogg: tk.Path, + dev_other_ogg: tk.Path, + label_datastream: Datastream, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param train_ogg: + :param dev_clean_ogg: + :param dev_other_ogg: + :param label_datastream: + :param settings: + """ + audio_datastream = get_audio_raw_datastream(settings.preemphasis) + + datastreams = { + "raw_audio": audio_datastream, + "labels": label_datastream, + } + + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} + + training_audio_opts = audio_datastream.as_returnn_audio_opts() + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + def make_meta(dataset: OggZipDataset): + return MetaDataset( + data_map=data_map, datasets={"zip_dataset": dataset}, seq_order_control_dataset="zip_dataset" + ) + + train_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=settings.partition_epoch, + seq_ordering=settings.seq_ordering, + additional_options=additional_opts, + ) + train_dataset = make_meta(train_zip_dataset) + + cv_zip_dataset = OggZipDataset( + files=[dev_clean_ogg, dev_other_ogg], + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + segment_file=get_mixed_cv_segments(), + seq_ordering="sorted_reverse", + ) + cv_dataset = make_meta(cv_zip_dataset) + + devtrain_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + random_subset=3000, + ) + devtrain_dataset = make_meta(devtrain_zip_dataset) + + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta(prior_zip_dataset) + + return TrainingDatasets( + train=train_dataset, + cv=cv_dataset, + devtrain=devtrain_dataset, + datastreams=datastreams, + prior=prior_dataset, + ) + + +@lru_cache() +def build_test_dataset( + dataset_key: str, + preemphasis: Optional[float] = None, +): + """ + + :param librispeech_key: e.g. train-clean-100h, used for basic BPE stream + :param dataset_key: e.g. 
dev-other, which test set to create + :param preemphasis: + :return: + """ + ogg_zip_dict = get_ogg_zip_dict("corpora", returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + bliss_dict = get_bliss_corpus_dict() + test_ogg = ogg_zip_dict[dataset_key] + + audio_datastream = get_audio_raw_datastream(preemphasis) + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[test_ogg], audio_options=audio_datastream.as_returnn_audio_opts(), seq_ordering="sorted_reverse" + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return test_dataset, bliss_dict[dataset_key] diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/phon.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/phon.py new file mode 100644 index 000000000..450f00fc6 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/data/phon.py @@ -0,0 +1,142 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk + +from dataclasses import dataclass +from functools import lru_cache +import os +from typing import Any, Dict, List, Optional, Tuple + +from i6_core.returnn.vocabulary import ReturnnVocabFromPhonemeInventory +from i6_core.corpus.transform import ApplyLexiconToCorpusJob +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob + +from i6_experiments.common.datasets.librispeech import ( + get_g2p_augmented_bliss_lexicon_dict, + get_bliss_corpus_dict, + get_ogg_zip_dict, + get_bliss_lexicon, +) + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + + +from .common import get_zip, DATA_PREFIX, build_training_datasets, TrainingDatasets, TrainingDatasetSettings + + +def get_eow_lexicon(librispeech_key: str, with_g2p=True) -> tk.Path: + + """ + get the g2p bliss lexicon with EOW tokens added + :return: + """ + if with_g2p: + lex = get_g2p_augmented_bliss_lexicon_dict( + use_stress_marker=False, add_silence=False, output_prefix="librispeech_g2p_datasets" + )[librispeech_key] + else: + lex = get_bliss_lexicon(use_stress_marker=False, add_silence=False, output_prefix="librispeech_datasets") + + return AddEowPhonemesToLexiconJob(lex).out_lexicon + + +def get_eow_bliss(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False) -> tk.Path: + """ + get an EOW modified corpus with optional unknown removed for cross validation + + :param corpus_key: train, dev, test + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="ogg")[librispeech_key] + if remove_unk_seqs: + from i6_core.corpus.filter import FilterCorpusRemoveUnknownWordSegmentsJob + + bliss = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=bliss, + bliss_lexicon=get_eow_lexicon( + librispeech_key=train_librispeech_key, with_g2p=True + ), # cv may include words from g2p + all_unknown=False, + ).out_corpus + + # default train lexicon + lexicon = get_eow_lexicon(librispeech_key=train_librispeech_key, with_g2p=True) + converted_bliss_corpus = ApplyLexiconToCorpusJob(bliss, lexicon, word_separation_orth=None).out_corpus + + return converted_bliss_corpus + + +def get_eow_bliss_and_zip(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False): + """ + :param corpus_key: 
e.g. "train", "dev", or "test, + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: tuple of bliss and zip + """ + + bliss_dataset = get_eow_bliss( + librispeech_key=librispeech_key, train_librispeech_key=train_librispeech_key, remove_unk_seqs=remove_unk_seqs + ) + zip_dataset = get_zip(f"{librispeech_key}_eow", bliss_dataset=bliss_dataset) + + return bliss_dataset, zip_dataset + + +def get_eow_vocab_datastream(librispeech_key: str) -> LabelDatastream: + """ + Phoneme with EOW LabelDatastream for Tedlium-2 + + :param with_blank: datastream for CTC training + """ + lexicon = get_eow_lexicon(librispeech_key=librispeech_key) + returnn_vocab_job = ReturnnVocabFromPhonemeInventory(lexicon) + returnn_vocab_job.add_alias(os.path.join(DATA_PREFIX, f"{librispeech_key}", "eow_returnn_vocab_job")) + + vocab_datastream = LabelDatastream( + available_for_inference=True, vocab=returnn_vocab_job.out_vocab, vocab_size=returnn_vocab_job.out_vocab_size + ) + + return vocab_datastream + + +def get_text_lexicon(librispeech_key: str) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_eow_lexicon(librispeech_key=librispeech_key, with_g2p=False) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_eow_phon_training_datasets( + librispeech_key: str, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_eow_vocab_datastream(librispeech_key=librispeech_key) + + _, train_ogg = get_eow_bliss_and_zip( + librispeech_key=librispeech_key, train_librispeech_key=librispeech_key, remove_unk_seqs=False + ) + _, dev_clean_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-clean", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + _, dev_other_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-other", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/default_tools.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/default_tools.py new file mode 100644 index 000000000..5c33c776a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/default_tools.py @@ -0,0 +1,20 @@ +from sisyphus import tk +from i6_core.tools.git import CloneGitRepositoryJob + + +# python from apptainer +RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") +MINI_RETURNN_ROOT = tk.Path("/u/hilmes/dev/MiniReturnn", hash_overwrite="LIBRISPEECH_DEFAULT_RETURNN_ROOT") + +from i6_experiments.common.tools.sctk import compile_sctk + +SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +# SCTK_BINARY_PATH = compile_sctk() # use most recent SCTK +SCTK_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_SCTK_BINARY_PATH" + +from i6_core.tools.git import CloneGitRepositoryJob +from i6_core.lm.kenlm import CompileKenLMJob, CreateBinaryLMJob + +kenlm_repo = CloneGitRepositoryJob("https://github.com/kpu/kenlm").out_repository +KENLM_BINARY_PATH = CompileKenLMJob(repository=kenlm_repo).out_binaries +KENLM_BINARY_PATH.hash_overwrite = 
"LIBRISPEECH_DEFAULT_KENLM_BINARY_PATH" diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/lm.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/lm.py new file mode 100644 index 000000000..42a910233 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/lm.py @@ -0,0 +1,18 @@ +from i6_core.lm.kenlm import CreateBinaryLMJob + +from i6_experiments.common.datasets.librispeech.language_model import get_arpa_lm_dict + +from .default_tools import KENLM_BINARY_PATH + + +def get_4gram_binary_lm(): + """ + + :param output_prefix: + :return: + """ + arpa_4gram_binary_lm_job = CreateBinaryLMJob( + arpa_lm=get_arpa_lm_dict()["4gram"], kenlm_binary_folder=KENLM_BINARY_PATH + ) + arpa_4gram_binary_lm_job.add_alias("experiments/librispeech/standalone_2023/lm/create_4gram_binary_lm") + return arpa_4gram_binary_lm_job.out_lm diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pipeline.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pipeline.py new file mode 100644 index 000000000..1327b42db --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pipeline.py @@ -0,0 +1,179 @@ +import copy +import os.path + +from sisyphus import tk + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import ReturnnTrainingJob +from i6_core.returnn.training import GetBestTFCheckpointJob +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from i6_core.returnn.search import SearchBPEtoWordsJob, ReturnnComputeWERJob +from i6_experiments.users.rossenbach.returnn.training import AverageCheckpointsJobV2 + +from .default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, SCTK_BINARY_PATH + + +@tk.block() +def training(prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs): + """ + + :param prefix_name: + :param returnn_config: + :param returnn_exe: + :param returnn_root: + :return: + """ + default_rqmt = { + "mem_rqmt": 15, + "time_rqmt": 168, + "cpu_rqmt": 4, + "log_verbosity": 5, + "returnn_python_exe": returnn_exe, + "returnn_root": returnn_root, + } + + train_job = ReturnnTrainingJob(returnn_config=returnn_config, num_epochs=num_epochs, **default_rqmt) + train_job.add_alias(prefix_name + "/training") + tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates) + + return train_job + + +@tk.block() +def search_single( + prefix_name, + returnn_config, + checkpoint, + recognition_dataset: GenericDataset, + recognition_bliss_corpus, + returnn_exe, + returnn_root, + mem_rqmt=8, + use_gpu=False, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param returnn_standalone.data.datasets.dataset.GenericDataset recognition_dataset: + :param Path recognition_reference: Path to a py-dict format reference file + :param Path returnn_exe: + :param Path returnn_root: + """ + returnn_config = copy.deepcopy(returnn_config) + returnn_config.config["forward"] = recognition_dataset.as_returnn_opts() + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=24, + device="gpu" if use_gpu else "cpu", + cpu_rqmt=2, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["search_out.py"], + ) + 
search_job.add_alias(prefix_name + "/search_job") + + from i6_core.returnn.search import SearchWordsToCTMJob + from i6_core.corpus.convert import CorpusToStmJob + from i6_core.recognition.scoring import ScliteJob + + search_ctm = SearchWordsToCTMJob( + recog_words_file=search_job.out_files["search_out.py"], + bliss_corpus=recognition_bliss_corpus, + ).out_ctm_file + + stm_file = CorpusToStmJob(bliss_corpus=recognition_bliss_corpus).out_stm_path + + sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH) + tk.register_output(prefix_name + "/sclite/wer", sclite_job.out_wer) + tk.register_output(prefix_name + "/sclite/report", sclite_job.out_report_dir) + + return sclite_job.out_wer, search_job + + +@tk.block() +def search(prefix_name, returnn_config, checkpoint, test_dataset_tuples, returnn_exe, returnn_root, use_gpu=False): + """ + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param test_dataset_tuples: + :param returnn_exe: + :param returnn_root: + :return: + """ + # use fixed last checkpoint for now, needs more fine-grained selection / average etc. here + wers = {} + search_jobs = [] + for key, (test_dataset, test_dataset_reference) in test_dataset_tuples.items(): + wers[key], search_job = search_single( + prefix_name + "/%s" % key, + returnn_config, + checkpoint, + test_dataset, + test_dataset_reference, + returnn_exe, + returnn_root, + use_gpu=use_gpu, + ) + search_jobs.append(search_job) + + from i6_core.report import GenerateReportStringJob, MailJob + + format_string_report = ",".join(["{%s_val}" % (prefix_name + key) for key in test_dataset_tuples.keys()]) + format_string = " - ".join( + ["{%s}: {%s_val}" % (prefix_name + key, prefix_name + key) for key in test_dataset_tuples.keys()] + ) + values = {} + values_report = {} + for key in test_dataset_tuples.keys(): + values[prefix_name + key] = key + values["%s_val" % (prefix_name + key)] = wers[key] + values_report["%s_val" % (prefix_name + key)] = wers[key] + + report = GenerateReportStringJob(report_values=values, report_template=format_string, compress=False).out_report + # mail = MailJob(result=report, subject=prefix_name, send_contents=True).out_status + # tk.register_output(os.path.join(prefix_name, "mail_status"), mail) + return format_string_report, values_report, search_jobs + + +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=1, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + search_job.add_alias(prefix_name + "/prior_job") + return search_job.out_files["prior.txt"] diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/__init__.py new file 
mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py new file mode 100644 index 000000000..bd5860dc5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py @@ -0,0 +1,184 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=self.cfg.feature_extraction_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + 
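# labels stay padded as [B, N]; target_lengths below tells ctc_loss how many entries per sequence are valid +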
input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py new file mode 100644 index 000000000..a57c949fd --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py @@ -0,0 +1,84 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1Config + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + 
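# despite the name, this is passed as freq_max_num_masks to specaugment_v1_by_length in the v6 network, i.e. the maximum number of frequency masks +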
num_repeat_feat: int + max_dim_feat: int + + +@dataclass +class ModelConfig: + feature_extraction_config: LogMelFeatureExtractionV1Config + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["feature_extraction_config"] = LogMelFeatureExtractionV1Config(**d["feature_extraction_config"]) + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig(**d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py new file mode 100644 index 000000000..3012eee33 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, 
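+ # RTF here is processing time divided by total audio duration in seconds,
+ # so values below 1.0 mean faster than real time.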
run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py new file mode 100644 index 000000000..39d942e9b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + 
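+ # The remaining arguments configure the lexicon beam search: nbest hypotheses to keep,
+ # beam_size / beam_size_token / beam_threshold for pruning (roughly: hypotheses kept per frame,
+ # tokens expanded per frame, and allowed score margin to the best hypothesis),
+ # plus optional silence and word insertion scores.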
nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py new file mode 100644 index 000000000..815b283ba --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py @@ -0,0 +1,115 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", 
"wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + # TODO: Check if "[" removal is unnecessary + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git 
a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py new file mode 100644 index 000000000..cabf6d47d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py @@ -0,0 +1,59 @@ +""" +Greedy CTC decoder without any extras +""" + +import time +import torch + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + batch_indices = [] + for lp, l in zip(logprobs, audio_features_len): + batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy()) + + am_time = time.time() - am_start + run_ctx.total_time += am_time + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + + tags = data["seq_tag"] + + for indices, tag in zip(batch_indices, tags): + print(indices) + sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)] + sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py new file mode 100644 index 000000000..c28566b92 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py @@ -0,0 +1,87 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act 
import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py new file mode 100644 index 000000000..8d45dab3f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py @@ -0,0 +1,369 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from .specaugment_fixed import returnn_specaugment_by_length +from .legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> 
torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformulated in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Masked tensor [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.1) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + self.label_target_size = self.cfg.label_target_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
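+ # Sketch of what the transparent encoder above computes, with w = softmax(transparent_scales + 0.001):
+ #   out = w[0] * posenc(frontend(x)) + sum_i w[i + 1] * block_i_output
+ # i.e. the final encoder output is a weighted mix of the frontend output and every conformer block output.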
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + bpe_labels = data["bpe_labels"] # [B, N] (sparse) + bpe_labels_len = data["bpe_labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + bpe_labels, + input_lengths=audio_features_len, + target_lengths=bpe_labels_len, + blank=model.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(bpe_labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + # tokens=labels + ["[blank]", "[SILENCE]"], + tokens=labels + ["[blank]"], + blank_token="[blank]", + # sil_token="[SILENCE]", + sil_token="[blank]", + unk_word="[UNKNOWN2]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + # write empty HDF until new ForwardJob exists + f = open("output.hdf", "wt") + f.write(" ") + f.close() + + +def search_step(*, model: 
Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + print(words) + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/serializer.py b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/serializer.py new file mode 100644 index 000000000..63171ae7c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/librispeech_standalone_2023/serializer.py @@ -0,0 +1,109 @@ +import copy +from sisyphus import tk +from typing import Any, Dict, Optional + +from i6_core.tools.git import CloneGitRepositoryJob + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import ExternalImport + +from . 
import PACKAGE + +from i6_experiments.common.setups.serialization import Import, PartialImport + + +def get_pytorch_serializer_v3( + network_module: str, + net_args: Dict[str, Any], + decoder: Optional[str] = None, + decoder_args: Optional[Dict[str, Any]] = None, + post_decoder_args: Optional[Dict[str, Any]] = None, + prior=False, + debug=False, + **kwargs +) -> TorchCollection: + """ + + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param decoder: path to the search decoder, if provided will link search functions + :param decoder_args: + :param post_decoder_args: + :param prior: build config for prior computation + :param debug: run training in debug mode (linking from recipe instead of copy) + :param kwargs: + :return: + """ + package = PACKAGE + ".pytorch_networks" + + pytorch_model_import = PartialImport( + code_object_path=package + ".%s.Model" % network_module, + unhashed_package_root=PACKAGE, + hashed_arguments=net_args, + unhashed_arguments={}, + import_as="get_model", + ) + pytorch_train_step = Import( + code_object_path=package + ".%s.train_step" % network_module, unhashed_package_root=PACKAGE + ) + + # TODO: add flag to switch and maybe move to default tools + # i6_models_repo = CloneGitRepositoryJob( + # url="https://github.com/rwth-i6/i6_models", + # commit="1e94a4d9d1aa48fe3ac7f60de2cd7bd3fea19c3e", + # checkout_folder_name="i6_models" + # ).out_repository + i6_models_repo = tk.Path("/u/hilmes/experiments/nick_asr/i6_models") + i6_models_repo.hash_overwrite = "LIBRISPEECH_DEFAULT_I6_MODELS" + i6_models = ExternalImport(import_path=i6_models_repo) + + serializer_objects = [ + i6_models, + pytorch_model_import, + pytorch_train_step, + ] + if decoder: + # Just a hack to test the phoneme-based recognition + forward_step = Import( + code_object_path=package + ".%s.forward_step" % decoder, + unhashed_package_root=PACKAGE, + ) + init_hook = PartialImport( + code_object_path=package + ".%s.forward_init_hook" % decoder, + unhashed_package_root=PACKAGE, + hashed_arguments=decoder_args or {}, + unhashed_arguments=post_decoder_args or {}, + ) + finish_hook = Import( + code_object_path=package + ".%s.forward_finish_hook" % decoder, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if prior: + forward_step = Import( + code_object_path=package + ".%s.prior_step" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.prior_init_hook" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + ".%s.prior_finish_hook" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + serializer = TorchCollection( + serializer_objects=serializer_objects, + make_local_package_copy=not debug, + packages={ + package, + }, + ) + + return serializer diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/__init__.py new file mode 100644 index 000000000..6ac5dd240 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/__init__.py @@ -0,0 +1 @@ +PACKAGE = __package__ diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/config.py b/users/hilmes/experiments/nick_setups/standalone_2023/config.py new file mode 
100644 index 000000000..c6536eb6b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/config.py @@ -0,0 +1,156 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from .data.common import TrainingDatasets +from .serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, +) -> ReturnnConfig: + """ + :param training_datasets: datasets for training + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param config: + :param debug: run training in debug mode (linking from recipe instead of copy) + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + + base_config = { + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + + # TODO: maybe make nice + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 500 * 16000, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) 
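+ # With prior=True the serializer exposes the network module's prior_step / prior_init_hook /
+ # prior_finish_hook as the RETURNN forward entry points, so this config estimates the label
+ # prior instead of running recognition.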
+ returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 240 * 16000, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/exp_ls100_1023_base.py new file mode 100644 index 000000000..75d1f3bc2 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_bpe/exp_ls100_1023_base.py @@ -0,0 +1,296 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..lm import get_4gram_binary_lm +from ..data.bpe import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, KENLM_BINARY_PATH + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def conformer_baseline(): + prefix_name = "experiments/librispeech/standalone_2023/ls100_ctc_bpe/" + + BPE_SIZE = 300 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + peak_normalization=True, # TODO: this is wrong compared to old setupsa and rescale, better test if it degrades + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + librispeech_key="train-clean-100", + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + 
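+ # Only dev-other is evaluated here; the commented-out line above is the hook for adding
+ # further evaluation sets.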
for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + preemphasis=train_settings.preemphasis, + peak_normalization=train_settings.peak_normalization, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100", bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + 
"beam_threshold": 14, + } + + # DIverged + # train_args = { + # **copy.deepcopy(train_args_adamw03_accum2_jjlr), + # "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + # "net_args": {"model_config_dict": asdict(model_config)}, + # } + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + model_config_start11 = copy.deepcopy(model_config) + model_config_start11.specauc_start_epoch = 11 + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_start11)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + model_config_smaller = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_smaller)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + model_config_smaller_start11 = copy.deepcopy(model_config_smaller) + model_config_smaller_start11.specauc_start_epoch = 11 + train_args_start11 = copy.deepcopy(train_args) + train_args_start11["net_args"]["model_config_dict"] = asdict(model_config_smaller_start11) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_peaknorm_smaller_decay1e-2_start11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, 
prior_scale), + datasets=train_data, + train_args=train_args_start11, + search_args=search_args, + with_prior=True, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/exp_ls100_1023_base.py b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/exp_ls100_1023_base.py new file mode 100644 index 000000000..76ae5305a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/ctc_phon/exp_ls100_1023_base.py @@ -0,0 +1,340 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..data.phon import build_eow_phon_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data.common import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT +from ..lm import get_4gram_binary_lm + +from ..pipeline import training, search, compute_prior + +from ..config import get_training_config, get_search_config, get_prior_config + + +def eow_phon_ls100_1023_base(): + prefix_name = "experiments/librispeech/standalone_2023/ls100_ctc_eow_phon/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, + partition_epoch=3, + epoch_wise_filters=[], + seq_ordering="laplace:.1000", + preemphasis=0.97, + peak_normalization=True, # TODO: this is wrong compared to old setupsa and rescale, better test if it degrades + ) + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_eow_phon_training_datasets( + librispeech_key="train-clean-100", + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev-other"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + preemphasis=train_settings.preemphasis, + peak_normalization=train_settings.peak_normalization, + ) + + arpa_4gram_lm = get_4gram_binary_lm() + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + _, _, search_jobs = search( + ft_name + "/last_%i" % num_epochs, + 
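+ # remaining search() arguments: the search config, the checkpoint of the last epoch,
+ # the test dataset tuples, and the RETURNN executable / root to run with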
returnn_search_config, + train_job.out_checkpoints[num_epochs], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + + return train_job, search_jobs + + from ..pytorch_networks.ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + LogMelFeatureExtractionV1Config, + ) + + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "beam_size_token": 128, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 14, + } + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config)}, + } + # diverged with hiccup + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args, search_args=search_args, with_prior=True) + + train_args_gc1 = copy.deepcopy(train_args) + train_args_gc1["config"]["gradient_clip"] = 1.0 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_gc1/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_gc1, + search_args=search_args, + with_prior=True, + ) + + train_args_decay1e_2 = copy.deepcopy(train_args) + train_args_decay1e_2["config"]["optimizer"]["weight_decay"] = 1e-2 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 
0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_decay1e-2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + ) + + search_args = { + **default_search_args, + "lm_weight": 3.5, + "prior_scale": 0.3, + "sil_score": -1000.0, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_decay1e-2/lm_test1_bs1024_th14", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc_v2", + ) + + search_args = { + "lexicon": get_text_lexicon(librispeech_key="train-clean-100"), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_4gram_lm, + "beam_threshold": 16, + "lm_weight": 3.5, + "prior_scale": 0.3, + } + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_peaknorm_decay1e-2/lm_test2_bs1024_th16", + datasets=train_data, + train_args=train_args_decay1e_2, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.flashlight_phoneme_ctc", + ) + + ###### trying to reproduce 14.5% result from librispeech/librispeech_100_phon_ctc ######### + + train_args_adamw_02 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-8, "weight_decay": 1e-2}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-6, 150)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + }, + } + model_config_small_ff = ModelConfig( + feature_extraction_config=fe_config, + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=384, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_small_ff)}, + } + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + # TODO: add num_epochs 300 + run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_legacy_peaknorm_decay1e-2_FF384_accum1/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + + train_args = { + **copy.deepcopy(train_args_adamw_02), + "network_module": "ctc.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6", + "net_args": {"model_config_dict": asdict(model_config_small_ff)}, + } + train_args["config"]["accum_grad_multiple_step"] = 2 + for lm_weight in [2.5, 3.0, 3.5]: + for prior_scale in [0.0, 0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_legacy_peaknorm_decay1e-2_FF384_accum2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + 
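# Batch sizes in these configs are counted in raw waveform samples, not sequences:
# 180 * 16000 resp. 200 * 16000 samples correspond to roughly 180 s resp. 200 s of
# 16 kHz audio per batch, and max_seq_length skips utterances longer than 35 s.
SAMPLE_RATE = 16000                         # LibriSpeech sampling rate
batch_size_samples = 200 * SAMPLE_RATE      # as in train_args_adamw_02 above
print(batch_size_samples / SAMPLE_RATE, "seconds of audio per batch")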
search_args=search_args, + with_prior=True, + num_epochs=300, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/bpe.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/bpe.py new file mode 100644 index 000000000..4deb3781e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/data/bpe.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_lexicon +from i6_experiments.common.datasets.librispeech.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from .common import TrainingDatasetSettings, TrainingDatasets, build_training_datasets, DATA_PREFIX +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +@lru_cache() +def get_bpe_datastream(librispeech_key: str, bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + """ + bpe_settings = get_subword_nmt_bpe_v2(corpus_key=librispeech_key, bpe_size=bpe_size, unk_label="") + + # TODO: Try without sequence postfix (seq_postfix=None) + # otherwise every sequence gets a at the end + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def get_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="librispeech_datasets" + ), + bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + + +def get_text_lexicon(librispeech_key: str, bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(librispeech_key=librispeech_key, bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_bpe_training_datasets( + librispeech_key: str, + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(librispeech_key=librispeech_key, bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = get_ogg_zip_dict("corpora", 
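# Hedged usage sketch for the BPE data helpers above; the concrete bpe_size value is
# only an example and not taken from this setup:
# train_data_bpe = build_bpe_training_datasets(
#     librispeech_key="train-clean-100",
#     bpe_size=300,          # example value, choose to match the intended BPE vocabulary
#     settings=train_settings,
# )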
returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + train_ogg = ogg_zip_dict[librispeech_key] + dev_clean_ogg = ogg_zip_dict["dev-clean"] + dev_other_ogg = ogg_zip_dict["dev-other"] + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/common.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/common.py new file mode 100644 index 000000000..d25d0e764 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/data/common.py @@ -0,0 +1,207 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper +from i6_core.returnn.oggzip import BlissToOggZipJob + +from i6_experiments.common.datasets.librispeech import get_ogg_zip_dict, get_bliss_corpus_dict + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import ( + AudioRawDatastream, + ReturnnAudioRawOptions, +) +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream +from i6_experiments.users.rossenbach.datasets.librispeech import get_mixed_cv_segments + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + +DATA_PREFIX = "experiments/librispeech/2023_standalone/data/" + +# -------------- Dataclasses for configuration and data passing ------------------- + +# here: ( , , ) +EpochWiseFilter = Tuple[int, int, int] + + +@dataclass(frozen=True) +class TrainingDatasets: + train: Dataset + cv: Dataset + devtrain: Dataset + datastreams: Dict[str, Datastream] + prior: Optional[Dataset] + + +@dataclass() +class TrainingDatasetSettings: + # features settings + custom_processing_function: Optional[str] + + # training settings + partition_epoch: int + epoch_wise_filters: List[EpochWiseFilter] + seq_ordering: str + preemphasis: float + peak_normalization: bool + + +# --------------------------- Helper functions ----------------------------------- + + +@lru_cache() +def get_audio_raw_datastream( + preemphasis: Optional[float] = None, peak_normalization: bool = False +) -> AudioRawDatastream: + """ + :param preemphasis: set the pre-emphasis filter factor + :param peak_normalization: normalize every utterance to peak amplitude 1 + """ + audio_datastream = AudioRawDatastream( + available_for_inference=True, + options=ReturnnAudioRawOptions(peak_normalization=peak_normalization, preemphasis=preemphasis), + ) + return audio_datastream + + +def get_zip(name: str, bliss_dataset: tk.Path): + """ + + :param name: + :param bliss_dataset: + :return: + """ + zip_dataset_job = BlissToOggZipJob( + bliss_corpus=bliss_dataset, + no_conversion=True, # for Librispeech we are already having ogg + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + zip_dataset_job.add_alias(DATA_PREFIX + name) + + return zip_dataset_job.out_ogg_zip + + +# --------------------------- Dataset functions ----------------------------------- + + +def build_training_datasets( + train_ogg: tk.Path, + dev_clean_ogg: tk.Path, + dev_other_ogg: tk.Path, + label_datastream: Datastream, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param train_ogg: + :param dev_clean_ogg: + 
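# TrainingDatasetSettings (defined above) is the single configuration object for this
# dataset pipeline; the eow-phon experiment above instantiates it as
#     TrainingDatasetSettings(
#         custom_processing_function=None,
#         partition_epoch=3,
#         epoch_wise_filters=[],
#         seq_ordering="laplace:.1000",
#         preemphasis=0.97,
#         peak_normalization=True,
#     )
# and each epoch_wise_filters entry is a (from_epoch, to_epoch, max_mean_len) tuple
# that is translated into RETURNN's epoch_wise_filter option further below.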
:param dev_other_ogg: + :param label_datastream: + :param settings: + """ + audio_datastream = get_audio_raw_datastream(settings.preemphasis, settings.peak_normalization) + + datastreams = { + "raw_audio": audio_datastream, + "labels": label_datastream, + } + + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} + + training_audio_opts = audio_datastream.as_returnn_audio_opts() + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + def make_meta(dataset: OggZipDataset): + return MetaDataset( + data_map=data_map, datasets={"zip_dataset": dataset}, seq_order_control_dataset="zip_dataset" + ) + + train_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=settings.partition_epoch, + seq_ordering=settings.seq_ordering, + additional_options=additional_opts, + ) + train_dataset = make_meta(train_zip_dataset) + + cv_zip_dataset = OggZipDataset( + files=[dev_clean_ogg, dev_other_ogg], + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + segment_file=get_mixed_cv_segments(), + seq_ordering="sorted_reverse", + ) + cv_dataset = make_meta(cv_zip_dataset) + + devtrain_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + random_subset=3000, + ) + devtrain_dataset = make_meta(devtrain_zip_dataset) + + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta(prior_zip_dataset) + + return TrainingDatasets( + train=train_dataset, + cv=cv_dataset, + devtrain=devtrain_dataset, + datastreams=datastreams, + prior=prior_dataset, + ) + + +@lru_cache() +def build_test_dataset( + dataset_key: str, + preemphasis: Optional[float] = None, + peak_normalization: bool = False, +): + """ + + :param dataset_key: e.g. 
dev-other, which test set to create + :param preemphasis: + :param peak_normalization: + :return: + """ + ogg_zip_dict = get_ogg_zip_dict("corpora", returnn_root=MINI_RETURNN_ROOT, returnn_python_exe=RETURNN_EXE) + bliss_dict = get_bliss_corpus_dict() + test_ogg = ogg_zip_dict[dataset_key] + + audio_datastream = get_audio_raw_datastream(preemphasis, peak_normalization) + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[test_ogg], audio_options=audio_datastream.as_returnn_audio_opts(), seq_ordering="sorted_reverse" + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return test_dataset, bliss_dict[dataset_key] diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/data/phon.py b/users/hilmes/experiments/nick_setups/standalone_2023/data/phon.py new file mode 100644 index 000000000..c330f79a4 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/data/phon.py @@ -0,0 +1,142 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk + +from dataclasses import dataclass +from functools import lru_cache +import os +from typing import Any, Dict, List, Optional, Tuple + +from i6_core.returnn.vocabulary import ReturnnVocabFromPhonemeInventory +from i6_core.corpus.transform import ApplyLexiconToCorpusJob +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob + +from i6_experiments.common.datasets.librispeech import ( + get_g2p_augmented_bliss_lexicon_dict, + get_bliss_corpus_dict, + get_ogg_zip_dict, + get_bliss_lexicon, +) + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + + +from .common import get_zip, DATA_PREFIX, build_training_datasets, TrainingDatasets, TrainingDatasetSettings + + +def get_eow_lexicon(librispeech_key: str, with_g2p=True) -> tk.Path: + + """ + get the g2p bliss lexicon with EOW tokens added + :return: + """ + if with_g2p: + lex = get_g2p_augmented_bliss_lexicon_dict( + use_stress_marker=False, add_silence=False, output_prefix="librispeech_datasets" + )[librispeech_key] + else: + lex = get_bliss_lexicon(use_stress_marker=False, add_silence=False, output_prefix="librispeech_datasets") + + return AddEowPhonemesToLexiconJob(lex).out_lexicon + + +def get_eow_bliss(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False) -> tk.Path: + """ + get an EOW modified corpus with optional unknown removed for cross validation + + :param corpus_key: train, dev, test + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="ogg")[librispeech_key] + if remove_unk_seqs: + from i6_core.corpus.filter import FilterCorpusRemoveUnknownWordSegmentsJob + + bliss = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=bliss, + bliss_lexicon=get_eow_lexicon( + librispeech_key=train_librispeech_key, with_g2p=True + ), # cv may include words from g2p + all_unknown=False, + ).out_corpus + + # default train lexicon + lexicon = get_eow_lexicon(librispeech_key=train_librispeech_key, with_g2p=True) + converted_bliss_corpus = ApplyLexiconToCorpusJob(bliss, lexicon, word_separation_orth=None).out_corpus + + return converted_bliss_corpus + + +def get_eow_bliss_and_zip(librispeech_key: str, train_librispeech_key: str, remove_unk_seqs=False): + """ + :param 
corpus_key: e.g. "train", "dev", or "test, + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: tuple of bliss and zip + """ + + bliss_dataset = get_eow_bliss( + librispeech_key=librispeech_key, train_librispeech_key=train_librispeech_key, remove_unk_seqs=remove_unk_seqs + ) + zip_dataset = get_zip(f"{librispeech_key}_eow", bliss_dataset=bliss_dataset) + + return bliss_dataset, zip_dataset + + +def get_eow_vocab_datastream(librispeech_key: str) -> LabelDatastream: + """ + Phoneme with EOW LabelDatastream for Tedlium-2 + + :param with_blank: datastream for CTC training + """ + lexicon = get_eow_lexicon(librispeech_key=librispeech_key) + returnn_vocab_job = ReturnnVocabFromPhonemeInventory(lexicon) + returnn_vocab_job.add_alias(os.path.join(DATA_PREFIX, f"{librispeech_key}", "eow_returnn_vocab_job")) + + vocab_datastream = LabelDatastream( + available_for_inference=True, vocab=returnn_vocab_job.out_vocab, vocab_size=returnn_vocab_job.out_vocab_size + ) + + return vocab_datastream + + +def get_text_lexicon(librispeech_key: str) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_eow_lexicon(librispeech_key=librispeech_key, with_g2p=False) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def build_eow_phon_training_datasets( + librispeech_key: str, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_eow_vocab_datastream(librispeech_key=librispeech_key) + + _, train_ogg = get_eow_bliss_and_zip( + librispeech_key=librispeech_key, train_librispeech_key=librispeech_key, remove_unk_seqs=False + ) + _, dev_clean_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-clean", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + _, dev_other_ogg = get_eow_bliss_and_zip( + librispeech_key="dev-other", train_librispeech_key=librispeech_key, remove_unk_seqs=True + ) + + return build_training_datasets( + train_ogg=train_ogg, + dev_clean_ogg=dev_clean_ogg, + dev_other_ogg=dev_other_ogg, + settings=settings, + label_datastream=label_datastream, + ) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/default_tools.py b/users/hilmes/experiments/nick_setups/standalone_2023/default_tools.py new file mode 100644 index 000000000..5c33c776a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/default_tools.py @@ -0,0 +1,20 @@ +from sisyphus import tk +from i6_core.tools.git import CloneGitRepositoryJob + + +# python from apptainer +RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") +MINI_RETURNN_ROOT = tk.Path("/u/hilmes/dev/MiniReturnn", hash_overwrite="LIBRISPEECH_DEFAULT_RETURNN_ROOT") + +from i6_experiments.common.tools.sctk import compile_sctk + +SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +# SCTK_BINARY_PATH = compile_sctk() # use most recent SCTK +SCTK_BINARY_PATH.hash_overwrite = "LIBRISPEECH_DEFAULT_SCTK_BINARY_PATH" + +from i6_core.tools.git import CloneGitRepositoryJob +from i6_core.lm.kenlm import CompileKenLMJob, CreateBinaryLMJob + +kenlm_repo = CloneGitRepositoryJob("https://github.com/kpu/kenlm").out_repository +KENLM_BINARY_PATH = CompileKenLMJob(repository=kenlm_repo).out_binaries +KENLM_BINARY_PATH.hash_overwrite = 
"LIBRISPEECH_DEFAULT_KENLM_BINARY_PATH" diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/lm.py b/users/hilmes/experiments/nick_setups/standalone_2023/lm.py new file mode 100644 index 000000000..42a910233 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/lm.py @@ -0,0 +1,18 @@ +from i6_core.lm.kenlm import CreateBinaryLMJob + +from i6_experiments.common.datasets.librispeech.language_model import get_arpa_lm_dict + +from .default_tools import KENLM_BINARY_PATH + + +def get_4gram_binary_lm(): + """ + + :param output_prefix: + :return: + """ + arpa_4gram_binary_lm_job = CreateBinaryLMJob( + arpa_lm=get_arpa_lm_dict()["4gram"], kenlm_binary_folder=KENLM_BINARY_PATH + ) + arpa_4gram_binary_lm_job.add_alias("experiments/librispeech/standalone_2023/lm/create_4gram_binary_lm") + return arpa_4gram_binary_lm_job.out_lm diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pipeline.py b/users/hilmes/experiments/nick_setups/standalone_2023/pipeline.py new file mode 100644 index 000000000..1327b42db --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pipeline.py @@ -0,0 +1,179 @@ +import copy +import os.path + +from sisyphus import tk + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import ReturnnTrainingJob +from i6_core.returnn.training import GetBestTFCheckpointJob +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from i6_core.returnn.search import SearchBPEtoWordsJob, ReturnnComputeWERJob +from i6_experiments.users.rossenbach.returnn.training import AverageCheckpointsJobV2 + +from .default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, SCTK_BINARY_PATH + + +@tk.block() +def training(prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs): + """ + + :param prefix_name: + :param returnn_config: + :param returnn_exe: + :param returnn_root: + :return: + """ + default_rqmt = { + "mem_rqmt": 15, + "time_rqmt": 168, + "cpu_rqmt": 4, + "log_verbosity": 5, + "returnn_python_exe": returnn_exe, + "returnn_root": returnn_root, + } + + train_job = ReturnnTrainingJob(returnn_config=returnn_config, num_epochs=num_epochs, **default_rqmt) + train_job.add_alias(prefix_name + "/training") + tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates) + + return train_job + + +@tk.block() +def search_single( + prefix_name, + returnn_config, + checkpoint, + recognition_dataset: GenericDataset, + recognition_bliss_corpus, + returnn_exe, + returnn_root, + mem_rqmt=8, + use_gpu=False, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param returnn_standalone.data.datasets.dataset.GenericDataset recognition_dataset: + :param Path recognition_reference: Path to a py-dict format reference file + :param Path returnn_exe: + :param Path returnn_root: + """ + returnn_config = copy.deepcopy(returnn_config) + returnn_config.config["forward"] = recognition_dataset.as_returnn_opts() + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=24, + device="gpu" if use_gpu else "cpu", + cpu_rqmt=2, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["search_out.py"], + ) + search_job.add_alias(prefix_name + "/search_job") + + from i6_core.returnn.search import 
SearchWordsToCTMJob + from i6_core.corpus.convert import CorpusToStmJob + from i6_core.recognition.scoring import ScliteJob + + search_ctm = SearchWordsToCTMJob( + recog_words_file=search_job.out_files["search_out.py"], + bliss_corpus=recognition_bliss_corpus, + ).out_ctm_file + + stm_file = CorpusToStmJob(bliss_corpus=recognition_bliss_corpus).out_stm_path + + sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH) + tk.register_output(prefix_name + "/sclite/wer", sclite_job.out_wer) + tk.register_output(prefix_name + "/sclite/report", sclite_job.out_report_dir) + + return sclite_job.out_wer, search_job + + +@tk.block() +def search(prefix_name, returnn_config, checkpoint, test_dataset_tuples, returnn_exe, returnn_root, use_gpu=False): + """ + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param test_dataset_tuples: + :param returnn_exe: + :param returnn_root: + :return: + """ + # use fixed last checkpoint for now, needs more fine-grained selection / average etc. here + wers = {} + search_jobs = [] + for key, (test_dataset, test_dataset_reference) in test_dataset_tuples.items(): + wers[key], search_job = search_single( + prefix_name + "/%s" % key, + returnn_config, + checkpoint, + test_dataset, + test_dataset_reference, + returnn_exe, + returnn_root, + use_gpu=use_gpu, + ) + search_jobs.append(search_job) + + from i6_core.report import GenerateReportStringJob, MailJob + + format_string_report = ",".join(["{%s_val}" % (prefix_name + key) for key in test_dataset_tuples.keys()]) + format_string = " - ".join( + ["{%s}: {%s_val}" % (prefix_name + key, prefix_name + key) for key in test_dataset_tuples.keys()] + ) + values = {} + values_report = {} + for key in test_dataset_tuples.keys(): + values[prefix_name + key] = key + values["%s_val" % (prefix_name + key)] = wers[key] + values_report["%s_val" % (prefix_name + key)] = wers[key] + + report = GenerateReportStringJob(report_values=values, report_template=format_string, compress=False).out_report + # mail = MailJob(result=report, subject=prefix_name, send_contents=True).out_status + # tk.register_output(os.path.join(prefix_name, "mail_status"), mail) + return format_string_report, values_report, search_jobs + + +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=1, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + search_job.add_alias(prefix_name + "/prior_job") + return search_job.out_files["prior.txt"] diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py new file mode 100644 index 000000000..bd5860dc5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6.py @@ -0,0 +1,184 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=self.cfg.feature_extraction_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + 
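# torch.nn.functional.ctc_loss expects log-probs as [T, B, C] (hence the permute
# above), targets as [B, S], and the blank index; blank=label_target_size is the last
# output class because final_linear has label_target_size + 1 outputs. The summed loss
# is then normalized by the number of target phonemes via
# mark_as_loss(..., inv_norm_factor=num_phonemes) below.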
input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py new file mode 100644 index 000000000..a57c949fd --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py @@ -0,0 +1,84 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1Config + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + 
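# The prior hooks above estimate the label prior by averaging the per-frame softmax
# outputs over all forwarded frames and writing the result in +log space; schematically:
import numpy as np
summed_probs = np.array([3.0, 1.5, 0.5])       # running sum of softmax outputs per label
summed_frames = 5.0                            # total number of frames seen
log_prior = np.log(summed_probs / summed_frames)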
+@dataclass +class ModelConfig: + feature_extraction_config: LogMelFeatureExtractionV1Config + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["feature_extraction_config"] = LogMelFeatureExtractionV1Config(**d["feature_extraction_config"]) + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig(**d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py new file mode 100644 index 000000000..3012eee33 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" 
+ % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py new file mode 100644 index 000000000..39d942e9b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + 
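# Score correction used in forward_step above (and in the other flashlight decoders):
# an optional constant penalty is subtracted from the blank log-prob (assumed to be
# the last class, as noted in the code) and the label prior is subtracted with a
# scale, i.e. effectively
#     corrected_logprobs = logprobs - prior_scale * log_prior
# which turns the CTC posteriors into roughly likelihood-like scores before the
# lexicon/LM search.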
sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py new file mode 100644 index 000000000..815b283ba --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc_v2.py @@ -0,0 +1,115 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + 
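# The RTF numbers printed by these decoders are processing time divided by the amount
# of audio processed; e.g. 0.5 s of compute for 2.0 s of audio (32000 samples at
# 16 kHz) gives an AM/search RTF of 0.25.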
else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + # TODO: Check if "[" removal is unnecessary + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py new file 
mode 100644 index 000000000..cabf6d47d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py @@ -0,0 +1,59 @@ +""" +Greedy CTC decoder without any extras +""" + +import time +import torch + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + batch_indices = [] + for lp, l in zip(logprobs, audio_features_len): + batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy()) + + am_time = time.time() - am_start + run_ctx.total_time += am_time + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + + tags = data["seq_tag"] + + for indices, tag in zip(batch_indices, tags): + print(indices) + sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)] + sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py new file mode 100644 index 000000000..c28566b92 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py @@ -0,0 +1,87 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = 
None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py new file mode 100644 index 000000000..8d45dab3f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/pytorch_networks/ctc_conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_posenc.py @@ -0,0 +1,369 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from .specaugment_fixed import returnn_specaugment_by_length +from .legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
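# Round-trip note for the config classes above: the experiment definitions pass
# dataclasses.asdict(ModelConfig(...)) as net_args["model_config_dict"], and the
# network modules rebuild it with ModelConfig.from_dict(), which re-instantiates the
# nested frontend/specaug (and, in the v6 variant, feature extraction) configs.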
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformulated in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Masked tensor [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.1) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + self.label_target_size = self.cfg.label_target_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
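    # Overview of the forward pass below: squeeze the raw waveform, compute
    # log-mel features without gradients, apply SpecAugment only during
    # training, build a [B, T] mask from the feature lengths, run the
    # transparent Conformer encoder, then final dropout and a linear projection
    # to label_target_size + 1 (CTC blank), followed by log_softmax. The second
    # return value is the number of valid output frames per sequence.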
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + bpe_labels = data["bpe_labels"] # [B, N] (sparse) + bpe_labels_len = data["bpe_labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + bpe_labels, + input_lengths=audio_features_len, + target_lengths=bpe_labels_len, + blank=model.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(bpe_labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + # tokens=labels + ["[blank]", "[SILENCE]"], + tokens=labels + ["[blank]"], + blank_token="[blank]", + # sil_token="[SILENCE]", + sil_token="[blank]", + unk_word="[UNKNOWN2]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + # write empty HDF until new ForwardJob exists + f = open("output.hdf", "wt") + f.write(" ") + f.close() + + +def search_step(*, model: 
Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + print(words) + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/standalone_2023/serializer.py b/users/hilmes/experiments/nick_setups/standalone_2023/serializer.py new file mode 100644 index 000000000..63171ae7c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/standalone_2023/serializer.py @@ -0,0 +1,109 @@ +import copy +from sisyphus import tk +from typing import Any, Dict, Optional + +from i6_core.tools.git import CloneGitRepositoryJob + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import ExternalImport + +from . 
import PACKAGE + +from i6_experiments.common.setups.serialization import Import, PartialImport + + +def get_pytorch_serializer_v3( + network_module: str, + net_args: Dict[str, Any], + decoder: Optional[str] = None, + decoder_args: Optional[Dict[str, Any]] = None, + post_decoder_args: Optional[Dict[str, Any]] = None, + prior=False, + debug=False, + **kwargs +) -> TorchCollection: + """ + + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param decoder: path to the search decoder, if provided will link search functions + :param decoder_args: + :param post_decoder_args: + :param prior: build config for prior computation + :param debug: run training in debug mode (linking from recipe instead of copy) + :param kwargs: + :return: + """ + package = PACKAGE + ".pytorch_networks" + + pytorch_model_import = PartialImport( + code_object_path=package + ".%s.Model" % network_module, + unhashed_package_root=PACKAGE, + hashed_arguments=net_args, + unhashed_arguments={}, + import_as="get_model", + ) + pytorch_train_step = Import( + code_object_path=package + ".%s.train_step" % network_module, unhashed_package_root=PACKAGE + ) + + # TODO: add flag to switch and maybe move to default tools + # i6_models_repo = CloneGitRepositoryJob( + # url="https://github.com/rwth-i6/i6_models", + # commit="1e94a4d9d1aa48fe3ac7f60de2cd7bd3fea19c3e", + # checkout_folder_name="i6_models" + # ).out_repository + i6_models_repo = tk.Path("/u/hilmes/experiments/nick_asr/i6_models") + i6_models_repo.hash_overwrite = "LIBRISPEECH_DEFAULT_I6_MODELS" + i6_models = ExternalImport(import_path=i6_models_repo) + + serializer_objects = [ + i6_models, + pytorch_model_import, + pytorch_train_step, + ] + if decoder: + # Just a hack to test the phoneme-based recognition + forward_step = Import( + code_object_path=package + ".%s.forward_step" % decoder, + unhashed_package_root=PACKAGE, + ) + init_hook = PartialImport( + code_object_path=package + ".%s.forward_init_hook" % decoder, + unhashed_package_root=PACKAGE, + hashed_arguments=decoder_args or {}, + unhashed_arguments=post_decoder_args or {}, + ) + finish_hook = Import( + code_object_path=package + ".%s.forward_finish_hook" % decoder, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if prior: + forward_step = Import( + code_object_path=package + ".%s.prior_step" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.prior_init_hook" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + ".%s.prior_finish_hook" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + serializer = TorchCollection( + serializer_objects=serializer_objects, + make_local_package_copy=not debug, + packages={ + package, + }, + ) + + return serializer diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/__init__.py new file mode 100644 index 000000000..6ac5dd240 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/__init__.py @@ -0,0 +1 @@ +PACKAGE = __package__ diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/data.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/data.py new file mode 100644 index 000000000..aa3fabdec --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/data.py @@ -0,0 +1,212 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups + +Here are (or rather, should be) the definitions for Tedlium-V2 data and RETURNN datasets that +are consistent across Phon/BPE as well as CTC/RNN-T/Attention systems +""" +from sisyphus import tk + +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple, Union + +from i6_core.returnn import CodeWrapper, BlissToOggZipJob + +from i6_experiments.common.datasets.tedlium2.corpus import get_bliss_corpus_dict + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.base import Datastream +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream, BpeDatastream + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.audio import ( + ReturnnAudioRawOptions, + AudioRawDatastream, +) +from i6_experiments.common.setups.returnn.datasets import Dataset, OggZipDataset, MetaDataset + +from .default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + +DATA_PREFIX = "rescale/tedlium2_standalone_2023/data/" + +# -------------- Dataclasses for configuration and data passing ------------------- + +# here: ( , , ) +EpochWiseFilter = Tuple[int, int, int] + + +@dataclass(frozen=True) +class TrainingDatasets: + train: Dataset + cv: Dataset + devtrain: Dataset + datastreams: Dict[str, Datastream] + prior: Optional[Dataset] + + +@dataclass() +class TrainingDatasetSettings: + # features settings + custom_processing_function: Optional[str] + + # training settings + partition_epoch: int + epoch_wise_filters: List[EpochWiseFilter] + seq_ordering: str + + +# --------------------------- Helper functions ----------------------------------- + + +def get_zip(name: str, bliss_dataset: tk.Path): + """ + + :param name: + :param bliss_dataset: + :return: + """ + zip_dataset_job = BlissToOggZipJob( + bliss_corpus=bliss_dataset, + no_conversion=False, + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + zip_dataset_job.add_alias(DATA_PREFIX + name) + + return zip_dataset_job.out_ogg_zip + + +def get_test_bliss_and_zip(corpus_key): + """ + for now just return the original ogg zip + + :param corpus_key: e.g. 
"train", "dev", "test" + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="wav")[corpus_key] + zip_dataset = BlissToOggZipJob( + bliss_corpus=bliss, + no_conversion=False, + returnn_python_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ).out_ogg_zip + return bliss, zip_dataset + + +@lru_cache() +def get_audio_raw_datastream(): + audio_datastream = AudioRawDatastream( + available_for_inference=True, options=ReturnnAudioRawOptions(peak_normalization=False, preemphasis=0.97) + ) + return audio_datastream + + +# --------------------------- Dataset functions ----------------------------------- + + +def build_training_datasets( + settings: TrainingDatasetSettings, + train_ogg: tk.Path, + dev_ogg: tk.Path, + label_datastream: Union[LabelDatastream, BpeDatastream], +): + """ + builds the training RETURNN datasets using raw audio input for arbitrary label type + + :param settings: configuration object for the dataset pipeline + :param train_ogg: ogg zip for training data + :param dev_ogg: ogg zip for dev data + :param label_datastream: phoneme or bpe datastream + :return: + """ + audio_datastream = get_audio_raw_datastream() + + datastreams = { + "raw_audio": audio_datastream, + "labels": label_datastream, + } + + data_map = {"raw_audio": ("zip_dataset", "data"), "labels": ("zip_dataset", "classes")} + + training_audio_opts = audio_datastream.as_returnn_audio_opts() + if settings.custom_processing_function: + training_audio_opts["pre_process"] = CodeWrapper(settings.custom_processing_function) + + additional_opts = {} + if settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"] = {} + for fr, to, max_mean_len in settings.epoch_wise_filters: + additional_opts["epoch_wise_filter"][(fr, to)] = {"max_mean_len": max_mean_len} + + def make_meta(dataset: OggZipDataset): + return MetaDataset( + data_map=data_map, datasets={"zip_dataset": dataset}, seq_order_control_dataset="zip_dataset" + ) + + train_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=settings.partition_epoch, + seq_ordering=settings.seq_ordering, + additional_options=additional_opts, + ) + train_dataset = make_meta(train_zip_dataset) + + cv_zip_dataset = OggZipDataset( + files=dev_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + ) + cv_dataset = make_meta(cv_zip_dataset) + + devtrain_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=audio_datastream.as_returnn_audio_opts(), + target_options=label_datastream.as_returnn_targets_opts(), + seq_ordering="sorted_reverse", + random_subset=3000, + ) + devtrain_dataset = make_meta(devtrain_zip_dataset) + + prior_zip_dataset = OggZipDataset( + files=train_ogg, + audio_options=training_audio_opts, + target_options=label_datastream.as_returnn_targets_opts(), + partition_epoch=1, + seq_ordering="sorted_reverse", + additional_options=additional_opts, + ) + prior_dataset = make_meta(prior_zip_dataset) + + return TrainingDatasets( + train=train_dataset, + cv=cv_dataset, + devtrain=devtrain_dataset, + datastreams=datastreams, + prior=prior_dataset, + ) + + +@lru_cache() +def build_test_dataset(dataset_key: str): + """ + :param dataset_key: test dataset to generate ("eval" or "test") + """ + + _, test_ogg = get_test_bliss_and_zip(dataset_key) + bliss_dict = get_bliss_corpus_dict() # unmodified bliss + + audio_datastream = 
get_audio_raw_datastream() + + data_map = {"raw_audio": ("zip_dataset", "data")} + + test_zip_dataset = OggZipDataset( + files=[test_ogg], + audio_options=audio_datastream.as_returnn_audio_opts(), + seq_ordering="sorted_reverse", + ) + test_dataset = MetaDataset( + data_map=data_map, datasets={"zip_dataset": test_zip_dataset}, seq_order_control_dataset="zip_dataset" + ) + + return test_dataset, bliss_dict[dataset_key] diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/speed_perturbation.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/speed_perturbation.py new file mode 100644 index 000000000..63a564c5e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/dataset_code/speed_perturbation.py @@ -0,0 +1,20 @@ +""" +RETURNN Dataset compatible processing code snippets +""" + + +def legacy_speed_perturbation(audio, sample_rate, random_state): + """ + Use with the old TF setups Rossenbach/Zeineldeen + + :param audio: + :param sample_rate: + :param random_state: + :return: + """ + import librosa + + new_sample_rate = int(sample_rate * (1 + random_state.randint(-1, 2) * 0.1)) + if new_sample_rate != sample_rate: + audio = librosa.core.resample(audio, orig_sr=sample_rate, target_sr=new_sample_rate, res_type="kaiser_fast") + return audio diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/default_tools.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/default_tools.py new file mode 100644 index 000000000..6d49ab99d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/default_tools.py @@ -0,0 +1,20 @@ +from sisyphus import tk +from i6_core.tools.git import CloneGitRepositoryJob + + +# python from apptainer +RETURNN_EXE = tk.Path("/usr/bin/python3", hash_overwrite="GENERIC_RETURNN_LAUNCHER") +MINI_RETURNN_ROOT = tk.Path("/u/hilmes/dev/MiniReturnn", hash_overwrite="TEDLIUM2_DEFAULT_RETURNN_ROOT") + +from i6_experiments.common.tools.sctk import compile_sctk + +SCTK_BINARY_PATH = compile_sctk(branch="v2.4.12") # use last published version +# SCTK_BINARY_PATH = compile_sctk() # use most recent SCTK +SCTK_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_SCTK_BINARY_PATH" + +from i6_core.tools.git import CloneGitRepositoryJob +from i6_core.lm.kenlm import CompileKenLMJob, CreateBinaryLMJob + +kenlm_repo = CloneGitRepositoryJob("https://github.com/kpu/kenlm").out_repository +KENLM_BINARY_PATH = CompileKenLMJob(repository=kenlm_repo).out_binaries +KENLM_BINARY_PATH.hash_overwrite = "TEDLIUM2_DEFAULT_KENLM_BINARY_PATH" diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/config.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/config.py new file mode 100644 index 000000000..86049448b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/config.py @@ -0,0 +1,164 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict, 
Optional, List + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from ..data import TrainingDatasets +from ..flashlight_phon_ctc.serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, + keep_epochs: Optional[List] = None, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these do not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + if keep_epochs is not None: + post_config["cleanup_old_models"] = { + "keep_last_n": 2, + "keep_best_n": 4, + "keep": keep_epochs, + } + + base_config = { + "max_seqs": 60, + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + 
use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine:bool = False, + export:bool = False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 24000 * 160, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + export=export, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/data.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/data.py new file mode 100644 index 000000000..35d50f268 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/data.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper + +from i6_experiments.common.datasets.tedlium2.corpus import get_ogg_zip_dict +from i6_experiments.common.datasets.tedlium2.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.datasets.tedlium2.lexicon import get_bliss_lexicon +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..data import build_training_datasets, TrainingDatasetSettings, TrainingDatasets + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +from ..data import DATA_PREFIX + + +def get_lexicon(bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="tedliumv2_datasets" + ), + bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + 
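# Optional debugging helper (illustrative sketch, not used by the recipes):
# registers the two lexica produced above as Sisyphus outputs so they can be
# inspected in the output folder. The helper name, the output file names and
# the default bpe_size=1000 are assumptions made for this example; only
# get_lexicon / get_text_lexicon, DATA_PREFIX and tk.register_output come from
# this module and Sisyphus.
def _register_debug_lexica(bpe_size: int = 1000) -> None:
    # bliss-style BPE lexicon created from the TED-LIUM lexicon and BPE codes
    bliss_lexicon = get_lexicon(bpe_size=bpe_size)
    # flat word-to-BPE-token lexicon as consumed by the flashlight decoder
    word_lexicon = get_text_lexicon(bpe_size=bpe_size)
    tk.register_output(DATA_PREFIX + "debug/bpe_bliss_lexicon.xml.gz", bliss_lexicon)
    tk.register_output(DATA_PREFIX + "debug/bpe_word_lexicon.txt", word_lexicon)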
+ +def get_text_lexicon(bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def get_bpe_datastream(bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + :param use_v2: subword_nmt had a bug where it would not find python, use corrected version which changes hash + """ + bpe_settings = get_subword_nmt_bpe_v2(bpe_size=bpe_size, unk_label="") + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def build_bpe_training_datasets( + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = get_ogg_zip_dict(returnn_python_exe=RETURNN_EXE, returnn_root=MINI_RETURNN_ROOT) + train_ogg = ogg_zip_dict["train"] + dev_ogg = ogg_zip_dict["dev"] + + return build_training_datasets( + settings=settings, train_ogg=train_ogg, dev_ogg=dev_ogg, label_datastream=label_datastream + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_baseline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_baseline.py new file mode 100644 index 000000000..a02529db8 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_baseline.py @@ -0,0 +1,549 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, List, Optional + +from i6_core.report.report import _Report_Type + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + + +def flash_bpe_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def conformer_baseline(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_bpe_ctc/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + 
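    # See ..data.build_training_datasets: each epoch_wise_filters entry is a
    # (from_epoch, to_epoch, max_mean_len) tuple that is passed to RETURNN as
    # "epoch_wise_filter" (restricting those sub-epochs to shorter sequences),
    # e.g. [(1, 5, 1000)]; the empty list above disables the filter.
    # partition_epoch=5 splits the corpus into 5 RETURNN sub-epochs, and
    # seq_ordering="laplace:.1000" uses laplace length ordering with bins of
    # roughly 1000 sequences.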
train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + eval_epochs: Optional[List] = None, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, keep_epochs=eval_epochs, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if eval_epochs is None: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + for epoch in eval_epochs: + if with_prior: + prior_args = copy.deepcopy(train_args) + if "max_seqs" in prior_args["config"]: + del prior_args["config"]["max_seqs"] + returnn_config = get_prior_config(training_datasets=datasets, **prior_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[epoch], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + epoch=str(epoch) # just for alias generation + ) + tk.register_output(training_name + f"/prior/{epoch}.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + from i6_core.returnn import GetBestPtCheckpointJob + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, 
mail.out_status) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw03_accum2 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 125)) + list(np.linspace(1e-3, 1e-6, 125)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": False, + } + + default_search_args = { + "lexicon": get_text_lexicon(bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 14, + } + + #### New experiments with corrected FF-Dim + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v2", + "net_args": {"model_config_dict": asdict(model_config)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 97.9, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR" + ) + del results + + from ..pytorch_networks.ctc.conformer_0923 import i6modelsV1_VGG4LayerActFrontendV1_v4_cfg + + model_config_v4_start11 = i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + specauc_start_epoch=11, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + 
final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v4", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_specstart11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_specstart11" + ) + del results + # TODO: This here is the subsampling 4 baseline giving 8.0% with LM 1.6 and prior 0.5 + results = {} + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 130)) + list(np.linspace(7e-4, 7e-5, 230)) + list(np.linspace(7e-5, 1e-8, 140)) + ) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longer/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.7 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longer" + ) + del results + + results = {} + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + [7e-5]) + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longerend/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.6 + results=results, + exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11_longerend" + ) + del results + + results = {} + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v4_start11)}, + } + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + 
search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.9, most likely better due to noise + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_specstart11" + ) + del results + # --------------------------------------------------------------------------------------------------------------- # + # SUB 6 from here + + model_config_v4_sub6_start11 = copy.deepcopy(model_config_v4_start11) + model_config_v4_sub6_start11.frontend_config.pool1_stride = (3, 1) + model_config_v4_sub6_start11.frontend_config.pool1_kernel_size = (3, 1) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v4", + "net_args": {"model_config_dict": asdict(model_config_v4_sub6_start11)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_specstart11/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # did not converge 98.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_specstart11" + ) + del results + + model_config_sub6 = copy.deepcopy(model_config) + model_config_sub6.frontend_config.pool1_stride = (3, 1) + model_config_sub6.frontend_config.pool1_kernel_size = (3, 1) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3_transparent", + "net_args": {"model_config_dict": asdict(model_config_sub6)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_transparent_sub6/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 97.8 not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_transparent_sub6" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent", + "net_args": {"model_config_dict": asdict(model_config_sub6)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + 
"conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 99.2, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug", + "net_args": {"model_config_dict": asdict(model_config_sub6)}, + } + results = {} + for lm_weight in [1.4, 1.6, 1.8, 2.0]: + for prior_scale in [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: + for beam_size in [512, 1024]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + "beam_size": beam_size, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6_latespecaug/lm%.1f_prior%.2f_bs%i_th14" + % (lm_weight, prior_scale, beam_size), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.4 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6_latespecaug" + ) + del results + + train_args_debug = copy.deepcopy(train_args) + train_args_debug["debug"] = True + # greedy + search_args = { + "returnn_vocab": label_datastream.vocab, + } + run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_posenc_transparent_sub6_latespecaug/greedy", + datasets=train_data, + train_args=train_args_debug, + search_args=search_args, + with_prior=True, + decoder="ctc.decoder.greedy_bpe_ctc_v2", + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_pretrain.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_pretrain.py new file mode 100644 index 000000000..851735b38 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_bpe_ctc/exp_pretrain.py @@ -0,0 +1,1143 @@ +import itertools + +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, List, Optional, Dict +from onnxruntime.quantization.quantize import QuantType, QuantFormat +from onnxruntime.quantization.calibrate import CalibrationMethod + +from i6_core.report.report import _Report_Type +from i6_core.returnn import GetBestPtCheckpointJob, TorchOnnxExportJob + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset, TrainingDatasets +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + + +def flash_bpe_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in 
extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def get_quant_str(num_seqs, quant_mode, activation_type, weight_type, average, sym, quant_ops, quant_format): + if quant_mode == CalibrationMethod.MinMax: + mode_str = "quant_min_max" + elif quant_mode == CalibrationMethod.Entropy: + mode_str = "quant_entropy" + else: + mode_str = "quant_percentile" + mode_str += f"_{num_seqs}" + for x in [activation_type, weight_type]: + if x == QuantType.QInt8: + mode_str += "_QInt8" + elif x == QuantType.QUInt8: + mode_str += "_QUint8" + if average: + mode_str += "_avg" + if sym: + mode_str += "_sym" + if quant_ops is not None: + mode_str += "_" + "_".join(quant_ops) + else: + mode_str += "_full" + if quant_format == QuantFormat.QDQ: + mode_str += "_QDQ" + else: + mode_str += "QOperator" + return mode_str + + +def pretrained_experiments(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_bpe_ctc/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets: TrainingDatasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_bpe_ctc", + eval_epochs: Optional[List] = None, + quantize_args: Optional[Dict[str, str]] = None + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, keep_epochs=eval_epochs, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if eval_epochs is None or "onnx" in ft_name: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + for epoch in eval_epochs: + if with_prior: + prior_args = copy.deepcopy(train_args) + if "max_seqs" in prior_args["config"]: + prior_args["config"]["max_seqs"] = 15 + returnn_config = get_prior_config(training_datasets=datasets, **prior_args) + prior_file = 
compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[epoch], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + epoch=str(epoch) # just for alias generation + ) + tk.register_output(training_name + f"/prior/{epoch}.txt", prior_file) + search_args["prior_file"] = prior_file + if quantize_args is not None: + from i6_experiments.users.hilmes.tools.onnx import ModelQuantizeStaticJob + returnn_export_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder, export=True) + onnx_job = TorchOnnxExportJob( + returnn_config=returnn_export_config, + checkpoint=train_job.out_checkpoints[epoch], + returnn_root=MINI_RETURNN_ROOT, + returnn_python_exe=RETURNN_EXE, + ) + onnx_job.add_alias(ft_name + f"/onnx_export_{epoch}") + quant_job = ModelQuantizeStaticJob( + dataset=datasets.train.as_returnn_opts(), + model=onnx_job.out_onnx_model, + **quantize_args + ) + quant_job.add_alias(ft_name + f"/quantization_{epoch}") + decoder_args = copy.deepcopy(search_args) + decoder_args["quantized_model"] = quant_job.out_model + returnn_search_config = get_search_config(**train_args, decoder_args=decoder_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/quantized_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + #for search_job in search_jobs: + # search_job.add_input(quant_job.out_model) + search_job_ls += search_jobs + report.update(values_report) + else: + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + # from here on onwards, use default Adam with same OCLR + default_search_args = { + "lexicon": get_text_lexicon(bpe_size=BPE_SIZE), + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 14, + } + + from ..pytorch_networks.ctc.conformer_0923 import whisper_pretrained_v2_cfg + + whisper_cfg_2 = whisper_pretrained_v2_cfg.WhisperConfig( + just_encoder=True, + finetune_layer=6, + split_seq=True, + name="base.en", + dropout=0, + ) + model_config_whisper_base_v1 = whisper_pretrained_v2_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + whisper_config=whisper_cfg_2, + ) + train_args_whisper_adam_accum50_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 
1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 50, + }, + "debug": True, + } + eval_epochs = [50, 75, 100, 150, 200, 250] + train_args = { + **copy.deepcopy(train_args_whisper_adam_accum50_jjlr), + "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + "net_args": {"model_config_dict": asdict(model_config_whisper_base_v1)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/whisper_base_pretrain_v5_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + ) + #train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "conformer_0923/whisper_base_pretrain_v5_jjlr" + ) + del results + + # whisper_cfg_1 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=1, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2 = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_1, + # ) + # train_args_whisper_adam_accum30_lr2e5 = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": [2e-5], + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 5, + # "accum_grad_multiple_step": 30, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr2e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_1e-5/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_1e-5" + # ) + # del results + # + # whisper_cfg_1 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=1, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2_later_spec = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=11, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_1, + # ) + # train_args_whisper_adam_accum30_lr1e5 = { + # "config": { + # 
"optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": [1e-5], + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 5, + # "accum_grad_multiple_step": 30, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2_later_spec)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_1e-5_specstart11/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_1e-5_specstart11" + # ) + # del results + # + # model_config_whisper_v2_no_spec = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=5000, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_1, + # ) + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2_no_spec)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_1e-5_nospec/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_1e-5_nospec" + # ) + # del results + # + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr2e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2_no_spec)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_2e-5_nospec/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values 
+ # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_2e-5_nospec" + # ) + # del results + # + # whisper_cfg_tune_2 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=2, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2 = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_tune_2, + # ) + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_2_1e-5/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_2_1e-5" + # ) + # del results + # + # whisper_cfg_tune_3 = whisper_pretrained_v2_cfg.WhisperConfig( + # just_encoder=True, + # finetune_layer=3, + # split_seq=True, + # name="base.en", + # dropout=0, + # ) + # model_config_whisper_v2 = whisper_pretrained_v2_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # whisper_config=whisper_cfg_tune_3, + # ) + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_whisper_adam_accum30_lr1e5), + # "network_module": "ctc.conformer_0923.whisper_pretrained_v5", + # "net_args": {"model_config_dict": asdict(model_config_whisper_v2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/whisper_pretrain_v5_base_3_1e-5/lm%.1f_prior%.2f_bs1024_th14" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( + # results=results, exp_name=prefix_name + "conformer_0923/whisper_pretrain_v5_base_3_1e-5" + # ) + # del results + + from ..pytorch_networks.ctc.conformer_0923 import hubert_pretrained_v1_cfg + + hubert_cfg_1 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=1, + name="base-ls960", + ) + model_config_hubert_v1 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_1, + ) + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # 
"learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 25, + # }, + # "debug": True, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_v1)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 8.2 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_jjlr" + # ) + # del results + # + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 10, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_v1)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_smallaccum_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 7.9 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_smallaccum_jjlr" + # ) + # del results + # + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 100, + # }, + # "debug": False, + # } + # eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": 
"ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_v1)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_largeaccum_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # + # if prior_scale == 0.5 and lm_weight == 1.6: + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_largeaccum_jjlr/lm%.1f_prior%.2f_bs1024_th14_onnx" % ( + # lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # decoder="ctc.decoder.flashlight_onnx_bpe_ctc" + # ) + # results.update(wer_values) + # del wer_values + # generate_report( # 8.1 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_largeaccum_jjlr" + # ) + # del results + + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="base-ls960", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_base_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + if lm_weight == 1.8 and prior_scale == 0.5: + epochs = [200] + #num_seqs_ls = [10, 100, 1000] + num_seqs_ls = [10] + quant_modes = [CalibrationMethod.MinMax] + activation_types = [QuantType.QInt8] + weight_types = [QuantType.QInt8] + #average_modes = [True, False] + average_modes = [True] + #sym_modes = [True, False] + sym_modes = [True] + #quant_ops_ls = [None, ["Conv"], ["Linear"], ["Conv", "Linear"]] + quant_ops_ls = [None] + #quant_formats = [QuantFormat.QDQ, QuantFormat.QOperator] + quant_formats = [QuantFormat.QDQ] + for num_seqs, quant_mode, activation_type, 
weight_type, average, sym, quant_ops, quant_format in ( + itertools.product( + num_seqs_ls, quant_modes, activation_types, weight_types, average_modes, + sym_modes, quant_ops_ls, quant_formats)): + quant_str = get_quant_str(num_seqs, quant_mode, activation_type, weight_type, average, sym, quant_ops, quant_format) + train_job, _, _, wer_values = run_exp( + prefix_name + + f"conformer_0923/hubert_pretrain_v3_base_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14_quant/{quant_str}" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=epochs, + decoder="ctc.decoder.flashlight_quantized_bpe_ctc", + quantize_args={ + "num_seqs": num_seqs, + "num_parallel_seqs": 10, + "calibrate_method": CalibrationMethod.MinMax, + "moving_average": average, + "symmetric": sym, + "activation_type": activation_type, + "weight_type": weight_type, + "ops_to_quant": quant_ops, + "quant_format": quant_format, + } + ) + results.update(wer_values) + del wer_values + generate_report( # 7.0 + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_tune2_jjlr" + ) + del results + + # hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + # finetune_layer=3, + # name="base-ls960", + # ) + # model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # hubert_cfg=hubert_cfg_2, + # ) + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + # + list(np.linspace(7e-4, 7e-5, 110)) + # + list(np.linspace(7e-5, 1e-8, 30)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 25, + # }, + # "debug": True, + # } + # eval_epochs = [100, 150, 200, 225, 250] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_tune3_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 7.1 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_tune3_jjlr" + # ) + # del results + + # hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + # finetune_layer=2, + # name="base-ls960", + # ) + # model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + # specauc_start_epoch=0, + # label_target_size=vocab_size_without_blank, + # final_dropout=0.2, + # hubert_cfg=hubert_cfg_2, + # ) + # train_args_hubert_adam_accum25_jjlr = { + # "config": { + # "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + # "learning_rates": list(np.linspace(7e-6, 7e-4, 220)) + # + list(np.linspace(7e-4, 7e-5, 220)) + # + list(np.linspace(7e-5, 1e-8, 
60)), + # ############# + # "batch_size": 180 * 16000, + # "max_seq_length": {"audio_features": 35 * 16000}, + # "max_seqs": 3, + # "accum_grad_multiple_step": 25, + # }, + # "debug": True, + # } + # eval_epochs = [100, 200, 250, 400, 500] + # train_args = { + # **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + # "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + # "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + # } + # results = {} + # for lm_weight in [1.6, 1.8, 2.0, 2.2]: + # for prior_scale in [0.3, 0.5]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # "beam_size_token": 128, + # } + # train_job, _, _, wer_values = run_exp( + # prefix_name + # + "conformer_0923/hubert_pretrain_v3_base_tune2_longer_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # eval_epochs=eval_epochs, + # num_epochs=500 + # ) + # train_job.rqmt["gpu_mem"] = 24 + # results.update(wer_values) + # del wer_values + # generate_report( # 7.2 + # results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_base_tune2_longer_jjlr" + # ) + # del results + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="large-ls960-ft", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [200, 250] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 5.5 + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr" + ) + del results + + train_args_hubert_adam_accum25_jjlr_longflat = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 130)) + + list(np.linspace(7e-4, 7e-5, 230)) + + list(np.linspace(7e-5, 1e-8, 140)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": False, + } + eval_epochs = [250, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr_longflat), + 
"network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr_longflat/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # TODO 5.3 !! + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large960_tune2_jjlr_longflat" + ) + del results + + hubert_cfg_6 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=6, + name="large-ls960-ft", + ) + model_config_hubert_6 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_6, + ) + eval_epochs = [250, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr_longflat), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_6)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/hubert_pretrain_v3_large960_tune6_jjlr_longflat/lm%.1f_prior%.2f_bs1024_th14" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large960_tune6_jjlr_longflat" + ) + del results + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="large-ll60k", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 130)) + + list(np.linspace(7e-4, 7e-5, 230)) + + list(np.linspace(7e-5, 1e-8, 140)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [250, 300, 400, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + "beam_size_token": 128, + } + train_job, _, _, wer_values = run_exp( + prefix_name + + 
"conformer_0923/hubert_pretrain_v3_large60k_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "conformer_0923/hubert_pretrain_v3_large60k_tune2_jjlr" + ) + del results + diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/config.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/config.py new file mode 100644 index 000000000..d97f74b26 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/config.py @@ -0,0 +1,162 @@ +import copy +import numpy as np +from sisyphus import tk +from typing import Any, Dict, Optional, List + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from ..data import TrainingDatasets +from .serializer import get_pytorch_serializer_v3, PACKAGE + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine: bool = False, + use_speed_perturbation: bool = False, + keep_epochs: Optional[List] = None, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + if keep_epochs is not None: + post_config["cleanup_old_models"] = { + "keep_last_n": 2, + "keep_best_n": 4, + "keep": keep_epochs, + } + + base_config = { + "max_seqs": 60, + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + 
returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 24000 * 160, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/data.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/data.py new file mode 100644 index 000000000..5af44370f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/data.py @@ -0,0 +1,116 @@ +""" + + +""" +import os +from sisyphus import tk + +from i6_core.corpus.transform import ApplyLexiconToCorpusJob +from i6_core.lexicon.modification import AddEowPhonemesToLexiconJob +from i6_core.returnn.vocabulary import ReturnnVocabFromPhonemeInventory + +from i6_experiments.common.datasets.tedlium2.corpus import get_bliss_corpus_dict +from i6_experiments.common.datasets.tedlium2.lexicon import get_g2p_augmented_bliss_lexicon, get_bliss_lexicon +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from ..data 
import TrainingDatasetSettings, TrainingDatasets, build_training_datasets, get_zip +from ..data import DATA_PREFIX + + +def get_eow_lexicon(with_g2p=True) -> tk.Path: + """ + Standard bliss lexicon modified with EOW + :return: + """ + if with_g2p: + lex = get_g2p_augmented_bliss_lexicon(output_prefix="tedliumv2_g2p_datasets") + else: + lex = get_bliss_lexicon(output_prefix="tedliumv2_eow_datasets") + + return AddEowPhonemesToLexiconJob(lex).out_lexicon + + +def get_eow_text_lexicon() -> tk.Path: + """ + + :return: + """ + bliss_lex = get_eow_lexicon(with_g2p=False) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def get_eow_bliss(corpus_key, remove_unk_seqs=False) -> tk.Path: + """ + get an EOW modified corpus with optional unknown removed for cross validation + + :param corpus_key: train, dev, test + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: + """ + bliss = get_bliss_corpus_dict(audio_format="wav")[corpus_key] + if remove_unk_seqs: + from i6_core.corpus.filter import FilterCorpusRemoveUnknownWordSegmentsJob + + bliss = FilterCorpusRemoveUnknownWordSegmentsJob( + bliss_corpus=bliss, + bliss_lexicon=get_eow_lexicon(), # assume no g2p when removing unknown for test sets + all_unknown=False, + ).out_corpus + + # default train lexicon + lexicon = get_eow_lexicon(with_g2p=True) + converted_bliss_corpus = ApplyLexiconToCorpusJob(bliss, lexicon, word_separation_orth=None).out_corpus + + return converted_bliss_corpus + + +def get_eow_bliss_and_zip(corpus_key, remove_unk_seqs=False): + """ + :param corpus_key: e.g. "train", "dev", or "test, + :param remove_unk_seqs: remove all sequences with unknowns, used for dev-clean and dev-other + in case of using them for cross validation + :return: tuple of bliss and zip + """ + + bliss_dataset = get_eow_bliss(corpus_key=corpus_key, remove_unk_seqs=remove_unk_seqs) + zip_dataset = get_zip(f"{corpus_key}_eow", bliss_dataset=bliss_dataset) + + return bliss_dataset, zip_dataset + + +def get_eow_vocab_datastream() -> LabelDatastream: + """ + Phoneme with EOW LabelDatastream for Tedlium-2 + + :param with_blank: datastream for CTC training + """ + lexicon = get_eow_lexicon() + blacklist = {"[SILENCE]"} + returnn_vocab_job = ReturnnVocabFromPhonemeInventory(lexicon, blacklist=blacklist) + returnn_vocab_job.add_alias(os.path.join(DATA_PREFIX, "eow_returnn_vocab_job")) + + vocab_datastream = LabelDatastream( + available_for_inference=True, vocab=returnn_vocab_job.out_vocab, vocab_size=returnn_vocab_job.out_vocab_size + ) + + return vocab_datastream + + +def build_phon_training_datasets( + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_eow_vocab_datastream() + + _, train_ogg = get_eow_bliss_and_zip("train") + _, dev_ogg = get_eow_bliss_and_zip("dev", remove_unk_seqs=True) + + return build_training_datasets( + settings=settings, train_ogg=train_ogg, dev_ogg=dev_ogg, label_datastream=label_datastream + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_baseline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_baseline.py new file mode 100644 index 000000000..c9eb200e9 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_baseline.py @@ -0,0 +1,1703 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, Optional, List + +from i6_core.report.report import _Report_Type + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_phon_training_datasets, TrainingDatasetSettings, get_eow_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + +def flash_phon_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def conformer_baseline(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_phon_ctc/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_phon_training_datasets(settings=train_settings) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + # TODO: Add binary conversion job + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + eval_epochs: Optional[List] = None, + eval_best: bool = True, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + 
returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + if eval_epochs is None: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + for epoch in eval_epochs: + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + from i6_core.returnn import GetBestPtCheckpointJob + if eval_best: + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_phon_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + from ..pytorch_networks.ctc.conformer_0923.transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg import ( + SpecaugConfig, + TwoLayer1DFrontendConfig, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = TwoLayer1DFrontendConfig( + in_features=80, + conv1_channels=256, + conv2_channels=384, + conv1_kernel_size=5, + conv2_kernel_size=5, + conv1_stride=2, + conv2_stride=2, + dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw03_accum2 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 125)) + list(np.linspace(1e-3, 1e-6, 125)), + ############# + "batch_size": 300 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + default_search_args = { + "lexicon": get_eow_text_lexicon(), + "returnn_vocab": label_datastream.vocab, + "beam_size": 64, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 50, + } + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.transparent_i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + + results = {} + for lm_weight in [1.5, 2.0, 2.5]: + for prior_scale in [0.3, 0.5, 0.75, 1.0]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + 
prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + for pruning in [10, 20, 30, 40, 50]: + search_args = { + **default_search_args, + "lm_weight": 2.0, + "prior_scale": 0.5, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = pruning + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm2.0_prior0.5_bs256_prune%i" % pruning, + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + for pruning in [10, 12, 14, 16, 18, 20]: + # 10 = 10.0 + # 12 = 9.9 + # 14 = 9.9 + # 16 = 9.8 + search_args = { + **default_search_args, + "lm_weight": 2.0, + "prior_scale": 0.5, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = pruning + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm2.0_prior0.5_bs1024_prune%i" % pruning, + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + generate_report( # 9.8 + results=results, exp_name=prefix_name + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit" + ) + del results + + results = {} + # re-tune prior and lm-weight using beampruning 16 + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.0, 0.3, 0.4, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.1f_bs1024_prune16" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.8 + results=results, exp_name=prefix_name + "conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_bs1024_prune16" + ) + del results + +# Ted-Lium can be larger + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = TwoLayer1DFrontendConfig( + in_features=80, + conv1_channels=512, + conv2_channels=512, + conv1_kernel_size=5, + conv2_kernel_size=5, + conv1_stride=2, + conv2_stride=2, + dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=512, + num_layers=12, + num_heads=8, + ff_dim=2048, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + # from here on onwards, use default AdamW with same OCLR + train_args_adamw03_accum2 = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(1e-5, 1e-3, 125)) + list(np.linspace(1e-3, 1e-6, 125)), + ############# + "batch_size": 300 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + } + + default_search_args = { + "lexicon": 
get_eow_text_lexicon(), + "returnn_vocab": label_datastream.vocab, + "beam_size": 64, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 50, + } + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.transparent_i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + + results = {} + for lm_weight in [1.5, 2.0, 2.5]: + for prior_scale in [0.3, 0.5, 0.75, 1.0]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/transparent_12x512_i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, # 10.2 + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.2 + results=results, exp_name=prefix_name + "conformer_0923/transparent_12x512_i6modelsV1_2x1D_frontend_xavierinit" + ) + del results + + # TODO: not converging same with AMP + # train_args_amp = copy.deepcopy(train_args) + # train_args_amp["config"]["torch_amp_options"] = {"dtype": "float16"} # Pascal / 1080 GPUs can only do float16 + # for lm_weight in [1.5, 2.0, 2.5]: + # for prior_scale in [0.3, 0.5, 0.75, 1.0]: + # search_args = { + # **default_search_args, + # "lm_weight": lm_weight, + # "prior_scale": prior_scale, + # } + # run_exp(prefix_name + "conformer_0923/transparent_12x512_i6modelsV1_2x1D_frontend_xavierinit_amp/lm%.1f_prior%.2f" % ( + # lm_weight, prior_scale), + # datasets=train_data, train_args=train_args_amp, search_args=search_args, with_prior=True) + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_2x1D_frontend_xavierinit_cfg import ( + SpecaugConfig, + TwoLayer1DFrontendConfig, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = TwoLayer1DFrontendConfig( + in_features=80, + conv1_channels=256, + conv2_channels=384, + conv1_kernel_size=5, + conv2_kernel_size=5, + conv1_stride=2, + conv2_stride=2, + dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + 
"network_module": "ctc.conformer_0923.i6modelsV1_2x1D_frontend_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["optimizer"] = {"class": "adam", "epsilon": 1e-16} + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_adam/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.4 + results=results, + exp_name=prefix_name + "conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_adam" + ) + del results + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=2048, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1/lm%.1f_prior%.2f_bs1024" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.1 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": 
"ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_posenc", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc/lm%.1f_prior%.2f" % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.1 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_convfirst", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.4 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + 
eval_best=False + ) + results.update(wer_values) + del wer_values + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + + generate_report( # 7.9 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit" + ) + del results + + train_args = { + **train_args_adamw03_accum2, + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 256 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit/lm%.1f_prior%.2f" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 8.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + + generate_report( # 7.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR" + ) + del results + ###################################################### + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["optimizer"] = {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-2} + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + 
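+                # default_search_args presumably holds the shared flashlight decoder settings (lexicon, vocab, ARPA LM, beam defaults); only lm_weight and prior_scale are varied in this sweep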
"lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay-2/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.9 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay-2" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["optimizer"] = {"class": "adamw", "epsilon": 1e-16, "weight_decay": 5e-3} + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay5-3/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_decay5-3" + ) + del results + + ############################################# + + # Train long basic + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = list(np.linspace(1e-5, 1e-3, 250)) + list(np.linspace(1e-3, 1e-6, 250)) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.6 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500" + ) + del results + + # Train long skewed + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = list(np.linspace(1e-5, 1e-3, 200)) + list(np.linspace(1e-3, 1e-7, 300)) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + 
search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500skewed/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.7 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_ep500skewed" + ) + del results + + bene_model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=6, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=9, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(bene_model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_bene_param/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 92.4, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_bene_param" + ) + del results + + # No Subsampling + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config_nosub = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(1, 1), + pool1_stride=(1, 1), + pool1_padding=None, + pool2_kernel_size=(1, 1), + pool2_stride=(1, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config_nosub = ModelConfig( + frontend_config=frontend_config_nosub, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=2048, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_nosub), + }, + } + train_args["config"]["batch_size"] = 150 * 12000 + train_args["config"]["accum_grad_multiple_step"] = 5 + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + 
list(np.linspace(7e-5, 1e-8, 30)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 16 + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_nosub/lm%.1f_prior%.2f_bs1024" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 10.0 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_JJLR_nosub" + ) + del results + + #### New experiments with corrected FF-Dim + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v2", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2_JJLR" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 
2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + + # beam search token + if lm_weight == 2.0 and prior_scale == 0.5: + for bst in [10, 20, 30, 40, 50]: + search_args = copy.deepcopy(search_args) + search_args["beam_size_token"] = bst + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/lm%.1f_prior%.2f_bs1024_th14_bst_%i" + % (lm_weight, prior_scale, bst), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + # if bst == 20: # Does currently not work since SFTF cannot be onnx exported + # _, search_jobs, _, _ = run_exp( + # prefix_name + # + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/lm%.1f_prior%.2f_bs1024_th14_bst_%i_exp1" + # % (lm_weight, prior_scale, bst), + # datasets=train_data, + # train_args=train_args, + # search_args=search_args, + # with_prior=True, + # decoder="ctc.decoder.flashlight_experimental_phoneme_ctc", + # ) + + # Search GRID + for lm_weight in [1.6, 1.8, 2.0, 2.2, 2.4]: # 5 + for prior_scale in [0.0, 0.3, 0.4, 0.5, 0.6, 0.7]: # 5 + for beam_threshold in [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]: # 12 + # for beam_size in [256, 1024, 4096, 8192]: # 4 + for beam_size in [256, 1024]: # 4 + search_args = { + **copy.deepcopy(default_search_args), + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = beam_size + search_args["beam_threshold"] = beam_threshold + search_args["node"] = "intel" + _, search_jobs, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR/search_grid_intel_full/lm%.1f_prior%.2f_bs%i_th%i" + % (lm_weight, prior_scale, beam_size, beam_threshold), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + for search_job in search_jobs: + search_job.rqmt["sbatch_args"] = "-p rescale_intel -A rescale_speed" + if beam_size > 1024: + search_job.rqmt["mem"] = 12 + elif beam_size > 4096: + search_job.rqmt["mem"] = 16 + + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR" + ) + del results + + # with speed perturbation + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config), + }, + "use_speed_perturbation": True, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + 
+ "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_speed/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.4 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_speed" + ) + del results + + from ..pytorch_networks.ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ( + ModelConfig as ModelConfigV4, + ) + + model_config_v4 = ModelConfigV4( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v5", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_v4), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 180 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR" + ) + del results + # TODO: this here above is the best baseline, use as starting point, giving 7.2% with LM 2.2 and Prior 0.7 + + train_args = copy.deepcopy(train_args) + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 220)) + list(np.linspace(7e-4, 7e-5, 220)) + list(np.linspace(7e-5, 1e-8, 60)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_longerJJLR_500ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500 + ) + results.update(wer_values) + del wer_values + generate_report( # 7.3 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_longerJJLR_500ep" + ) + del results + + train_args = copy.deepcopy(train_args) + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 130)) + list(np.linspace(7e-4, 7e-5, 230)) + list(np.linspace(7e-5, 1e-8, 140)) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": 
prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_flatterJJLR_500ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500 + ) + results.update(wer_values) + del wer_values + generate_report( # 6.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_flatterJJLR_500ep" + ) + del results + + train_args = copy.deepcopy(train_args) + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 31) + [7e-5]) + ) + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_endJJLR_500ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=500 + ) + results.update(wer_values) + del wer_values + generate_report( # 6.9 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_endJJLR_500ep" + ) + del results + + model_config_v4_start11 = copy.deepcopy(model_config_v4) + model_config_v4_start11.specauc_start_epoch = 11 + train_args = copy.deepcopy(train_args) + train_args["net_args"]["model_config_dict"] = asdict(model_config_v4_start11) + train_args["config"]["learning_rates"] = list(np.linspace(1e-5, 1e-3, 150)) + list(np.linspace(1e-3, 1e-5, 150)) + train_args["config"]["batch_size"] = 500 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 1 + train_args["config"]["optimizer"]["weight_decay"] = 1e-2 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_24gb_bs500/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_best=False, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 7.8 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5_24gb_bs500" + ) + del results + + frontend_config_large = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=512, + activation=None, + ) + model_config_large = ModelConfig( + frontend_config=frontend_config_large, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=512, + num_layers=12, + num_heads=4, + ff_dim=2048, + att_weights_dropout=0.2, + 
conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + ) + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_large), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 100 * 16000 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum2/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum2" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_large), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 110)) + list(np.linspace(7e-4, 7e-5, 110)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 100 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.3, 0.5]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum3/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + ) + results.update(wer_values) + del wer_values + generate_report( # 94.4, not converged + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum3" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2), + "network_module": "ctc.conformer_0923.i6modelsV1_VGG4LayerActFrontendV1_v3", + "debug": True, + "net_args": { + "model_config_dict": asdict(model_config_large), + }, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 135)) + list(np.linspace(7e-4, 7e-5, 135)) + list(np.linspace(7e-5, 1e-8, 30)) + ) + train_args["config"]["batch_size"] = 100 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 4 + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum4_300ep/lm%.1f_prior%.2f_bs1024_th14" + % (lm_weight, prior_scale), + datasets=train_data, + 
train_args=train_args, + search_args=search_args, + with_prior=True, + num_epochs=300, + ) + results.update(wer_values) + del wer_values + generate_report( # 7.2 + results=results, exp_name=prefix_name + "conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_JJLR_large_accum4_300ep" + ) + del results diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_pretrain.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_pretrain.py new file mode 100644 index 000000000..faa97ccf5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/exp_pretrain.py @@ -0,0 +1,320 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast, Optional, List + +from i6_core.report.report import _Report_Type + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_phon_training_datasets, TrainingDatasetSettings, get_eow_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + +def flash_phon_ctc_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def pretrained_experiments(): + prefix_name = "experiments/rescale/tedliumv2/flashlight_phon_ctc/" + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_phon_training_datasets(settings=train_settings) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + # TODO: Add binary conversion job + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + with_prior=False, + num_epochs=250, + decoder="ctc.decoder.flashlight_phoneme_ctc", + eval_epochs: Optional[List] = None, + eval_best: bool = 
True, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, keep_epochs=eval_epochs, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if with_prior: + returnn_config = get_prior_config(training_datasets=datasets, **train_args) + prior_file = compute_prior( + ft_name, + returnn_config, + checkpoint=train_job.out_checkpoints[num_epochs], + returnn_exe=RETURNN_EXE, + returnn_root=MINI_RETURNN_ROOT, + ) + tk.register_output(training_name + "/prior.txt", prior_file) + search_args["prior_file"] = prior_file + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + + if eval_epochs is None: + eval_epochs = [num_epochs] + search_job_ls = [] + report = {} + for epoch in eval_epochs: + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % epoch, + returnn_search_config, + train_job.out_checkpoints[epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + from i6_core.returnn import GetBestPtCheckpointJob + if eval_best: + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_ctc") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_phon_ctc_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + default_search_args = { + "lexicon": get_eow_text_lexicon(), + "returnn_vocab": label_datastream.vocab, + "beam_size": 64, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 50, + } + from ..pytorch_networks.ctc.conformer_0923 import hubert_pretrained_v1_cfg + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="base-ls960", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": True, + } + eval_epochs = [250] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + 
**default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "hubert/pretrain_v3_base_tune2_jjlr/lm%.1f_prior%.2f_bs1024_th14" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + eval_best=True + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 6.6 + results=results, exp_name=prefix_name + "hubert/pretrain_v3_base_tune2_jjlr" + ) + del results + + hubert_cfg_2 = hubert_pretrained_v1_cfg.HubertConfig( + finetune_layer=2, + name="large-ls960-ft", + ) + model_config_hubert_2 = hubert_pretrained_v1_cfg.ModelConfig( + specauc_start_epoch=0, + label_target_size=vocab_size_without_blank, + final_dropout=0.2, + hubert_cfg=hubert_cfg_2, + ) + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)) + [7e-5], + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": False, + } + eval_epochs = [250, 300, 350, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "hubert/pretrain_v3_large960_tune2_jjlr_longer/lm%.1f_prior%.2f_bs1024_th14" % ( + lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + eval_best=True, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "hubert/pretrain_v3_large960_tune2_jjlr_longer" + ) + del results + + train_args_hubert_adam_accum25_jjlr = { + "config": { + "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 130)) + + list(np.linspace(7e-4, 7e-5, 230)) + + list(np.linspace(7e-5, 1e-8, 140)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "max_seqs": 3, + "accum_grad_multiple_step": 25, + }, + "debug": False, + } + eval_epochs = [250, 300, 350, 400, 450, 500] + train_args = { + **copy.deepcopy(train_args_hubert_adam_accum25_jjlr), + "network_module": "ctc.conformer_0923.hubert_pretrained_v3", + "net_args": {"model_config_dict": asdict(model_config_hubert_2)}, + } + results = {} + for lm_weight in [1.6, 1.8, 2.0, 2.2]: + for prior_scale in [0.5, 0.7]: + search_args = { + **default_search_args, + "lm_weight": lm_weight, + "prior_scale": prior_scale, + } + search_args["beam_size"] = 1024 + search_args["beam_threshold"] = 14 + train_job, _, _, wer_values = run_exp( + prefix_name + + "hubert/pretrain_v3_large960_tune2_jjlr_longflat/lm%.1f_prior%.2f_bs1024_th14" % ( + 
lm_weight, prior_scale), + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=True, + eval_epochs=eval_epochs, + eval_best=True, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( + results=results, exp_name=prefix_name + "hubert/pretrain_v3_large960_tune2_jjlr_longflat" + ) + del results + + diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/serializer.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/serializer.py new file mode 100644 index 000000000..3f154399c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/flashlight_phon_ctc/serializer.py @@ -0,0 +1,114 @@ +import copy +from sisyphus import tk +from typing import Any, Dict, Optional + +from i6_core.tools.git import CloneGitRepositoryJob + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import ExternalImport + +from .. import PACKAGE + +from i6_experiments.common.setups.serialization import Import, PartialImport + + +def get_pytorch_serializer_v3( + network_module: str, + net_args: Dict[str, Any], + decoder: Optional[str] = None, + decoder_args: Optional[Dict[str, Any]] = None, + post_decoder_args: Optional[Dict[str, Any]] = None, + prior: bool = False, + debug: bool = False, + export:bool = False, + **kwargs +) -> TorchCollection: + """ + + :param network_module: path to the pytorch config file containing Model + :param net_args: extra arguments for the model + :param decoder: path to the search decoder, if provided will link search functions + :param decoder_args: + :param post_decoder_args: + :param prior: build config for prior computation + :param debug: run training in debug mode (linking from recipe instead of copy) + :param kwargs: + :return: + """ + package = PACKAGE + ".pytorch_networks" + + pytorch_model_import = PartialImport( + code_object_path=package + ".%s.Model" % network_module, + unhashed_package_root=PACKAGE, + hashed_arguments=net_args, + unhashed_arguments={}, + import_as="get_model", + ) + pytorch_train_step = Import( + code_object_path=package + ".%s.train_step" % network_module, unhashed_package_root=PACKAGE + ) + # i6_models_repo = CloneGitRepositoryJob( + # url="https://github.com/rwth-i6/i6_models", + # commit="1e94a4d9d1aa48fe3ac7f60de2cd7bd3fea19c3e", + # checkout_folder_name="i6_models" + # ).out_repository + i6_models_repo = tk.Path("/u/hilmes/experiments/nick_asr/i6_models") + i6_models_repo.hash_overwrite = "LIBRISPEECH_DEFAULT_I6_MODELS" + i6_models = ExternalImport(import_path=i6_models_repo) + + serializer_objects = [ + i6_models, + pytorch_model_import, + pytorch_train_step, + ] + if decoder: + # Just a hack to test the phoneme-based recognition + forward_step = Import( + code_object_path=package + ".%s.forward_step" % decoder, + unhashed_package_root=PACKAGE, + ) + init_hook = PartialImport( + code_object_path=package + ".%s.forward_init_hook" % decoder, + unhashed_package_root=PACKAGE, + hashed_arguments=decoder_args or {}, + unhashed_arguments=post_decoder_args or {}, + ) + finish_hook = Import( + code_object_path=package + ".%s.forward_finish_hook" % decoder, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if prior: + forward_step = Import( + code_object_path=package + ".%s.prior_step" % network_module, + 
unhashed_package_root=PACKAGE, + import_as="forward_step", + ) + init_hook = Import( + code_object_path=package + ".%s.prior_init_hook" % network_module, + unhashed_package_root=PACKAGE, + import_as="forward_init_hook", + ) + finish_hook = Import( + code_object_path=package + ".%s.prior_finish_hook" % network_module, + import_as="forward_finish_hook", + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([forward_step, init_hook, finish_hook]) + if export: + export_step = Import( + code_object_path=package + ".%s.export" % network_module, + unhashed_package_root=PACKAGE, + ) + serializer_objects.extend([export_step]) + serializer = TorchCollection( + serializer_objects=serializer_objects, + make_local_package_copy=not debug, + packages={ + package, + }, + ) + + return serializer diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pipeline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pipeline.py new file mode 100644 index 000000000..6ec37b9b5 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pipeline.py @@ -0,0 +1,188 @@ +import copy +import os.path + +from sisyphus import tk + +from i6_experiments.users.rossenbach.common_setups.returnn.datasets import GenericDataset + +from i6_core.returnn.config import ReturnnConfig +from i6_core.returnn.training import ReturnnTrainingJob +from i6_core.returnn.training import GetBestTFCheckpointJob +from i6_core.returnn.forward import ReturnnForwardJob, ReturnnForwardJobV2 +from i6_core.returnn.search import SearchBPEtoWordsJob, ReturnnComputeWERJob +from i6_experiments.users.rossenbach.returnn.training import AverageCheckpointsJobV2 + +from .default_tools import RETURNN_EXE, MINI_RETURNN_ROOT, SCTK_BINARY_PATH + + +@tk.block() +def training(prefix_name, returnn_config, returnn_exe, returnn_root, num_epochs): + """ + + :param prefix_name: + :param returnn_config: + :param returnn_exe: + :param returnn_root: + :return: + """ + default_rqmt = { + "mem_rqmt": 15, + "time_rqmt": 168, + "cpu_rqmt": 4, + "log_verbosity": 5, + "returnn_python_exe": returnn_exe, + "returnn_root": returnn_root, + } + + train_job = ReturnnTrainingJob(returnn_config=returnn_config, num_epochs=num_epochs, **default_rqmt) + train_job.add_alias(prefix_name + "/training") + tk.register_output(prefix_name + "/learning_rates", train_job.out_learning_rates) + + return train_job + + +@tk.block() +def search_single( + prefix_name, + returnn_config, + checkpoint, + recognition_dataset: GenericDataset, + recognition_bliss_corpus, + returnn_exe, + returnn_root, + mem_rqmt=8, + use_gpu=False, +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param returnn_standalone.data.datasets.dataset.GenericDataset recognition_dataset: + :param Path recognition_reference: Path to a py-dict format reference file + :param Path returnn_exe: + :param Path returnn_root: + """ + returnn_config = copy.deepcopy(returnn_config) + returnn_config.config["forward"] = recognition_dataset.as_returnn_opts() + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=24, + device="gpu" if use_gpu else "cpu", + cpu_rqmt=2, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["search_out.py"], + ) + search_job.add_alias(prefix_name + "/search_job") + + search_words = 
SearchBPEtoWordsJob(search_job.out_files["search_out.py"]).out_word_search_results + + from i6_core.returnn.search import SearchWordsToCTMJob + from i6_core.corpus.convert import CorpusToStmJob + from i6_core.recognition.scoring import ScliteJob + + search_ctm = SearchWordsToCTMJob( + recog_words_file=search_words, + bliss_corpus=recognition_bliss_corpus, + ).out_ctm_file + + stm_file = CorpusToStmJob(bliss_corpus=recognition_bliss_corpus).out_stm_path + + sclite_job = ScliteJob(ref=stm_file, hyp=search_ctm, sctk_binary_path=SCTK_BINARY_PATH) + tk.register_output(prefix_name + "/sclite/wer", sclite_job.out_wer) + tk.register_output(prefix_name + "/sclite/report", sclite_job.out_report_dir) + + return sclite_job.out_wer, search_job + + +@tk.block() +def search(prefix_name, returnn_config, checkpoint, test_dataset_tuples, returnn_exe, returnn_root, use_gpu=False): + """ + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param test_dataset_tuples: + :param returnn_exe: + :param returnn_root: + :return: + """ + # use fixed last checkpoint for now, needs more fine-grained selection / average etc. here + wers = {} + search_jobs = [] + for key, (test_dataset, test_dataset_reference) in test_dataset_tuples.items(): + wers[key], search_job = search_single( + prefix_name + "/%s" % key, + returnn_config, + checkpoint, + test_dataset, + test_dataset_reference, + returnn_exe, + returnn_root, + mem_rqmt=16 if not "whisper" in prefix_name else 64, + use_gpu=use_gpu, + ) + search_jobs.append(search_job) + + from i6_core.report import GenerateReportStringJob, MailJob + + format_string_report = ",".join(["{%s_val}" % (prefix_name + key) for key in test_dataset_tuples.keys()]) + format_string = " - ".join( + ["{%s}: {%s_val}" % (prefix_name + key, prefix_name + key) for key in test_dataset_tuples.keys()] + ) + values = {} + values_report = {} + for key in test_dataset_tuples.keys(): + values[prefix_name + key] = key + values["%s_val" % (prefix_name + key)] = wers[key] + values_report["%s_val" % (prefix_name + key)] = wers[key] + + report = GenerateReportStringJob(report_values=values, report_template=format_string, compress=False).out_report + # mail = MailJob(result=report, subject=prefix_name, send_contents=True).out_status + # tk.register_output(os.path.join(prefix_name, "mail_status"), mail) + return format_string_report, values_report, search_jobs + + +@tk.block() +def compute_prior( + prefix_name, + returnn_config, + checkpoint, + returnn_exe, + returnn_root, + mem_rqmt=8, + epoch=None +): + """ + Run search for a specific test dataset + + :param str prefix_name: + :param ReturnnConfig returnn_config: + :param Checkpoint checkpoint: + :param Path returnn_exe: + :param Path returnn_root: + :param Optional[str] epoch: alias generation + """ + search_job = ReturnnForwardJobV2( + model_checkpoint=checkpoint, + returnn_config=returnn_config, + log_verbosity=5, + mem_rqmt=mem_rqmt, + time_rqmt=2 if not "whisper" in prefix_name else 4, + device="gpu", + cpu_rqmt=4, + returnn_python_exe=returnn_exe, + returnn_root=returnn_root, + output_files=["prior.txt"], + ) + if epoch is None: + epoch = "" + else: + epoch = "/" + epoch + search_job.add_alias(prefix_name + "/prior" + epoch) + return search_job.out_files["prior.txt"] diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/__init__.py new file mode 100644 index 000000000..e69de29bb 
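The *_cfg.py files added below all rely on the same serialization pattern: the experiment scripts pass asdict(model_config) as net_args["model_config_dict"], and the network module rebuilds the typed config with from_dict inside Model.__init__. The following minimal, self-contained sketch illustrates that round trip; the class names and the label_target_size value are simplified stand-ins for illustration, not the real i6_models configurations.

# Standalone sketch of the asdict()/from_dict() round trip used by the configs below.
from dataclasses import asdict, dataclass


@dataclass
class HubertConfigSketch:
    # stand-in for the nested sub-config (e.g. HubertConfig)
    name: str
    finetune_layer: int

    @classmethod
    def from_dict(cls, d):
        return cls(**d)


@dataclass
class ModelConfigSketch:
    # stand-in for ModelConfig; field values below are illustrative only
    specauc_start_epoch: int
    label_target_size: int
    final_dropout: float
    hubert_cfg: HubertConfigSketch

    @classmethod
    def from_dict(cls, d):
        d = d.copy()
        # asdict() flattens nested dataclasses into plain dicts,
        # so nested configs have to be reconstructed explicitly
        d["hubert_cfg"] = HubertConfigSketch.from_dict(d["hubert_cfg"])
        return cls(**d)


cfg = ModelConfigSketch(
    specauc_start_epoch=0,
    label_target_size=79,  # arbitrary example value
    final_dropout=0.2,
    hubert_cfg=HubertConfigSketch(name="base-ls960", finetune_layer=2),
)
serialized = asdict(cfg)                      # what ends up in net_args["model_config_dict"]
restored = ModelConfigSketch.from_dict(serialized)  # what Model.__init__ does with it
assert restored == cfg
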
diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v1_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v1_cfg.py new file mode 100644 index 000000000..6278f0a95 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v1_cfg.py @@ -0,0 +1,70 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class HubertConfig(ModelConfiguration): + name: str + finetune_layer: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return HubertConfig(**d) + + +@dataclass +class ModelConfig: + specauc_start_epoch: int + label_target_size: int + final_dropout: float + hubert_cfg: HubertConfig + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["hubert_cfg"] = HubertConfig.from_dict(d["hubert_cfg"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v3.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v3.py new file mode 100644 index 000000000..285eb6a7a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/hubert_pretrained_v3.py @@ -0,0 +1,160 @@ +""" +Same as v1 with fix to finetune layer numbers (range +1) +with additional fix to loading +""" + +import numpy as np +import torch +from torch import nn + +from transformers import HubertModel, HubertConfig +from returnn.torch.context import get_run_ctx +from .hubert_pretrained_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + 
""" + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.hubert_cfg = self.cfg.hubert_cfg + run_ctx = get_run_ctx() + print("TEST", run_ctx.global_step, run_ctx.epoch) + if not run_ctx.global_step and run_ctx.epoch == 1: + print("Load Hubert model parameters") + self.hubert: HubertModel = HubertModel.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.hubert: HubertModel = HubertModel(HubertConfig.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + if self.training: + for param in self.hubert.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.hubert_cfg.finetune_layer + 1): + for name, param in self.hubert.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.hubert.encoder.named_parameters(): + if param.requires_grad: + print(name) + self.final_linear = nn.Linear(self.hubert.config.hidden_size, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + assert any(param.requires_grad for param in self.hubert.parameters()) or self.hubert_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio, dim=-1) + hubert_outputs = self.hubert(input_values=squeezed_features) + encoder_output = hubert_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + return log_probs, self.hubert._get_feat_extract_output_lengths(raw_audio_len) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = 
run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) + + +def export(*, model: Model, f: str, **kwargs): + from torch.onnx import export + model.export_mode = True + dummy_data = torch.randn(1, 30000, 1) + dummy_data_len = torch.IntTensor([30000]) + export( + model, + (dummy_data, dummy_data_len), + f=f, + verbose=True, + input_names=["data", "data_len"], + output_names=["classes"], + dynamic_axes={ + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "classes": {0: "batch", 1: "time"}, + }, + opset_version=17, + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit.py new file mode 100644 index 000000000..af6e468d3 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit.py @@ -0,0 +1,330 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.common import mask_pool + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +from .i6modelsV1_2x1D_frontend_xavierinit_cfg import TwoLayer1DFrontendConfig, ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TwoLayer1DFrontend(nn.Module): + """ + Convolutional Front-End using two 1-D Convolutions + + + - Contains Batch-Norm, but no activation functions. + - Applies absolute positional encoding on the output. + - With additional linear mapping + """ + + def __init__(self, model_cfg: TwoLayer1DFrontendConfig): + """ + :param model_cfg: model configuration for this module + """ + super().__init__() + + model_cfg.check_valid() + + self.cfg = model_cfg + + self.conv1 = nn.Conv1d( + in_channels=model_cfg.in_features, + out_channels=model_cfg.conv1_channels, + kernel_size=model_cfg.conv1_kernel_size, + stride=model_cfg.conv1_stride, + ) + self.conv2 = nn.Conv1d( + in_channels=model_cfg.conv1_channels, + out_channels=model_cfg.conv2_channels, + kernel_size=model_cfg.conv2_kernel_size, + stride=model_cfg.conv2_stride, + ) + + self.bn1 = nn.BatchNorm1d(num_features=model_cfg.conv1_channels) + self.bn2 = nn.BatchNorm1d(num_features=model_cfg.conv2_channels) + self.pos_encoding = ESPNetPositionalEncoding(model_cfg.conv2_channels, model_cfg.dropout) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + T might be reduced to T' or T'' depending on stride of the layers + + stride is only allowed for the pool1 and pool2 operation. 
+ other ops do not have stride configurable -> no update of mask sequence required but added anyway + + :param tensor: input tensor of shape [B,T,F] + :param sequence_mask: the sequence mask for the tensor + :return: torch.Tensor of shape [B,T",F'] and the shape of the sequence mask + """ + tensor = tensor.permute(0, 2, 1) # [B,T,F] -> [B,C,T] + + tensor = self.conv1(tensor) + tensor = self.bn1(tensor) + sequence_mask = mask_pool( + seq_mask=sequence_mask, + kernel_size=self.conv1.kernel_size[0], + stride=self.conv1.stride[0], + padding=self.conv1.padding[0], + ) + + tensor = self.conv2(tensor) + tensor = self.bn2(tensor) + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.conv2.kernel_size[0], + stride=self.conv2.stride[0], + padding=self.conv2.padding[0], + ) + + tensor = tensor.permute(0, 2, 1) # [B,C,T] -> [B, T, hidden] + tensor = self.pos_encoding(tensor) + + return tensor, sequence_mask + + def _calculate_dim(self) -> int: + return self.conv2.out_channels + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=TwoLayer1DFrontend, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + 
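+        # mask_tensor below builds a boolean [B, T] mask that is True for real frames and
+        # False for padding, e.g. audio_features_len = [2, 3] with T = 3 gives
+        # [[True, True, False], [True, True, True]]; the Conformer ignores the masked frames.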
conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_cfg.py new file mode 100644 index 000000000..f65ac2482 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_2x1D_frontend_xavierinit_cfg.py @@ -0,0 +1,95 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass +class TwoLayer1DFrontendConfig(ModelConfiguration): + """ + Attributes: + in_features: number of input features to module + conv1_channels: number of channels for first conv layer + conv2_channels: number of channels for second conv layer + """ + + in_features: int + conv1_channels: int + conv2_channels: int + conv1_kernel_size: int + conv1_stride: int + conv2_kernel_size: int + conv2_stride: int + 
dropout: float + + def check_valid(self): + pass + + def __post__init__(self): + super().__post_init__() + self.check_valid() + + @classmethod + def from_dict(cls, d): + d = d.copy() + return TwoLayer1DFrontendConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: TwoLayer1DFrontendConfig + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = TwoLayer1DFrontendConfig.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1.py new file mode 100644 index 000000000..89762f151 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1.py @@ -0,0 +1,195 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
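+        # All weights keep PyTorch's default initialization here; the *_xavierinit variants in
+        # this directory additionally apply nn.init.xavier_uniform_ to Conv1d and Linear weights.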
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py new file mode 100644 index 000000000..f120d4c5f --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_cfg.py @@ -0,0 +1,85 @@ +""" +Config objects for the base CTC models v1 till v3 +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst.py new file mode 100644 index 000000000..83002a44d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_convfirst.py @@ -0,0 +1,263 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import 
VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ConformerBlockV1ConvFirst(nn.Module): + """ + Conformer block module with convolution first + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class ConformerEncoderV1ConvFirst(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList( + [ConformerBlockV1ConvFirst(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + for module in self.module_list: + x = module(x, sequence_mask) # [B, T, F'] + + return x, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1ConvFirst(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
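+        # self.conformer stacks ConformerBlockV1ConvFirst modules (defined above), so within each
+        # block the convolution is applied before self-attention: FF -> Conv -> MHSA -> FF,
+        # followed by a final LayerNorm.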
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc.py new file mode 100644 index 000000000..480b7f952 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc.py @@ -0,0 +1,255 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. 
+ + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class VGG4LayerActFrontendV1PosEnc(VGG4LayerActFrontendV1): + def __init__(self, cfg: VGG4LayerActFrontendV1Config): + super().__init__(cfg) + self.posenc = ESPNetPositionalEncoding(self.cfg.out_features, 0.1) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + tensor, sequence_mask = super().forward(tensor, sequence_mask) + tensor = self.posenc(tensor) + return tensor, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1PosEnc, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
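+        # VGG4LayerActFrontendV1PosEnc (used as the frontend here) adds ESPNet-style absolute
+        # sinusoidal positional encodings (dropout 0.1) to the frontend output before it enters
+        # the Conformer blocks.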
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst.py new file mode 100644 index 000000000..25aa19a3d --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_convfirst.py @@ -0,0 +1,315 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. 
+ + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class ConformerBlockV1ConvFirst(nn.Module): + """ + Conformer block module with convolution first + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class ConformerEncoderV1ConvFirst(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(cfg.frontend.cfg.out_features, 0.1) + self.module_list = torch.nn.ModuleList( + [ConformerBlockV1ConvFirst(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + for module in self.module_list: + x = module(x, sequence_mask) # [B, T, F'] + + return x, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + 
activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1ConvFirst(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit.py new file mode 100644 index 000000000..74c17b41a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_posenc_xavierinit.py @@ -0,0 +1,262 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class VGG4LayerActFrontendV1PosEnc(VGG4LayerActFrontendV1): + def __init__(self, cfg: VGG4LayerActFrontendV1Config): + super().__init__(cfg) + self.posenc = ESPNetPositionalEncoding(self.cfg.out_features, 0.1) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + tensor, sequence_mask = super().forward(tensor, sequence_mask) + tensor = self.posenc(tensor) + return tensor, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1PosEnc, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if 
self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg.py new file mode 100644 index 000000000..39ed46b44 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg.py @@ -0,0 +1,89 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import 
ConformerBlockV1Config, ConformerBlockV1
+from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config
+from i6_models.config import ModuleFactoryV1, ModelConfiguration
+
+
+@dataclass(kw_only=True)
+class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config):
+    activation_str: str = ""
+    activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None
+
+    @classmethod
+    def from_dict(cls, d):
+        d = d.copy()
+        activation_str = d.pop("activation_str")
+        if activation_str == "ReLU":
+            from torch.nn import ReLU
+
+            activation = ReLU()
+        else:
+            assert False, "Unsupported activation %s" % activation_str
+        d["activation"] = activation
+        return VGG4LayerActFrontendV1Config(**d)
+
+
+@dataclass
+class TransparentConformerEncoderV2Config(ModelConfiguration):
+    """
+    Attributes:
+        num_layers: Number of conformer layers in the conformer encoder
+        frontend: A pair of ConformerFrontend and corresponding config
+        block_cfg: Configuration for ConformerBlockV1
+        transparent_weights: initial combination weight per tapped output
+            (key 0 = frontend output, key i = output of conformer block i)
+    """
+
+    num_layers: int
+    transparent_weights: dict[int, float]
+
+    # nested configurations
+    frontend: ModuleFactoryV1
+    block_cfg: ConformerBlockV1Config
+
+
+@dataclass
+class SpecaugConfig(ModelConfiguration):
+    repeat_per_n_frames: int
+    max_dim_time: int
+    num_repeat_feat: int
+    max_dim_feat: int
+
+    @classmethod
+    def from_dict(cls, d):
+        d = d.copy()
+        return SpecaugConfig(**d)
+
+
+@dataclass
+class ModelConfig:
+    frontend_config: VGG4LayerActFrontendV1Config
+    specaug_config: SpecaugConfig
+    label_target_size: int
+    conformer_size: int
+    num_layers: int
+    num_heads: int
+    ff_dim: int
+    att_weights_dropout: float
+    conv_dropout: float
+    ff_dropout: float
+    mhsa_dropout: float
+    conv_kernel_size: int
+    final_dropout: float
+    transparent_weights: dict[int, float]
+
+    @classmethod
+    def from_dict(cls, d):
+        d = d.copy()
+        d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"])
+        d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"])
+        return ModelConfig(**d)
diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2.py
new file mode 100644
index 000000000..2f02ac93b
--- /dev/null
+++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v2.py
@@ -0,0 +1,193 @@
+"""
+Like the initial version, but with correctly set FF_dim
+"""
+
+import numpy as np
+import torch
+from torch import nn
+from typing import Tuple
+import math
+
+from i6_models.parts.conformer.norm import LayerNormNC
+from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config
+from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1
+from i6_models.config import ModuleFactoryV1
+from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1
+
+from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config
+from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config
+from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config
+
+from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig
+from ...specaugment import (
+    returnn_specaugment_by_length,
+)
+
+from ...legacy_feature_extraction import 
LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # No particular weight init! 
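+        # Hedged usage sketch (comment only; the example values below are assumptions,
+        # not part of this setup): the module takes a plain config dict and maps raw
+        # audio to CTC log-probs plus down-sampled frame lengths:
+        #   model = Model(model_config_dict=cfg_dict)
+        #   wave = torch.randn(2, 16000, 1)          # [B, T, 1] raw samples
+        #   wave_len = torch.tensor([16000, 12000])  # [B]
+        #   log_probs, frame_len = model(raw_audio=wave, raw_audio_len=wave_len)
+        #   # log_probs: [B, frames, label_target_size + 1]; frame_len is used as
+        #   # input_lengths for ctc_loss in train_step below.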
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3.py new file mode 100644 index 000000000..25b5bb663 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3.py @@ -0,0 +1,189 @@ +""" +Like v2, but with i6_models specaugment +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! 
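+        # This variant uses i6_models' specaugment_v1_by_length instead of the
+        # returnn_specaugment_by_length used in v2. The SpecaugConfig fields map onto
+        # its arguments as called in forward() below:
+        #   repeat_per_n_frames -> time_max_mask_per_n_frames
+        #   max_dim_time        -> time_mask_max_size
+        #   num_repeat_feat     -> freq_max_num_masks
+        #   max_dim_feat        -> freq_mask_max_size
+        # (time_min_num_masks and freq_min_num_masks are fixed to 2 here.)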
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_transparent_v2.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_transparent_v2.py new file mode 100644 index 000000000..97b723e66 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_transparent_v2.py @@ -0,0 +1,348 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class ConvFirstConformerBlockV1(nn.Module): + """ + Conformer block module + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList( + [ConvFirstConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, 
self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + if isinstance(module, (torch.nn.MultiheadAttention)): + if module._qkv_same_embed_dim: + print("apply 1/sqrt(2) scaled xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.in_proj_weight, gain=1 / np.sqrt(2)) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = 
model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_v2_transparent_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_v2_transparent_v2.py new file mode 100644 index 000000000..0fb3cb41c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_convfirst_posenc_xavierinit_v2_transparent_v2.py @@ -0,0 +1,352 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config, ConformerConvolutionV1 +from i6_models.parts.conformer.feedforward import ( + ConformerPositionwiseFeedForwardV1Config, + ConformerPositionwiseFeedForwardV1, +) +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config, ConformerMHSAV1 +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class ConvFirstConformerBlockV1(nn.Module): + """ + Conformer block module + """ + + def __init__(self, cfg: ConformerBlockV1Config): + """ + :param cfg: conformer block configuration with subunits for the different conformer parts + """ + super().__init__() + self.ff1 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.mhsa = ConformerMHSAV1(cfg=cfg.mhsa_cfg) + self.conv = ConformerConvolutionV1(model_cfg=cfg.conv_cfg) + self.ff2 = ConformerPositionwiseFeedForwardV1(cfg=cfg.ff_cfg) + self.final_layer_norm = torch.nn.LayerNorm(cfg.ff_cfg.input_dim) + + def forward(self, x: torch.Tensor, /, sequence_mask: torch.Tensor) -> torch.Tensor: + """ + :param x: input tensor of shape [B, T, F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T] + :return: torch.Tensor of shape [B, T, F] + """ + x = 0.5 * self.ff1(x) + x # [B, T, F] + x = self.conv(x) + x # [B, T, F] + x = self.mhsa(x, sequence_mask) + x # [B, T, F] + x = 0.5 * self.ff2(x) + x # [B, T, F] + x = self.final_layer_norm(x) # [B, T, F] + return x + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList( + [ConvFirstConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)] + ) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, 
self.cfg.label_target_size + 1) # + CTC blank
+        self.final_dropout = nn.Dropout(p=self.cfg.final_dropout)
+
+        # initialize weights (Xavier uniform; attention projections scaled by 1/sqrt(2))
+        self.apply(self._weight_init)
+
+    @staticmethod
+    def _weight_init(module: torch.nn.Module):
+        from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
+
+        if isinstance(module, NonDynamicallyQuantizableLinear):
+            nn.init.xavier_uniform_(module.weight, gain=1 / np.sqrt(2))
+        elif isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)):
+            print("apply xavier uniform weight init for %s" % str(module))
+            nn.init.xavier_uniform_(module.weight)
+        elif isinstance(module, (torch.nn.MultiheadAttention)):
+            if module._qkv_same_embed_dim:
+                print("apply 1/sqrt(2) scaled xavier uniform weight init for %s" % str(module))
+                nn.init.xavier_uniform_(module.in_proj_weight, gain=1 / np.sqrt(2))
+
+    def forward(
+        self,
+        raw_audio: torch.Tensor,
+        raw_audio_len: torch.Tensor,
+    ):
+        """
+        :param raw_audio: Audio samples as [B, T, 1]
+        :param raw_audio_len: length of T as [B]
+        :return: logprobs [B, T, #labels + blank]
+        """
+
+        squeezed_features = torch.squeeze(raw_audio)
+        with torch.no_grad():
+            audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len)
+
+        if self.training:
+            audio_features_masked_2 = specaugment_v1_by_length(
+                audio_features,
+                time_min_num_masks=2,
+                time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames,
+                time_mask_max_size=self.cfg.specaug_config.max_dim_time,
+                freq_min_num_masks=2,
+                freq_mask_max_size=self.cfg.specaug_config.max_dim_feat,
+                freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat,
+            )
+        else:
+            audio_features_masked_2 = audio_features
+
+        conformer_in = audio_features_masked_2
+        # create the mask for the conformer input
+        mask = mask_tensor(conformer_in, audio_features_len)
+
+        conformer_out, out_mask = self.conformer(conformer_in, mask)
+        conformer_out = self.final_dropout(conformer_out)
+        logits = self.final_linear(conformer_out)
+
+        log_probs = torch.log_softmax(logits, dim=2)
+
+        return log_probs, torch.sum(out_mask, dim=1)
+
+
+def train_step(*, model: Model, data, run_ctx, **kwargs):
+
+    raw_audio = data["raw_audio"]  # [B, T', F]
+    raw_audio_len = data["raw_audio:size1"]  # [B]
+
+    labels = data["labels"]  # [B, N] (sparse)
+    labels_len = data["labels:size1"]  # [B, N]
+
+    logprobs, audio_features_len = model(
+        raw_audio=raw_audio,
+        raw_audio_len=raw_audio_len,
+    )
+    transposed_logprobs = torch.permute(logprobs, (1, 0, 2))  # CTC needs [T, B, F]
+    ctc_loss = nn.functional.ctc_loss(
+        transposed_logprobs,
+        labels,
+        input_lengths=audio_features_len,
+        target_lengths=labels_len,
+        blank=model.cfg.label_target_size,
+        reduction="sum",
+        zero_infinity=True,
+    )
+    num_phonemes = torch.sum(labels_len)
+    run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes)
+
+
+def prior_init_hook(run_ctx, **kwargs):
+    # accumulate summed label posteriors and frame counts over the forward data
+    # to estimate the CTC label prior (written to prior.txt in prior_finish_hook)
+    run_ctx.sum_probs = None
+    run_ctx.sum_frames = 0
+
+
+def prior_finish_hook(run_ctx, **kwargs):
+    all_frames = run_ctx.sum_frames.detach().cpu().numpy()
+    all_probs = run_ctx.sum_probs.detach().cpu().numpy()
+    average_probs = all_probs / all_frames
+    log_average_probs = np.log(average_probs)
+    print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs))
+    with open("prior.txt", "w") as f:
+        np.savetxt(f, log_average_probs, delimiter=" ")
+    print("Saved prior in prior.txt in +log space.")
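+
+
+# Illustrative sketch: how the prior written above is typically consumed at search
+# time. This helper is hypothetical (it is not referenced anywhere in this setup);
+# only the file format (one natural-log probability per label, whitespace separated)
+# follows from prior_finish_hook. The "prior_scale" weight is an assumed decoder
+# parameter.
+def _apply_label_prior_sketch(logprobs: torch.Tensor, prior_file: str, prior_scale: float) -> torch.Tensor:
+    """
+    :param logprobs: CTC log-probs [B, T, #labels + blank] as returned by Model.forward
+    :param prior_file: path to a "prior.txt" written by prior_finish_hook
+    :param prior_scale: interpolation weight for the prior subtraction
+    :return: prior-corrected scores [B, T, #labels + blank] for beam search
+    """
+    log_prior = np.loadtxt(prior_file)  # [#labels + blank], natural log space
+    log_prior_t = torch.tensor(log_prior, device=logprobs.device, dtype=logprobs.dtype)
+    return logprobs - prior_scale * log_prior_t[None, None, :]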
+ + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent.py new file mode 100644 index 000000000..09b73d036 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent.py @@ -0,0 +1,286 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + 
), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + 
raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug.py new file mode 100644 index 000000000..ea8c7ab4e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_latespecaug.py @@ -0,0 +1,289 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + 
), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch > 10: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + 
raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_v2.py new file mode 100644 index 000000000..35f4adf1f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_transparent_v2.py @@ -0,0 +1,301 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = 
self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = 
run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_xavierinit_transparent_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_xavierinit_transparent_v2.py new file mode 100644 index 000000000..185de8ca9 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_posenc_xavierinit_transparent_v2.py @@ -0,0 +1,313 @@ +""" +Like v2, but with i6_models specaugment +""" + +import math +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_transparent_v2_cfg import ModelConfig, TransparentConformerEncoderV2Config + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. 
+ """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. + + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TransparentConformerEncoderV2(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: TransparentConformerEncoderV2Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.posenc = ESPNetPositionalEncoding(d_model=cfg.block_cfg.ff_cfg.input_dim, dropout_rate=0.0) + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((len(cfg.transparent_weights),))) + self.transparent_keys = list(cfg.transparent_weights.keys()) + + torch.nn.init.zeros_(self.transparent_scales) + with torch.no_grad(): + for i, (k, v) in enumerate(sorted(cfg.transparent_weights.items())): + self.transparent_scales[i] = v + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + x = self.posenc(x) + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + if 0 in self.transparent_keys: + final = transparent_weights[0] * x + scale_index = 1 + else: + final = 0 * x + scale_index = 0 + + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + if (i + 1) in self.transparent_keys: + # the current layer is part of the transparent layers, add to final and shift index value + final = final + (transparent_weights[scale_index] * x) + scale_index += 1 + 
+ return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = TransparentConformerEncoderV2Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + transparent_weights=self.cfg.transparent_weights, + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV2(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No particular weight init! + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + if isinstance(module, (torch.nn.MultiheadAttention)): + if module._qkv_same_embed_dim: + print("apply 1/sqrt(2) scaled xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.in_proj_weight, gain=1 / np.sqrt(2)) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + 
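    # RETURNN passes the dynamic sequence lengths of a tensor under "<data_key>:size1"
    # (the sizes along axis 1, here the time axis), so the entry read below is the
    # per-sequence number of audio samples belonging to data["raw_audio"].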
raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_transparent.py new file mode 100644 index 000000000..9bc7c8c77 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v3_transparent.py @@ -0,0 +1,233 @@ +""" +Like v2, but with i6_models specaugment +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function 
is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + # No 
particular weight init! + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4.py new file mode 100644 index 
000000000..2926a5a1e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4.py @@ -0,0 +1,194 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! 
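What v4 adds over v3 is only the controllable start epoch for SpecAugment: the forward pass below applies specaugment_v1_by_length once the RETURNN epoch has reached specauc_start_epoch. A minimal sketch of that schedule, with a hypothetical start_epoch of 11 (the real check uses run_ctx.epoch and self.specaug_start_epoch):

def specaug_active(training: bool, epoch: int, start_epoch: int) -> bool:
    # mirrors "self.training and run_ctx.epoch >= self.specaug_start_epoch" in forward() below
    return training and epoch >= start_epoch

assert specaug_active(training=True, epoch=1, start_epoch=11) is False    # warm-up, no masking yet
assert specaug_active(training=True, epoch=11, start_epoch=11) is True    # masking from start_epoch on
assert specaug_active(training=False, epoch=99, start_epoch=11) is False  # never outside training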
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py new file mode 100644 index 000000000..c5ff0e77c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py @@ -0,0 +1,86 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5.py new file mode 100644 index 000000000..6f29f7364 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_v5.py @@ -0,0 +1,194 @@ +""" +Like v2, but with i6_models specaugment (v3) +and now controllable start time for when specaugment is applied (v4) +and with the proper feature extraction from i6-models (v5) +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 
import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + # No particular weight init! 
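mask_tensor above, together with the torch.sum(out_mask, dim=1) in forward further down, is the whole length-handling convention of these networks: a boolean [B, T] mask is carried through the encoder and its row sums give back the (possibly down-sampled) sequence lengths. A small self-contained check of that convention, using hypothetical toy shapes rather than anything from this diff:

import torch

feats = torch.zeros(2, 5, 80)                     # [B, T, F], toy batch
lens = torch.tensor([5, 3])                       # [B]
r = torch.arange(feats.shape[1])                  # same construction as mask_tensor above
mask = torch.less(r[None, :], lens[:, None])      # [B, T] boolean, True = real frame
assert mask.tolist() == [[True, True, True, True, True],
                         [True, True, True, False, False]]
assert torch.sum(mask, dim=1).tolist() == [5, 3]  # lengths recovered, as done for out_mask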
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit.py new file mode 100644 index 000000000..bf3dbf043 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/i6modelsV1_VGG4LayerActFrontendV1_xavierinit.py @@ -0,0 +1,201 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config, VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .i6modelsV1_VGG4LayerActFrontendV1_cfg import ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False 
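The self.apply(self._weight_init) call just below hands every submodule to _weight_init, so the xavier-uniform rule only has to name the module types it cares about. A toy illustration of that pattern, with hypothetical layer sizes and the same init rule as in this file:

import torch
from torch import nn

def _weight_init(module: torch.nn.Module):
    # same rule as Model._weight_init below: only Conv1d and Linear weights are re-initialized
    if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)):
        nn.init.xavier_uniform_(module.weight)

toy = nn.Sequential(nn.Conv1d(80, 256, kernel_size=3), nn.ReLU(), nn.Linear(256, 100))
toy.apply(_weight_init)  # nn.Module.apply() visits every submodule recursively, then the module itself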
+ + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply xavier uniform weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/__init__.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/hubert_pretrained_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/hubert_pretrained_v1.py new file mode 100644 index 000000000..ed320c7e3 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/hubert_pretrained_v1.py @@ -0,0 +1,136 @@ +""" +Based on i6modelsV1_VGG4LayerActFrontendV1_v5, modified to include Hubert pretraining. +""" + +import numpy as np +import torch +from torch import nn + +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from transformers import HubertModel + +from returnn.torch.context import get_run_ctx + +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.hubert_pretrained_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.hubert_cfg = self.cfg.hubert_cfg + self.hubert: HubertModel = HubertModel.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.hubert.config.hidden_size, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + for param in self.hubert.parameters(): + param.requires_grad_(False) + for layer_num in range(self.hubert_cfg.finetune_layer): + print(self.hubert.encoder.layers[-layer_num]) + print(layer_num) + for name, param in self.hubert.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + self.final_linear = nn.Linear(self.hubert.config.hidden_size, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
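+ # Fine-tuning note: the loop above freezes all Hubert parameters and then re-enables gradients
+ # for `finetune_layer` encoder layers. Since `range(finetune_layer)` starts at 0 and
+ # `layers[-0]` is `layers[0]`, the first iteration unfreezes the bottom-most layer rather than
+ # the top one; the whisper_pretrained_v4 variant later in this diff switches to
+ # `range(1, finetune_layer + 1)` for exactly this reason. A minimal sketch of the
+ # "unfreeze the last N layers" form (illustrative placeholder names, not applied here):
+ #     for layer_num in range(1, n_finetune_layers + 1):
+ #         for param in encoder.layers[-layer_num].parameters():
+ #             param.requires_grad_(True)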
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + squeezed_features = torch.squeeze(raw_audio, dim=-1) + hubert_outputs = self.hubert(input_values=squeezed_features) + encoder_output = hubert_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + #encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + #encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + return log_probs, self.hubert._get_feat_extract_output_lengths(raw_audio_len) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_modules_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_modules_v1.py new file mode 100644 index 000000000..fb140340d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_modules_v1.py @@ -0,0 +1,89 @@ +from typing import Optional, Iterable +import torch +from torch import nn +import torch.nn.functional as F +import whisper +from whisper.model import MultiHeadAttention, LayerNorm, Linear, Tensor, Conv1d, sinusoids + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, n_state: int, 
n_head: int, cross_attention: bool = False, dropout: float = 0.0): + super().__init__() + + self.attn = MultiHeadAttention(n_state, n_head) + self.attn_ln = LayerNorm(n_state) + + self.cross_attn = MultiHeadAttention(n_state, n_head) if cross_attention else None + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + + n_mlp = n_state * 4 + self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)) + self.dropout = nn.Dropout(p=dropout) if dropout != 0.0 else None + self.mlp_ln = LayerNorm(n_state) + + def forward( + self, + x: Tensor, + xa: Optional[Tensor] = None, + mask: Optional[Tensor] = None, + kv_cache: Optional[dict] = None, + ): + x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0] + if self.cross_attn: + x = x + self.cross_attn(self.cross_attn_ln(x), xa, kv_cache=kv_cache)[0] + y = self.mlp_ln(x) + y = self.mlp[0](y) + y = self.mlp[1](y) + if self.dropout: + y = self.dropout(y) + y = self.mlp[2](y) + x = x + y + return x + + +class Whisper(nn.Module): + def __init__(self, dims: whisper.ModelDimensions, dropout: float): + super().__init__() + self.dims = dims + self.encoder = AudioEncoder( + self.dims.n_mels, + self.dims.n_audio_ctx, + self.dims.n_audio_state, + self.dims.n_audio_head, + self.dims.n_audio_layer, + dropout=dropout, + ) + + def forward(self, mel: torch.Tensor) -> torch.Tensor: + return self.encoder(mel) + + +class AudioEncoder(nn.Module): + def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int, dropout: float): + super().__init__() + self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1) + self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head, dropout=dropout) for _ in range(n_layer)] + ) + self.ln_post = LayerNorm(n_state) + + def forward(self, x: Tensor): + """ + x : torch.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = x.permute(0, 2, 1) + + assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape" + x = (x + self.positional_embedding).to(x.dtype) + + for block in self.blocks: + x = block(x) + + x = self.ln_post(x) + return x diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1.py new file mode 100644 index 000000000..33e4d167d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1.py @@ -0,0 +1,196 @@ +""" +Based on i6modelsV1_VGG4LayerActFrontendV1_v5, modified to include whisper pretraining. 
+""" + +import numpy as np +import torch +from torch import nn + +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +import whisper +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.old_unusued.whisper_modules_v1 import \ + Whisper +from whisper.audio import N_FRAMES, pad_or_trim + +from returnn.torch.context import get_run_ctx + +from .whisper_pretrained_v1_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + if self.whisper_cfg.just_encoder: + with open(f"/work/asr4/hilmes/debug/whisper/{self.whisper_cfg.name}.pt", "rb") as f: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.whisper_checkpoint = torch.load(f, map_location=device) + self.whisper_dims = whisper.ModelDimensions(**self.whisper_checkpoint["dims"]) + self.whisper = Whisper(self.whisper_dims, self.whisper_cfg.dropout) + else: + raise NotImplementedError + + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.dims.n_audio_state, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + self.model_dict = self.whisper.state_dict() + print(self.model_dict.keys()) + pretrained_dict = {k: v for k, v in self.whisper_checkpoint["model_state_dict"].items() if k in self.model_dict} + print(pretrained_dict.keys()) + self.whisper.load_state_dict(pretrained_dict) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(self.whisper_cfg.finetune_layer): + for param in self.whisper.encoder.blocks[-layer_num].parameters(): + param.requires_grad_(True) + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) + if audio_features_masked_2.shape[-1] > N_FRAMES: + audio_features_masked_2 = pad_or_trim(audio_features_masked_2, 2 * N_FRAMES, axis=-1) + trans_audio_mel_features_1: torch.Tensor = audio_features_masked_2.index_select( + dim=-1, index=torch.arange(end=N_FRAMES, device=audio_features_masked_2.device) + ) + trans_audio_mel_features_2: torch.Tensor = audio_features_masked_2.index_select( + dim=-1, index=torch.arange(start=N_FRAMES, end=2 * N_FRAMES, device=audio_features_masked_2.device) + ) + x_1: torch.Tensor = self.whisper.encoder(trans_audio_mel_features_1) + x_1 = self.upsampling_layer(x_1.transpose(1, 2)).transpose(1, 2) + x_2: torch.Tensor = self.whisper.encoder(trans_audio_mel_features_2) + x_2 = self.upsampling_layer(x_2.transpose(1, 2)).transpose(1, 2) + x = torch.cat((x_1, x_2), dim=1) + else: + audio_features_masked_2 = pad_or_trim(audio_features_masked_2, N_FRAMES) + x: torch.Tensor = self.whisper.encoder(audio_features_masked_2) + x = self.upsampling_layer(x.transpose(1, 2)).transpose(1, 2) + # create the mask for the conformer input + out_mask = mask_tensor(x, audio_features_len) + conformer_out = self.final_dropout(x) + conformer_out = conformer_out[:, :audio_features_len.max(), :] + logits = self.final_linear(conformer_out) + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + 
zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1_cfg.py new file mode 100644 index 000000000..1a611d3fa --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v1_cfg.py @@ -0,0 +1,77 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class WhisperConfig(ModelConfiguration): + name: str + just_encoder: bool + finetune_layer: int + split_seq: bool + dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return WhisperConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + final_dropout: float + whisper_config: WhisperConfig + + @classmethod + def from_dict(cls, d): 
+ d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + d["whisper_config"] = WhisperConfig.from_dict(d["whisper_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v2.py new file mode 100644 index 000000000..272b44c4e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v2.py @@ -0,0 +1,158 @@ +""" +Based on i6modelsV1_VGG4LayerActFrontendV1_v5, modified to include whisper pretraining. +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(self.whisper_cfg.finetune_layer): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + print(name) + print(param) + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
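+ # Loading note: the WhisperModel built here from the config alone starts with random weights;
+ # the pretrained parameters are only pulled in lazily inside forward() at global_step 0 /
+ # epoch 1. Rebinding self.whisper there discards the requires_grad flags set in this __init__,
+ # and the guarding assert checks `param.require_grad` (missing "s"), which would most likely
+ # raise an AttributeError; the v3 file further down spells it `requires_grad`. This variant
+ # sits under old_unusued/ and is kept only as a reference point for v3/v4.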
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + assert any(param.require_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio) + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + # TODO: try to remove specagument for now + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # 
[B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v3.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v3.py new file mode 100644 index 000000000..95415805d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v3.py @@ -0,0 +1,159 @@ +""" +Same as v2 with fix to finetune layer numbers (range +1) +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from i6_experiments.users.hilmes.experiments.nick_setups.tedlium2_standalone_2023.pytorch_networks.ctc.conformer_0923.whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(self.whisper_cfg.finetune_layer): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + print(name) + print(param) + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + assert any(param.requires_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio) + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + #audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) # B, F, T + # TODO: try to remove specagument for now + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + 
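+ # Prior-estimation note: this step accumulates the per-frame softmax posteriors over the whole
+ # forward dataset and prior_finish_hook above divides by the total number of frames, so the
+ # value written to prior.txt is
+ #     log_prior[k] = log( (1 / sum_b T_b) * sum_{b,t} p(k | x_{b,t}) ).
+ # Such a label prior is typically subtracted with some scale from the CTC log-posteriors
+ # during search; how it is applied depends on the decoder configuration used in this setup.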
raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v4.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v4.py new file mode 100644 index 000000000..501409628 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/old_unusued/whisper_pretrained_v4.py @@ -0,0 +1,164 @@ +""" +v3: with fix to finetune layer numbers (range +1) +v4: change loading of whisper +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from .whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + print("Load Whisper model parameters") + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.whisper_cfg.finetune_layer + 1): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.whisper.encoder.named_parameters(): + if param.requires_grad: + print(name) + self.upsampling_layer = torch.nn.ConvTranspose1d( + in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + ) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
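+ # v4 loading note: compared to v2/v3, the pretrained Whisper weights are fetched here in
+ # __init__, and only at global_step 0 / epoch 1; on resumed runs the randomly initialized
+ # skeleton is kept, presumably so that the training checkpoint restores the weights instead.
+ # The unfreeze loop now runs over range(1, finetune_layer + 1), i.e. the last `finetune_layer`
+ # encoder layers (the "range +1" fix referred to in the docstrings).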
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + assert any(param.requires_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio) + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + #audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) # B, F, T + # TODO: try to remove specagument for now + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + assert input_features.shape[2] == whisper_outputs.shape[1], (input_features.shape, whisper_outputs.shape) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.upsampling_layer(encoder_output.transpose(1, 2)).transpose(1, 2) + encoder_output = encoder_output[:, :torch.sum(attention_mask, dim=1).max(), :] + logits = self.final_linear(encoder_output) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + 
run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit.py new file mode 100644 index 000000000..5224bad9a --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit.py @@ -0,0 +1,371 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +import numpy as np +import torch +from torch import nn +from typing import Tuple +import math + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config +from i6_models.parts.frontend.common import mask_pool + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config + +from .transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg import TwoLayer1DFrontendConfig, ModelConfig +from i6_experiments.users.rossenbach.experiments.rescale.tedlium2_standalone_2023.pytorch_networks.specaugment import ( + returnn_specaugment_by_length, +) + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class ESPNetPositionalEncoding(torch.nn.Module): + """ + Absolute positional encoding taken from ESPNet, reformatted in i6-style + https://github.com/espnet/espnet/blob/5d0758e2a7063b82d1f10a8ac2de98eb6cf8a352/espnet/nets/pytorch_backend/transformer/embedding.py#L35 + + :param d_model: Embedding dimension. + :param dropout_rate: Dropout rate. + :param max_len: Maximum input length. + """ + + def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000): + super(ESPNetPositionalEncoding, self).__init__() + self.d_model = d_model + self.xscale = math.sqrt(self.d_model) + self.dropout = torch.nn.Dropout(p=dropout_rate) + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, max_len)) + + def extend_pe(self, x: torch.Tensor): + """ + Reset the positional encodings. 
+ + :param x: + """ + if self.pe is not None: + if self.pe.size(1) >= x.size(1): + if self.pe.dtype != x.dtype or self.pe.device != x.device: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + pe = torch.zeros(x.size(1), self.d_model) + position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.pe = pe.to(device=x.device, dtype=x.dtype) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Add positional encoding. + + :param x: Input tensor [B, T, *] + :returns: Tensor with encoding and dropout applied [B, T, *] + """ + self.extend_pe(x) + x = x * self.xscale + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class TwoLayer1DFrontend(nn.Module): + """ + Convolutional Front-End using two 1-D Convolutions + + + - Contains Batch-Norm, but no activation functions. + - Applies absolute positional encoding on the output. + - With additional linear mapping + """ + + def __init__(self, model_cfg: TwoLayer1DFrontendConfig): + """ + :param model_cfg: model configuration for this module + """ + super().__init__() + + model_cfg.check_valid() + + self.cfg = model_cfg + + self.conv1 = nn.Conv1d( + in_channels=model_cfg.in_features, + out_channels=model_cfg.conv1_channels, + kernel_size=model_cfg.conv1_kernel_size, + stride=model_cfg.conv1_stride, + ) + self.conv2 = nn.Conv1d( + in_channels=model_cfg.conv1_channels, + out_channels=model_cfg.conv2_channels, + kernel_size=model_cfg.conv2_kernel_size, + stride=model_cfg.conv2_stride, + ) + + self.bn1 = nn.BatchNorm1d(num_features=model_cfg.conv1_channels) + self.bn2 = nn.BatchNorm1d(num_features=model_cfg.conv2_channels) + self.pos_encoding = ESPNetPositionalEncoding(model_cfg.conv2_channels, model_cfg.dropout) + + def forward(self, tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + T might be reduced to T' or T'' depending on stride of the layers + + stride is only allowed for the pool1 and pool2 operation. + other ops do not have stride configurable -> no update of mask sequence required but added anyway + + :param tensor: input tensor of shape [B,T,F] + :param sequence_mask: the sequence mask for the tensor + :return: torch.Tensor of shape [B,T",F'] and the shape of the sequence mask + """ + tensor = tensor.permute(0, 2, 1) # [B,T,F] -> [B,C,T] + + tensor = self.conv1(tensor) + tensor = self.bn1(tensor) + sequence_mask = mask_pool( + seq_mask=sequence_mask, + kernel_size=self.conv1.kernel_size[0], + stride=self.conv1.stride[0], + padding=self.conv1.padding[0], + ) + + tensor = self.conv2(tensor) + tensor = self.bn2(tensor) + sequence_mask = mask_pool( + sequence_mask, + kernel_size=self.conv2.kernel_size[0], + stride=self.conv2.stride[0], + padding=self.conv2.padding[0], + ) + + tensor = tensor.permute(0, 2, 1) # [B,C,T] -> [B, T, hidden] + tensor = self.pos_encoding(tensor) + + return tensor, sequence_mask + + def _calculate_dim(self) -> int: + return self.conv2.out_channels + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=TwoLayer1DFrontend, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=conformer_size, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.final_linear = nn.Linear(conformer_size, self.cfg.label_target_size + 1) # + CTC blank + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + + self.export_mode = False + + # initialize weights + self.apply(self._weight_init) + + @staticmethod + def _weight_init(module: torch.nn.Module): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + print("apply weight init for %s" % str(module)) + nn.init.xavier_uniform_(module.weight) + + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + + :param raw_audio: + :param raw_audio_len: + :return: logprobs [B, T, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with 
torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training: + audio_features_masked_2 = returnn_specaugment_by_length( + audio_features, + repeat_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + max_dim_time=self.cfg.specaug_config.max_dim_time, + num_repeat_feat=self.cfg.specaug_config.num_repeat_feat, + max_dim_feat=self.cfg.specaug_config.max_dim_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.final_dropout(conformer_out) + logits = self.final_linear(conformer_out) + + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(out_mask, dim=1) + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg.py new file mode 100644 index 000000000..f65ac2482 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/transparent_i6modelsV1_2x1D_frontend_xavierinit_cfg.py @@ -0,0 +1,95 @@ +""" +Trying to make the aligner more AppTek-Like + +Extended weight init code +""" + +from dataclasses import dataclass + + +from i6_models.assemblies.conformer.conformer_v1 
import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass +class TwoLayer1DFrontendConfig(ModelConfiguration): + """ + Attributes: + in_features: number of input features to module + conv1_channels: number of channels for first conv layer + conv2_channels: number of channels for second conv layer + """ + + in_features: int + conv1_channels: int + conv2_channels: int + conv1_kernel_size: int + conv1_stride: int + conv2_kernel_size: int + conv2_stride: int + dropout: float + + def check_valid(self): + pass + + def __post__init__(self): + super().__post_init__() + self.check_valid() + + @classmethod + def from_dict(cls, d): + d = d.copy() + return TwoLayer1DFrontendConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: TwoLayer1DFrontendConfig + specaug_config: SpecaugConfig + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = TwoLayer1DFrontendConfig.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v2_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v2_cfg.py new file mode 100644 index 000000000..95ddc7f40 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v2_cfg.py @@ -0,0 +1,34 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +from i6_models.config import ModelConfiguration + +@dataclass +class WhisperConfig(ModelConfiguration): + name: str + just_encoder: bool + finetune_layer: int + split_seq: bool + dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return WhisperConfig(**d) + + +@dataclass +class ModelConfig: + specauc_start_epoch: int + label_target_size: int + final_dropout: float + whisper_config: WhisperConfig + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["whisper_config"] = WhisperConfig.from_dict(d["whisper_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v5.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v5.py new file mode 100644 index 000000000..83755541d --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/conformer_0923/whisper_pretrained_v5.py @@ -0,0 +1,183 @@ +""" +v3: with fix to finetune layer numbers (range +1) +v4: change loading of whisper +v5: add checks for dimensions +""" + +import numpy as np +import torch +from torch import nn + + +from transformers import WhisperModel, WhisperFeatureExtractor, WhisperConfig + +from returnn.torch.context import get_run_ctx + +from .whisper_pretrained_v2_cfg import ModelConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.model_dict = None + + self.whisper_cfg = self.cfg.whisper_config + run_ctx = get_run_ctx() + if run_ctx.global_step == 0 and run_ctx.epoch == 1: + print("Load Whisper model parameters") + self.whisper_feature_extractor: WhisperFeatureExtractor = WhisperFeatureExtractor.from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + self.whisper: WhisperModel = WhisperModel.from_pretrained(f"openai/whisper-{self.whisper_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.whisper_feature_extractor = WhisperFeatureExtractor() + self.whisper = WhisperModel(WhisperConfig().from_pretrained( + f"openai/whisper-{self.whisper_cfg.name}", cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.whisper.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.whisper_cfg.finetune_layer + 1): + for name, param in self.whisper.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.whisper.encoder.named_parameters(): + if param.requires_grad: + print(name) + #self.upsampling_layer = torch.nn.ConvTranspose1d( + # in_channels=self.whisper.config.d_model, out_channels=512, kernel_size=5, stride=2, padding=1 + #) + + self.final_linear = nn.Linear(512, self.cfg.label_target_size + 1) # + CTC blank + # No particular weight init! 
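# --- Editorial sketch, not part of the committed setup ---
# A minimal, self-contained illustration of the freeze-then-unfreeze pattern used in the
# __init__ above: all Whisper parameters are frozen first, then only the last
# `finetune_layer` encoder blocks are re-enabled for training (the `range(1, n + 1)`
# indexing matches the "v3" fix mentioned in the module docstring). The toy encoder
# below is an assumption for illustration only; the real model uses the Hugging Face
# WhisperModel encoder loaded above.
from torch import nn

class _ToyEncoder(nn.Module):
    def __init__(self, num_layers: int = 4, dim: int = 8):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(dim, dim) for _ in range(num_layers)])

def _freeze_all_but_last(encoder: _ToyEncoder, finetune_layer: int) -> None:
    for param in encoder.parameters():
        param.requires_grad_(False)
    # unfreeze the last `finetune_layer` blocks, counting from the top
    for layer_num in range(1, finetune_layer + 1):
        for param in encoder.layers[-layer_num].parameters():
            param.requires_grad_(True)

if __name__ == "__main__":
    enc = _ToyEncoder(num_layers=4)
    _freeze_all_but_last(enc, finetune_layer=2)
    # expect only parameters of layers.2 and layers.3 to remain trainable
    print([name for name, p in enc.named_parameters() if p.requires_grad])
# --- end of editorial sketch ---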
+ + def forward( + self, + raw_audio: torch.Tensor, + raw_audio_len: torch.Tensor, + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :return: logprobs [B, T, #labels + blank] + """ + assert any(param.requires_grad for param in self.whisper.encoder.parameters()) or self.whisper_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio, dim=-1) + if squeezed_features.shape[1] > 160 * 3000: + squeezed_features2 = squeezed_features[:, 160 * 3000:] + squeezed_features2 = squeezed_features2.cpu().numpy() + features2 = self.whisper_feature_extractor(raw_speech=squeezed_features2, return_tensors="pt", + return_attention_mask=True, sampling_rate=16000) + features2 = features2.to(device="cuda" if torch.cuda.is_available() else "cpu") + audio_features2 = features2["input_features"] + attention_mask2 = features2["attention_mask"] + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features2 = self.whisper._mask_input_features(audio_features2, attention_mask=attention_mask2) + else: + input_features2 = audio_features2 + whisper_outputs2 = self.whisper.encoder(input_features=input_features2) + encoder_output2 = whisper_outputs2.last_hidden_state + encoder_output2 = self.final_dropout(encoder_output2) + logits2 = self.final_linear(encoder_output2) + + squeezed_features = squeezed_features.cpu().numpy() + features = self.whisper_feature_extractor(raw_speech=squeezed_features, return_tensors="pt", return_attention_mask=True, sampling_rate=16000) + features = features.to(device="cuda" if torch.cuda.is_available() else "cpu") + audio_features = features["input_features"] + attention_mask = features["attention_mask"] + #audio_features_masked_2 = torch.transpose(audio_features_masked_2, 1, 2) # B, F, T + # TODO: try to remove specagument for now + # TODO: fix dev set problems + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + input_features = self.whisper._mask_input_features(audio_features, attention_mask=attention_mask) + else: + input_features = audio_features + whisper_outputs = self.whisper.encoder(input_features=input_features) + encoder_output = whisper_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + logits = self.final_linear(encoder_output) + if squeezed_features.shape[1] > 160 * 3000: + logits = torch.cat((logits, logits2), dim=1) + attention_mask = torch.cat((attention_mask, attention_mask2), dim=1) + log_probs = torch.log_softmax(logits, dim=2) + + return log_probs, torch.sum(attention_mask, dim=1) // 2 + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + transposed_logprobs = torch.permute(logprobs, (1, 0, 2)) # CTC needs [T, B, F] + ctc_loss = nn.functional.ctc_loss( + transposed_logprobs, + labels, + input_lengths=audio_features_len, + target_lengths=labels_len, + blank=model.cfg.label_target_size, + reduction="sum", + zero_infinity=True, + ) + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="ctc", loss=ctc_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the 
ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py new file mode 100644 index 000000000..3012eee33 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_bpe_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + 
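# --- Editorial sketch, not part of the committed setup ---
# Illustration of how the "prior_file" loaded in this init hook is consumed in the
# forward_step below: the file (written by prior_finish_hook with np.savetxt) holds
# log-space label priors, one value per output label including blank, and the decoder
# input is corrected by a scaled subtraction in log space. Function name, shapes, and
# the example scale are assumptions for illustration.
import numpy as np
import torch

def apply_log_prior(logprobs: torch.Tensor, prior_file: str, prior_scale: float) -> torch.Tensor:
    """
    :param logprobs: [B, T, #labels + blank] log-softmax output of the acoustic model
    :param prior_file: text file with #labels + blank log-prior values
    :param prior_scale: weight of the prior correction, e.g. 0.3
    :return: prior-corrected log scores, same shape as logprobs
    """
    log_prior = torch.from_numpy(np.loadtxt(prior_file, dtype="float32"))  # [#labels + blank]
    return logprobs - prior_scale * log_prior  # broadcasts over [B, T]
# --- end of editorial sketch ---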
+ +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_experimental_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_experimental_phoneme_ctc.py new file mode 100644 index 000000000..ced6dd241 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_experimental_phoneme_ctc.py @@ -0,0 +1,137 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of 
the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + run_ctx.graph_model = None + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + if run_ctx.graph_model is None: + from torch.onnx import export + + dummy_data = torch.randn(3, 30000) + dummy_data_len = torch.IntTensor([30000, 20000, 15000]) + export( + model, + (dummy_data, dummy_data_len), + f="/var/tmp/some_model.onnx", + verbose=True, + input_names=["data", "data_len"], + output_names=["classes"], + dynamic_axes={ + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "classes": {0: "batch", 1: "time"}, + }, + opset_version=17, + ) + import onnxruntime as ort + + run_ctx.ort_session = ort.InferenceSession("/var/tmp/some_model.onnx", providers=["CPUExecutionProvider"]) + + am_start = time.time() + logprobs, audio_features_len = run_ctx.ort_session.run( + None, {"data": raw_audio.cpu().numpy(), "data_len": raw_audio_len.cpu().numpy()} + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: 
%s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_onnx_bpe_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_onnx_bpe_ctc.py new file mode 100644 index 000000000..b72af5fff --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_onnx_bpe_ctc.py @@ -0,0 +1,152 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +import os +from torch import nn +from typing import Union + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + run_ctx.ort_session = None + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + if run_ctx.ort_session is None: + from torch.onnx import export + + dummy_data = torch.randn(3, 30000, 1) + dummy_data_len = torch.IntTensor([30000, 20000, 15000]) + export( + model, + (dummy_data, dummy_data_len), + f="/var/tmp/some_model.onnx", + verbose=True, + input_names=["data", "data_len"], + 
output_names=["classes"], + dynamic_axes={ + "data": {0: "batch", 1: "time"}, + "data_len": {0: "batch"}, + "classes": {0: "batch", 1: "time"}, + }, + opset_version=17, + ) + import onnxruntime as ort + sess_options = ort.SessionOptions() + import logging + logging.info(f"Session CPUS: {os.getenv('SLURM_CPUS_PER_TASK')}") + print("Compiled Onnx model") + if os.getenv("SLURM_CPUS_PER_TASK") is not None: + sess_options.intra_op_num_threads = int(os.getenv("SLURM_CPUS_PER_TASK")) + run_ctx.ort_session = ort.InferenceSession("/var/tmp/some_model.onnx", providers=["CPUExecutionProvider"], sess_options=sess_options) + else: + print("Taking existing model.") + + am_start = time.time() + logprobs, audio_features_len = run_ctx.ort_session.run( + None, {"data": raw_audio.cpu().numpy(), "data_len": raw_audio_len.cpu().numpy().astype(np.int32)} + ) + + tags = data["seq_tag"] + + if isinstance(logprobs, torch.Tensor): + logprobs_cpu = logprobs.cpu() + else: + logprobs_cpu = torch.from_numpy(logprobs) + if isinstance(audio_features_len, torch.Tensor): + audio_features_len_cpu = audio_features_len.cpu() + else: + audio_features_len_cpu = torch.from_numpy(audio_features_len) + + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len_cpu) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py new file mode 100644 index 000000000..39d942e9b --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_phoneme_ctc.py @@ -0,0 +1,114 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + 
["[blank]", "[SILENCE]", "[UNK]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[SILENCE]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + tags = data["seq_tag"] + + logprobs_cpu = logprobs.cpu() + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len.cpu()) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_quantized_bpe_ctc.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_quantized_bpe_ctc.py new file mode 100644 index 000000000..8db276c0e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/flashlight_quantized_bpe_ctc.py @@ -0,0 +1,144 @@ +""" +Flashlight/Torchaudio CTC decoder and prior computation functions +""" + +import time +import 
numpy as np +import torch +import os +from torch import nn +from typing import Union + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + from torchaudio.models.decoder import ctc_decoder + import logging + print(kwargs) + + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + import subprocess + + if kwargs["arpa_lm"] is not None: + lm = subprocess.check_output(["cf", kwargs["arpa_lm"]]).decode().strip() + else: + lm = None + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + labels = vocab.labels + run_ctx.ctc_decoder = ctc_decoder( + lexicon=kwargs["lexicon"], + lm=lm, + lm_weight=kwargs["lm_weight"], + tokens=labels + ["[blank]"], + # "[SILENCE]" and "[UNK]" are not actually part of the vocab, + # but the decoder is happy as long they are defined in the token list + # even if they do not exist as label index in the softmax output, + blank_token="[blank]", + sil_token="[blank]", + unk_word="[unknown]", + nbest=1, + beam_size=kwargs["beam_size"], + beam_size_token=kwargs.get("beam_size_token", None), + beam_threshold=kwargs["beam_threshold"], + sil_score=kwargs.get("sil_score", 0.0), + word_score=kwargs.get("word_score", 0.0), + ) + run_ctx.labels = labels + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + if kwargs.get("prior_file", None): + run_ctx.prior = np.loadtxt(kwargs["prior_file"], dtype="float32") + run_ctx.prior_scale = kwargs["prior_scale"] + else: + run_ctx.prior = None + + run_ctx.running_audio_len_s = 0 + run_ctx.total_am_time = 0 + run_ctx.total_search_time = 0 + run_ctx.ort_session = None + run_ctx.quantized_model = kwargs.get("quantized_model", None) + print(f"Quantized model path: {run_ctx.quantized_model}") + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print( + "Total-AM-Time: %.2fs, AM-RTF: %.3f" + % (run_ctx.total_am_time, run_ctx.total_am_time / run_ctx.running_audio_len_s) + ) + print( + "Total-Search-Time: %.2fs, Search-RTF: %.3f" + % (run_ctx.total_search_time, run_ctx.total_search_time / run_ctx.running_audio_len_s) + ) + total_proc_time = run_ctx.total_am_time + run_ctx.total_search_time + print("Total-time: %.2f, Batch-RTF: %.3f" % (total_proc_time, total_proc_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + if run_ctx.quantized_model: + import onnxruntime as ort + sess_options = ort.SessionOptions() + import logging + logging.info(f"Session CPUS: {os.getenv('SLURM_CPUS_PER_TASK')}") + print("Compiled Onnx model") + if os.getenv("SLURM_CPUS_PER_TASK") is not None: + sess_options.intra_op_num_threads = int(os.getenv("SLURM_CPUS_PER_TASK")) + run_ctx.ort_session = ort.InferenceSession( + run_ctx.quantized_model, + providers=["CPUExecutionProvider"], + sess_options=sess_options + ) + else: + assert False, "Need quantized Model for this." 
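# --- Editorial sketch, not part of the committed setup ---
# Minimal sketch of the CPU onnxruntime session setup used in this forward_step.
# Pinning intra_op_num_threads to the SLURM allocation keeps onnxruntime from
# oversubscribing the node when several recognition jobs share a machine. The model
# path and the input names/shapes in the commented usage are assumptions that follow
# the export calls shown in the other decoder files ("data" [B, T, 1], "data_len" [B]).
import os
import onnxruntime as ort

def make_cpu_session(onnx_path: str) -> ort.InferenceSession:
    sess_options = ort.SessionOptions()
    slurm_cpus = os.getenv("SLURM_CPUS_PER_TASK")
    if slurm_cpus is not None:
        sess_options.intra_op_num_threads = int(slurm_cpus)
    return ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"], sess_options=sess_options)

# usage (hypothetical paths/shapes):
# session = make_cpu_session("model.onnx")
# logprobs, out_len = session.run(
#     None,
#     {"data": np.zeros((1, 16000, 1), dtype=np.float32), "data_len": np.array([16000], dtype=np.int32)},
# )
# --- end of editorial sketch ---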
+ + am_start = time.time() + logprobs, audio_features_len = run_ctx.ort_session.run( + None, {"data": raw_audio.cpu().numpy(), "data_len": raw_audio_len.cpu().numpy().astype(np.int32)} + ) + + tags = data["seq_tag"] + + if isinstance(logprobs, torch.Tensor): + logprobs_cpu = logprobs.cpu() + else: + logprobs_cpu = torch.from_numpy(logprobs) + if isinstance(audio_features_len, torch.Tensor): + audio_features_len_cpu = audio_features_len.cpu() + else: + audio_features_len_cpu = torch.from_numpy(audio_features_len) + + if run_ctx.blank_log_penalty is not None: + # assumes blank is last + logprobs_cpu[:, :, -1] -= run_ctx.blank_log_penalty + if run_ctx.prior is not None: + logprobs_cpu -= run_ctx.prior_scale * run_ctx.prior + + am_time = time.time() - am_start + run_ctx.total_am_time += am_time + + search_start = time.time() + hypothesis = run_ctx.ctc_decoder(logprobs_cpu, audio_features_len_cpu) + search_time = time.time() - search_start + run_ctx.total_search_time += search_time + + print("Batch-AM-Time: %.2fs, AM-RTF: %.3f" % (am_time, am_time / audio_len_batch)) + print("Batch-Search-Time: %.2fs, Search-RTF: %.3f" % (search_time, search_time / audio_len_batch)) + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time + search_time, (am_time + search_time) / audio_len_batch)) + + for hyp, tag in zip(hypothesis, tags): + words = hyp[0].words + sequence = " ".join([word for word in words if not word.startswith("[")]) + print(sequence) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(sequence))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py new file mode 100644 index 000000000..e4f795ee3 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/ctc/decoder/greedy_bpe_ctc_v2.py @@ -0,0 +1,61 @@ +""" +Greedy CTC decoder without any extras +""" + +import time +import numpy as np +import torch +from torch import nn + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + am_start = time.time() + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + batch_indices = [] + for lp, l in zip(logprobs, audio_features_len): + batch_indices.append(torch.unique_consecutive(torch.argmax(lp[:l], dim=-1), dim=0).detach().cpu().numpy()) + + am_time = time.time() - am_start + run_ctx.total_time += am_time + print("Batch-time: %.2f, Batch-RTF: %.3f" % (am_time, am_time / 
audio_len_batch)) + + tags = data["seq_tag"] + + for indices, tag in zip(batch_indices, tags): + print(indices) + sequence = [run_ctx.labels[idx] for idx in indices if idx < len(run_ctx.labels)] + sequence = [s for s in sequence if (not s.startswith("<") and not s.startswith("["))] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/legacy_feature_extraction.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/legacy_feature_extraction.py new file mode 100644 index 000000000..a6eb0bf1e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/legacy_feature_extraction.py @@ -0,0 +1,110 @@ +__all__ = ["LogMelFeatureExtractionV1", "LogMelFeatureExtractionV1Config"] + +from dataclasses import dataclass +from typing import Optional, Tuple + +from librosa import filters +import torch +from torch import nn + +from i6_models.config import ModelConfiguration + + +@dataclass +class LogMelFeatureExtractionV1Config(ModelConfiguration): + """ + Attributes: + sample_rate: audio sample rate in Hz + win_size: window size in seconds + hop_size: window shift in seconds + f_min: minimum filter frequency in Hz + f_max: maximum filter frequency in Hz + min_amp: minimum amplitude for safe log + num_filters: number of mel windows + center: centered STFT with automatic padding + """ + + sample_rate: int + win_size: float + hop_size: float + f_min: int + f_max: int + min_amp: float + num_filters: int + center: bool + n_fft: Optional[int] = None + + def __post_init__(self) -> None: + super().__post_init__() + assert self.f_max <= self.sample_rate // 2, "f_max can not be larger than half of the sample rate" + assert self.f_min > 0 and self.f_max > 0 and self.sample_rate > 0, "frequencies need to be positive" + assert self.win_size > 0 and self.hop_size > 0, "window settings need to be positive" + assert self.num_filters > 0, "number of filters needs to be positive" + assert self.hop_size <= self.win_size, "using a larger hop size than window size does not make sense" + if self.n_fft is None: + # if n_fft is not given, set n_fft to the window size (in samples) + self.n_fft = int(self.win_size * self.sample_rate) + else: + assert self.n_fft >= self.win_size * self.sample_rate, "n_fft cannot to be smaller than the window size" + + +class LogMelFeatureExtractionV1(nn.Module): + """ + Librosa-compatible log-mel feature extraction using log10. Does not use torchaudio. 
+ + Using it wrapped with torch.no_grad() is recommended if no gradient is needed + """ + + def __init__(self, cfg: LogMelFeatureExtractionV1Config): + super().__init__() + self.register_buffer("n_fft", torch.tensor(cfg.n_fft)) + self.register_buffer("window", torch.hann_window(int(cfg.win_size * cfg.sample_rate))) + self.register_buffer("hop_length", torch.tensor(int(cfg.hop_size * cfg.sample_rate))) + self.register_buffer("min_amp", torch.tensor(cfg.min_amp)) + self.center = cfg.center + self.register_buffer( + "mel_basis", + torch.tensor( + filters.mel( + sr=cfg.sample_rate, + n_fft=int(cfg.sample_rate * cfg.win_size), + n_mels=cfg.num_filters, + fmin=cfg.f_min, + fmax=cfg.f_max, + ) + ), + ) + + def forward(self, raw_audio, length) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param raw_audio: [B, T] + :param length in samples: [B] + :return features as [B,T,F] and length in frames [B] + """ + power_spectrum = ( + torch.abs( + torch.stft( + raw_audio, + n_fft=self.n_fft, + hop_length=self.hop_length, + window=self.window, + center=self.center, + pad_mode="constant", + return_complex=True, + ) + ) + ** 2 + ) + if len(power_spectrum.size()) == 2: + # For some reason torch.stft removes the batch axis for batch sizes of 1, so we need to add it again + power_spectrum = torch.unsqueeze(power_spectrum, 0) + melspec = torch.einsum("...ft,mf->...mt", power_spectrum, self.mel_basis) + log_melspec = torch.log10(torch.max(self.min_amp, melspec)) + feature_data = torch.transpose(log_melspec, 1, 2) + + if self.center: + length = (length // self.hop_length) + 1 + else: + length = ((length - self.n_fft) // self.hop_length) + 1 + + return feature_data, length.int() diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1.py new file mode 100644 index 000000000..521e8519f --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1.py @@ -0,0 +1,330 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Sets joiner dropout correctly +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import 
ConformerMHSAV1Config +from transformers import HubertModel, HubertConfig +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .hubert_pretrain_v1_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. 
+ """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths.to("cuda"), target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + self.hubert_cfg = self.cfg.hubert_cfg + run_ctx = get_run_ctx() + print("TEST", run_ctx.global_step, run_ctx.epoch) + if not run_ctx.global_step and run_ctx.epoch == 1: + print("Load Hubert model parameters") + self.hubert: HubertModel = HubertModel.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/") + else: + self.hubert: HubertModel = HubertModel( + HubertConfig.from_pretrained(f"facebook/hubert-{self.hubert_cfg.name}", + cache_dir="/work/asr4/hilmes/debug/whisper/transformers/")) + for param in self.hubert.parameters(): + param.requires_grad_(False) + for layer_num in range(1, self.hubert_cfg.finetune_layer + 1): + for name, param in self.hubert.encoder.layers[-layer_num].named_parameters(): + param.requires_grad_(True) + for name, param in self.hubert.encoder.named_parameters(): + if param.requires_grad: + print(name) + + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + dropout=self.cfg.joiner_dropout, + ) + self.final_dropout = nn.Dropout(p=self.cfg.final_dropout) + self.encoder_out_linear = nn.Linear(self.hubert.config.hidden_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + assert any(param.requires_grad for param in self.hubert.parameters()) or self.hubert_cfg.finetune_layer == 0 + squeezed_features = torch.squeeze(raw_audio, dim=-1) + hubert_outputs = self.hubert(input_values=squeezed_features) + encoder_output = hubert_outputs.last_hidden_state + encoder_output = self.final_dropout(encoder_output) + encoder_output = self.encoder_out_linear(encoder_output) + + encoder_out_lengths = self.hubert._get_feat_extract_output_lengths(raw_audio_len) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=encoder_output, + source_lengths=encoder_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1_cfg.py 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1_cfg.py new file mode 100644 index 000000000..bf1a3b04d --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/hubert_pretrain_v1_cfg.py @@ -0,0 +1,57 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass +class PredictorConfig(ModelConfiguration): + symbol_embedding_dim: int + emebdding_dropout: float + num_lstm_layers: int + lstm_hidden_dim: int + lstm_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return PredictorConfig(**d) + + +@dataclass +class HubertConfig(ModelConfiguration): + name: str + finetune_layer: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return HubertConfig(**d) + + +@dataclass +class ModelConfig: + predictor_config: PredictorConfig + specauc_start_epoch: int + label_target_size: int + final_dropout: float + joiner_dim: int + joiner_activation: str + joiner_dropout: float + hubert_cfg: HubertConfig + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["predictor_config"] = PredictorConfig.from_dict(d["predictor_config"]) + d["hubert_cfg"] = HubertConfig.from_dict(d["hubert_cfg"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4.py new file mode 100644 index 000000000..aa0e8bc07 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4.py @@ -0,0 +1,370 @@ +""" + +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] 
+ :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Args: + num_symbols (int): size of target token lexicon. + output_dim (int): feature dimension of each output sequence element. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_hidden_dim (int): output dimension of each LSTM layer. + lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0) + + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + hidden_size=lstm_hidden_dim, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. 
+ output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + JOINER_DIM = 512 + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + num_symbols=self.cfg.label_target_size + 1, + output_dim=JOINER_DIM, + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + lstm_dropout=0.0, + ) + self.joiner = Joiner( + input_dim=JOINER_DIM, + output_dim=self.cfg.label_target_size + 1, + ) + self.encoder_out_linear = nn.Linear(conformer_size, JOINER_DIM) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + 
raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py new file mode 100644 index 000000000..c5ff0e77c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_cfg.py @@ -0,0 +1,86 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent.py new file mode 100644 index 000000000..e76513f95 --- /dev/null +++ 
b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent.py @@ -0,0 +1,413 @@ +""" + +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Args: + num_symbols (int): size of target token lexicon. + output_dim (int): feature dimension of each output sequence element. + symbol_embedding_dim (int): dimension of each target token embedding. + num_lstm_layers (int): number of LSTM layers to instantiate. + lstm_hidden_dim (int): output dimension of each LSTM layer. + lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0) + + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + hidden_size=lstm_hidden_dim, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. 
+ lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + JOINER_DIM = 512 + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + num_symbols=self.cfg.label_target_size + 1, + output_dim=JOINER_DIM, + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + 
lstm_dropout=0.0, + ) + self.joiner = Joiner( + input_dim=JOINER_DIM, + output_dim=self.cfg.label_target_size + 1, + ) + self.encoder_out_linear = nn.Linear(conformer_size, JOINER_DIM) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! + + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with 
open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor.py new file mode 100644 index 000000000..bfc183d22 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor.py @@ -0,0 +1,414 @@ +""" + +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ModelConfig + +from ...legacy_feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__( + self, + num_symbols: int, + output_dim: int, + symbol_embedding_dim: int, + num_lstm_layers: int, + lstm_hidden_dim: int, + lstm_dropout: float = 0.0, + ) -> None: + super().__init__() + self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=symbol_embedding_dim if idx == 0 else lstm_hidden_dim, + hidden_size=lstm_hidden_dim, + batch_first=True, + ) + for idx in range(num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=lstm_dropout) + self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + # input_tb = input.permute(1, 0) + input_tb = input + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out, lengths, state_out + # return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + JOINER_DIM = 512 + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + num_symbols=self.cfg.label_target_size + 1, + output_dim=JOINER_DIM, + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + lstm_dropout=0.0, + ) + self.joiner = Joiner( + input_dim=JOINER_DIM, + output_dim=self.cfg.label_target_size + 1, + ) + self.encoder_out_linear = nn.Linear(conformer_size, JOINER_DIM) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
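
As a standalone sketch of the transparent combination computed in TransparentConformerEncoderV1.forward above: a softmax over learned per-layer scales mixes the frontend output with the output of every Conformer block. The tensor sizes below are made up for illustration only.

import torch
from torch import nn

num_layers, B, T, F = 3, 2, 10, 8                                 # illustrative sizes only
scales = nn.Parameter(torch.full((num_layers + 1,), 1.0 / (num_layers + 1)))
outputs = [torch.randn(B, T, F) for _ in range(num_layers + 1)]   # frontend output + one output per block

weights = torch.softmax(scales + 0.001, dim=0)                    # [num_layers + 1], sums to 1
combined = sum(w * x for w, x in zip(weights, outputs))           # weighted sum, still [B, T, F]
assert combined.shape == (B, T, F)
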
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + run_ctx = get_run_ctx() + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + if self.training and run_ctx.epoch < self.specaug_start_epoch: + predict_out *= 0 + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = 
data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5.py new file mode 100644 index 000000000..3af72a796 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5.py @@ -0,0 +1,357 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v5_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) + lstm_out = self.dropout(lstm_out) + state_out.append(lstm_state_out) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_cfg.py new file mode 100644 index 000000000..fe9b127e4 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_cfg.py @@ -0,0 +1,103 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class PredictorConfig(ModelConfiguration): + symbol_embedding_dim: int + num_lstm_layers: int + lstm_hidden_dim: int + lstm_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return PredictorConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + predictor_config: PredictorConfig + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + joiner_dim: int + joiner_activation: str + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + d["predictor_config"] = PredictorConfig.from_dict(d["predictor_config"]) + return ModelConfig(**d) diff --git 
a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_transparent.py new file mode 100644 index 000000000..6ebd64d03 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_transparent.py @@ -0,0 +1,402 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v5_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. + + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. 
+ + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). (Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. 
+ torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + 
cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! + + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = 
run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py new file mode 100644 index 000000000..9d44ffb0c --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_cfg.py @@ -0,0 +1,105 @@ +""" +Config for the base CTC models v4, including specaug start time +""" + +from dataclasses import dataclass + +import torch +from torch import nn +from typing import Callable, Optional, Type, Union + +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerBlockV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1Config +from i6_models.config import ModuleFactoryV1, ModelConfiguration + + +@dataclass(kw_only=True) +class VGG4LayerActFrontendV1Config_mod(VGG4LayerActFrontendV1Config): + activation_str: str = "" + activation: Optional[Union[nn.Module, Callable[[torch.Tensor], torch.Tensor]]] = None + + @classmethod + def from_dict(cls, d): + d = d.copy() + activation_str = d.pop("activation_str") + if activation_str == "ReLU": + from torch.nn import ReLU + + activation = ReLU() + else: + assert False, "Unsupported activation %s" % d["activation_str"] + d["activation"] = activation + return VGG4LayerActFrontendV1Config(**d) + + +@dataclass +class PredictorConfig(ModelConfiguration): + symbol_embedding_dim: int + emebdding_dropout: float + num_lstm_layers: int + lstm_hidden_dim: int + lstm_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + return PredictorConfig(**d) + + +@dataclass +class ConformerEncoderV1Config(ModelConfiguration): + """ + Attributes: + num_layers: Number of conformer layers in the conformer encoder + frontend: A pair of ConformerFrontend and corresponding config + block_cfg: Configuration for ConformerBlockV1 + """ + + num_layers: int + + # nested configurations + frontend: ModuleFactoryV1 + block_cfg: ConformerBlockV1Config + + +@dataclass +class SpecaugConfig(ModelConfiguration): + repeat_per_n_frames: int + max_dim_time: int + num_repeat_feat: int + max_dim_feat: int + + @classmethod + def from_dict(cls, d): + d = d.copy() + return SpecaugConfig(**d) + + +@dataclass +class ModelConfig: + frontend_config: VGG4LayerActFrontendV1Config + predictor_config: PredictorConfig + specaug_config: SpecaugConfig + specauc_start_epoch: int + label_target_size: int + conformer_size: int + num_layers: int + num_heads: int + ff_dim: int + 
att_weights_dropout: float + conv_dropout: float + ff_dropout: float + mhsa_dropout: float + conv_kernel_size: int + final_dropout: float + joiner_dim: int + joiner_activation: str + joiner_dropout: float + + @classmethod + def from_dict(cls, d): + d = d.copy() + d["frontend_config"] = VGG4LayerActFrontendV1Config_mod.from_dict(d["frontend_config"]) + d["specaug_config"] = SpecaugConfig.from_dict(d["specaug_config"]) + d["predictor_config"] = PredictorConfig.from_dict(d["predictor_config"]) + return ModelConfig(**d) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_transparent.py new file mode 100644 index 000000000..cc1a499ca --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_transparent.py @@ -0,0 +1,408 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Has a bug where joiner dropout is not set +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
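+    # Note: the v6 config defines joiner_dropout, but it is not passed to Joiner()
+    # above, so the joiner runs without dropout. This is the bug called out in the
+    # module docstring ("Has a bug where joiner dropout is not set"); the v7 networks
+    # forward dropout=self.cfg.joiner_dropout explicitly.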
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7.py new file mode 100644 index 000000000..d18869944 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7.py @@ -0,0 +1,366 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Sets joiner dropout correctly +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. 
+ """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = ConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + dropout=self.cfg.joiner_dropout, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent.py new file mode 100644 index 000000000..26a8c8d94 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent.py @@ -0,0 +1,409 @@ +""" +Modified from v4 with proper configuration for the predictor and using i6models feature extraction + +Sets joiner dropout correctly +""" + +import numpy as np +import torch +import torchaudio +from torch import nn +from typing import List, Optional, Tuple + +from i6_models.parts.conformer.norm import LayerNormNC +from i6_models.assemblies.conformer.conformer_v1 import ConformerEncoderV1Config +from i6_models.assemblies.conformer.conformer_v1 import ConformerBlockV1Config, ConformerEncoderV1, ConformerBlockV1 +from i6_models.config import ModuleFactoryV1 +from i6_models.parts.frontend.vgg_act import VGG4LayerActFrontendV1 + +from i6_models.parts.conformer.convolution import ConformerConvolutionV1Config +from i6_models.parts.conformer.feedforward import ConformerPositionwiseFeedForwardV1Config +from i6_models.parts.conformer.mhsa import ConformerMHSAV1Config +from i6_models.primitives.specaugment import specaugment_v1_by_length +from i6_models.primitives.feature_extraction import LogMelFeatureExtractionV1, LogMelFeatureExtractionV1Config + +from returnn.torch.context import get_run_ctx + +from .i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ModelConfig, PredictorConfig + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Predictor(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) prediction network. 
+ + Taken from torchaudio + """ + + def __init__(self, cfg: PredictorConfig, label_target_size: int, output_dim: int) -> None: + """ + + :param cfg: model configuration for the predictor + :param label_target_size: shared value from model + :param output_dim: shared value from model + """ + super().__init__() + self.embedding = torch.nn.Embedding(label_target_size, cfg.symbol_embedding_dim) + self.embedding_dropout = nn.Dropout(cfg.emebdding_dropout) + self.input_layer_norm = torch.nn.LayerNorm(cfg.symbol_embedding_dim) + self.lstm_layers = torch.nn.ModuleList( + [ + nn.LSTM( + input_size=cfg.symbol_embedding_dim if idx == 0 else cfg.lstm_hidden_dim, + hidden_size=cfg.lstm_hidden_dim, + ) + for idx in range(cfg.num_lstm_layers) + ] + ) + self.dropout = torch.nn.Dropout(p=cfg.lstm_dropout) + self.linear = torch.nn.Linear(cfg.lstm_hidden_dim, output_dim) + self.output_layer_norm = torch.nn.LayerNorm(output_dim) + + self.lstm_dropout = cfg.lstm_dropout + + def forward( + self, + input: torch.Tensor, + lengths: torch.Tensor, + state: Optional[List[List[torch.Tensor]]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + r"""Forward pass. + + B: batch size; + U: maximum sequence length in batch; + D: feature dimension of each input sequence element. + + Args: + input (torch.Tensor): target sequences, with shape `(B, U)` and each element + mapping to a target symbol, i.e. in range `[0, num_symbols)`. + lengths (torch.Tensor): with shape `(B,)` and i-th element representing + number of valid frames for i-th batch element in ``input``. + state (List[List[torch.Tensor]] or None, optional): list of lists of tensors + representing internal state generated in preceding invocation + of ``forward``. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]): + torch.Tensor + output encoding sequences, with shape `(B, U, output_dim)` + torch.Tensor + output lengths, with shape `(B,)` and i-th element representing + number of valid elements for i-th batch element in output encoding sequences. + List[List[torch.Tensor]] + output states; list of lists of tensors + representing internal state generated in current invocation of ``forward``. + """ + input_tb = input.permute(1, 0) + embedding_out = self.embedding(input_tb) + embedding_out = self.embedding_dropout(embedding_out) + input_layer_norm_out = self.input_layer_norm(embedding_out) + + lstm_out = input_layer_norm_out + state_out: List[List[torch.Tensor]] = [] + for layer_idx, lstm in enumerate(self.lstm_layers): + lstm_out, lstm_state_out = lstm( + lstm_out, None if state is None else [s.permute(1, 0, 2) for s in state[layer_idx]] + ) + lstm_out = self.dropout(lstm_out) + state_out.append([s.permute(1, 0, 2) for s in lstm_state_out]) + + linear_out = self.linear(lstm_out) + output_layer_norm_out = self.output_layer_norm(linear_out) + return output_layer_norm_out.permute(1, 0, 2), lengths, state_out + + +class Joiner(torch.nn.Module): + r"""Recurrent neural network transducer (RNN-T) joint network. + + Args: + input_dim (int): source and target input dimension. + output_dim (int): output dimension. + activation (str, optional): activation function to use in the joiner. + Must be one of ("relu", "tanh"). 
(Default: "relu") + + Taken directly from torchaudio + """ + + def __init__(self, input_dim: int, output_dim: int, activation: str = "relu", dropout: float = 0.0) -> None: + super().__init__() + self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) + self.dropout = nn.Dropout(p=dropout) + if activation == "relu": + self.activation = torch.nn.ReLU() + elif activation == "tanh": + self.activation = torch.nn.Tanh() + else: + raise ValueError(f"Unsupported activation {activation}") + + def forward( + self, + source_encodings: torch.Tensor, + source_lengths: torch.Tensor, + target_encodings: torch.Tensor, + target_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r"""Forward pass for training. + + B: batch size; + T: maximum source sequence length in batch; + U: maximum target sequence length in batch; + D: dimension of each source and target sequence encoding. + + Args: + source_encodings (torch.Tensor): source encoding sequences, with + shape `(B, T, D)`. + source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``source_encodings``. + target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`. + target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing + valid sequence length of i-th batch element in ``target_encodings``. + + Returns: + (torch.Tensor, torch.Tensor, torch.Tensor): + torch.Tensor + joint network output, with shape `(B, T, U, output_dim)`. + torch.Tensor + output source lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 1 for i-th batch element in joint network output. + torch.Tensor + output target lengths, with shape `(B,)` and i-th element representing + number of valid elements along dim 2 for i-th batch element in joint network output. + """ + joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() + joint_encodings = self.dropout(joint_encodings) + activation_out = self.activation(joint_encodings) + output = self.linear(activation_out) + return output, source_lengths, target_lengths + + +class TransparentConformerEncoderV1(nn.Module): + """ + Implementation of the convolution-augmented Transformer (short Conformer), as in the original publication. + + The model consists of a frontend and a stack of N conformer blocks. + C.f. 
https://arxiv.org/pdf/2005.08100.pdf + """ + + def __init__(self, cfg: ConformerEncoderV1Config): + """ + :param cfg: conformer encoder configuration with subunits for frontend and conformer blocks + """ + super().__init__() + + self.frontend = cfg.frontend() + self.module_list = torch.nn.ModuleList([ConformerBlockV1(cfg.block_cfg) for _ in range(cfg.num_layers)]) + self.transparent_scales = nn.Parameter(torch.empty((cfg.num_layers + 1,))) + + torch.nn.init.constant_(self.transparent_scales, 1 / (cfg.num_layers + 1)) + + def forward(self, data_tensor: torch.Tensor, sequence_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param data_tensor: input tensor of shape [B, T', F] + :param sequence_mask: mask tensor where 0 defines positions within the sequence and 1 outside, shape: [B, T'] + :return: (output, out_seq_mask) + where output is torch.Tensor of shape [B, T, F'], + out_seq_mask is a torch.Tensor of shape [B, T] + + F: input feature dim, F': internal and output feature dim + T': data time dim, T: down-sampled time dim (internal time dim) + """ + x, sequence_mask = self.frontend(data_tensor, sequence_mask) # [B, T, F'] + + transparent_weights = torch.softmax(self.transparent_scales + 0.001, dim=0) + print(transparent_weights) + + final = transparent_weights[0] * x + for i, module in enumerate(self.module_list): + x = module(x, sequence_mask) # [B, T, F'] + final = final + (transparent_weights[i + 1] * x) + return final, sequence_mask + + +class Model(torch.nn.Module): + def __init__(self, model_config_dict, **kwargs): + super().__init__() + self.cfg = ModelConfig.from_dict(model_config_dict) + fe_config = LogMelFeatureExtractionV1Config( + sample_rate=16000, + win_size=0.025, + hop_size=0.01, + f_min=60, + f_max=7600, + min_amp=1e-10, + num_filters=80, + center=False, + ) + frontend_config = self.cfg.frontend_config + conformer_size = self.cfg.conformer_size + conformer_config = ConformerEncoderV1Config( + num_layers=self.cfg.num_layers, + frontend=ModuleFactoryV1(module_class=VGG4LayerActFrontendV1, cfg=frontend_config), + block_cfg=ConformerBlockV1Config( + ff_cfg=ConformerPositionwiseFeedForwardV1Config( + input_dim=conformer_size, + hidden_dim=self.cfg.ff_dim, + dropout=self.cfg.ff_dropout, + activation=nn.functional.silu, + ), + mhsa_cfg=ConformerMHSAV1Config( + input_dim=conformer_size, + num_att_heads=self.cfg.num_heads, + att_weights_dropout=self.cfg.att_weights_dropout, + dropout=self.cfg.mhsa_dropout, + ), + conv_cfg=ConformerConvolutionV1Config( + channels=conformer_size, + kernel_size=self.cfg.conv_kernel_size, + dropout=self.cfg.conv_dropout, + activation=nn.functional.silu, + norm=LayerNormNC(conformer_size), + ), + ), + ) + + self.feature_extraction = LogMelFeatureExtractionV1(cfg=fe_config) + self.conformer = TransparentConformerEncoderV1(cfg=conformer_config) + self.predictor = Predictor( + cfg=self.cfg.predictor_config, + label_target_size=self.cfg.label_target_size + 1, # ctc blank added + output_dim=self.cfg.joiner_dim, + ) + self.joiner = Joiner( + input_dim=self.cfg.joiner_dim, + output_dim=self.cfg.label_target_size + 1, + activation=self.cfg.joiner_activation, + dropout=self.cfg.joiner_dropout, + ) + self.encoder_out_linear = nn.Linear(conformer_size, self.cfg.joiner_dim) + self.specaug_start_epoch = self.cfg.specauc_start_epoch + + self.loss = torchaudio.transforms.RNNTLoss(reduction="sum", clamp=1.0) + # No particular weight init! 
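+    # Shape flow for forward(): raw_audio [B, T, 1] -> log-mel features [B, T', 80]
+    # -> transparent conformer encoder + encoder_out_linear -> [B, T'', joiner_dim];
+    # the predictor maps the blank-prepended labels [B, N+1] to [B, N+1, joiner_dim];
+    # the joiner broadcasts both to logits of shape [B, T'', N+1, label_target_size + 1].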
+ + def forward( + self, raw_audio: torch.Tensor, raw_audio_len: torch.Tensor, labels: torch.Tensor, labels_len: torch.Tensor + ): + """ + :param raw_audio: Audio samples as [B, T, 1] + :param raw_audio_len: length of T as [B] + :param labels: [B, N] + :param labels_len: length of N as [B] + :return: logprobs [B, T + N, #labels + blank] + """ + + squeezed_features = torch.squeeze(raw_audio) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, raw_audio_len) + + run_ctx = get_run_ctx() + if self.training and run_ctx.epoch >= self.specaug_start_epoch: + audio_features_masked_2 = specaugment_v1_by_length( + audio_features, + time_min_num_masks=2, + time_max_mask_per_n_frames=self.cfg.specaug_config.repeat_per_n_frames, + time_mask_max_size=self.cfg.specaug_config.max_dim_time, + freq_min_num_masks=2, + freq_mask_max_size=self.cfg.specaug_config.max_dim_feat, + freq_max_num_masks=self.cfg.specaug_config.num_repeat_feat, + ) + else: + audio_features_masked_2 = audio_features + + conformer_in = audio_features_masked_2 + # create the mask for the conformer input + mask = mask_tensor(conformer_in, audio_features_len) + + conformer_out, out_mask = self.conformer(conformer_in, mask) + conformer_out = self.encoder_out_linear(conformer_out) + conformer_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + predict_out, _, _ = self.predictor( + input=labels, + lengths=labels_len, + ) + + output_logits, src_len, tgt_len = self.joiner( + source_encodings=conformer_out, + source_lengths=conformer_out_lengths, + target_encodings=predict_out, + target_lengths=labels_len, + ) # output is [B, T, N, #vocab] + + return output_logits, src_len + + +def train_step(*, model: Model, data, run_ctx, **kwargs): + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"].to("cpu") # [B], cpu transfer needed only for Mini-RETURNN + + labels = data["labels"] # [B, N] (sparse) + labels_len = data["labels:size1"] # [B, N] + + prepended_targets = labels.new_empty([labels.size(0), labels.size(1) + 1]) + prepended_targets[:, 1:] = labels + prepended_targets[:, 0] = model.cfg.label_target_size # blank is last index + prepended_target_lengths = labels_len + 1 + + logits, audio_features_len = model( + raw_audio=raw_audio, raw_audio_len=raw_audio_len, labels=prepended_targets, labels_len=prepended_target_lengths + ) + + rnnt_loss = model.loss( + logits=logits, + logit_lengths=audio_features_len.to(dtype=torch.int32), + targets=labels, + target_lengths=labels_len.to(dtype=torch.int32), + ) + + num_phonemes = torch.sum(labels_len) + run_ctx.mark_as_loss(name="rnnt", loss=rnnt_loss, inv_norm_factor=num_phonemes) + + +def prior_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.sum_probs = None + run_ctx.sum_frames = 0 + + +def prior_finish_hook(run_ctx, **kwargs): + all_frames = run_ctx.sum_frames.detach().cpu().numpy() + all_probs = run_ctx.sum_probs.detach().cpu().numpy() + average_probs = all_probs / all_frames + log_average_probs = np.log(average_probs) + print("Prior sum in std-space (should be close to 1.0):", np.sum(average_probs)) + with open("prior.txt", "w") as f: + np.savetxt(f, log_average_probs, delimiter=" ") + print("Saved prior in prior.txt in +log space.") + + +def prior_step(*, model: Model, data, run_ctx, **kwargs): + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + logprobs, 
audio_features_len = model( + raw_audio=raw_audio, + raw_audio_len=raw_audio_len, + ) + + probs = torch.exp(logprobs) + run_ctx.sum_frames = run_ctx.sum_frames + torch.sum(audio_features_len) + if run_ctx.sum_probs is None: + run_ctx.sum_probs = torch.sum(probs, dim=(0, 1)) + else: + run_ctx.sum_probs += torch.sum(probs, dim=(0, 1)) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py new file mode 100644 index 000000000..386d5dfb0 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/experimental_rnnt_decoder.py @@ -0,0 +1,153 @@ +""" +Experimental RNNT decoder +""" + +from typing import Callable, Dict, List, Optional, Tuple +import time +import numpy as np +import torch +from torch import nn + +from torchaudio.models import RNNT +from .rnnt_beam_search import ModifiedRNNTBeamSearch + +import torch + + +def mask_tensor(tensor: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor: + """ + mask a tensor with a "positive" mask (boolean true means position is used) + + This function is traceable. + + :param tensor: [B,T,....] + :param seq_len: [B] + :return: [B,T] + """ + seq_len = seq_len.to(device=tensor.device) + r = torch.arange(tensor.shape[1], device=tensor.device) # [T] + seq_mask = torch.less(r[None, :], seq_len[:, None]) # broadcast to [B,T] + return seq_mask + + +class Transcriber(nn.Module): + def __init__(self, feature_extraction: nn.Module, encoder: nn.Module, mapping: nn.Module): + super().__init__() + self.feature_extraction = feature_extraction + self.encoder = encoder + self.mapping = mapping + + def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + + :param input: + :param lengths: + :return: + """ + + squeezed_features = torch.squeeze(input) + with torch.no_grad(): + audio_features, audio_features_len = self.feature_extraction(squeezed_features, lengths) + + mask = mask_tensor(audio_features, audio_features_len) + + encoder_out, out_mask = self.encoder(audio_features, mask) + encoder_out = self.mapping(encoder_out) + encoder_out_lengths = torch.sum(out_mask, dim=1) # [B, T] -> [B] + + return encoder_out, encoder_out_lengths + + def infer( + self, + input: torch.Tensor, + lengths: torch.Tensor, + states: Optional[List[List[torch.Tensor]]], + ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]: + output, out_lengths = self.forward(input, lengths) + return output, out_lengths, [[]] + + +def forward_init_hook(run_ctx, **kwargs): + # we are storing durations, but call it output.hdf to match + # the default output of the ReturnnForwardJob + run_ctx.recognition_file = open("search_out.py", "wt") + run_ctx.recognition_file.write("{\n") + + from returnn.datasets.util.vocabulary import Vocabulary + + vocab = Vocabulary.create_vocab(vocab_file=kwargs["returnn_vocab"], unknown_label=None) + run_ctx.labels = vocab.labels + + run_ctx.rnnt_decoder = None + run_ctx.beam_size = kwargs["beam_size"] + + run_ctx.blank_log_penalty = kwargs.get("blank_log_penalty", None) + + run_ctx.batched_encoder_decoding 
= kwargs.get("batched_encoder_decoding", False) + + run_ctx.running_audio_len_s = 0 + run_ctx.total_time = 0 + + +def forward_finish_hook(run_ctx, **kwargs): + run_ctx.recognition_file.write("}\n") + run_ctx.recognition_file.close() + + print("Total-time: %.2f, Batch-RTF: %.3f" % (run_ctx.total_time, run_ctx.total_time / run_ctx.running_audio_len_s)) + + +def forward_step(*, model, data, run_ctx, **kwargs): + + if run_ctx.rnnt_decoder is None: + print("create RNNT model...") + rnnt_model = RNNT( + transcriber=Transcriber( + feature_extraction=model.feature_extraction, encoder=model.conformer, mapping=model.encoder_out_linear + ), + predictor=model.predictor, + joiner=model.joiner, + ) + run_ctx.rnnt_decoder = ModifiedRNNTBeamSearch( + model=rnnt_model, + blank=model.cfg.label_target_size, + blank_penalty=run_ctx.blank_log_penalty, + ) + print("done!") + + raw_audio = data["raw_audio"] # [B, T', F] + raw_audio_len = data["raw_audio:size1"] # [B] + + audio_len_batch = torch.sum(raw_audio_len).detach().cpu().numpy() / 16000 + run_ctx.running_audio_len_s += audio_len_batch + + start = time.time() + tags = data["seq_tag"] + + hyps = [] + + if run_ctx.batched_encoder_decoding: + batched_hypotheses = run_ctx.rnnt_decoder.forward_semi_batched( + input=raw_audio, + length=raw_audio_len, + beam_width=run_ctx.beam_size, + ) + hyps = [hypothesis[0][0][:-1] for hypothesis in batched_hypotheses] # exclude last sentence end token + else: + for i in range(raw_audio.shape[0]): + hypothesis, states = run_ctx.rnnt_decoder.infer( + input=raw_audio[[i]], + length=raw_audio_len[[i]], + beam_width=run_ctx.beam_size, + ) + hyps.append(hypothesis[0][0][:-1]) # exclude last sentence end token + + total_time = time.time() - start + run_ctx.total_time += total_time + + print("Batch-time: %.2f, Batch-RTF: %.3f" % (total_time, total_time / audio_len_batch)) + + for hyp, tag in zip(hyps, tags): + sequence = [run_ctx.labels[idx] for idx in hyp if idx < len(run_ctx.labels)] + text = " ".join(sequence).replace("@@ ", "") + print(text) + run_ctx.recognition_file.write("%s: %s,\n" % (repr(tag), repr(text))) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/rnnt_beam_search.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/rnnt_beam_search.py new file mode 100644 index 000000000..64fe7ffd1 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/rnnt/decoder/rnnt_beam_search.py @@ -0,0 +1,93 @@ +import torch +from typing import Optional, Callable, List +from torchaudio.models.rnnt_decoder import RNNTBeamSearch, RNNT, Hypothesis, _get_hypo_predictor_out + + +class ModifiedRNNTBeamSearch(RNNTBeamSearch): + r"""Beam search decoder for RNN-T model. + + Modified with blank penalty + + See Also: + * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model. + + Args: + model (RNNT): RNN-T model to use. + blank (int): index of blank token in vocabulary. + temperature (float, optional): temperature to apply to joint network output. + Larger values yield more uniform samples. (Default: 1.0) + hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score + for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns + hypothesis score normalized by token sequence length. (Default: None) + step_max_tokens (int, optional): maximum number of tokens to emit per input time step. 
(Default: 100) + blank_penalty: blank penalty in log space + """ + + def __init__( + self, + model: RNNT, + blank: int, + temperature: float = 1.0, + hypo_sort_key: Optional[Callable[[Hypothesis], float]] = None, + step_max_tokens: int = 100, + blank_penalty: Optional[float] = None, + ) -> None: + super().__init__( + model=model, + blank=blank, + temperature=temperature, + hypo_sort_key=hypo_sort_key, + step_max_tokens=step_max_tokens, + ) + self.blank_penalty = blank_penalty + + def _gen_next_token_probs( + self, enc_out: torch.Tensor, hypos: List[Hypothesis], device: torch.device + ) -> torch.Tensor: + one_tensor = torch.tensor([1], device=device) + predictor_out = torch.stack([_get_hypo_predictor_out(h) for h in hypos], dim=0) + joined_out, _, _ = self.model.join( + enc_out, + one_tensor, + predictor_out, + torch.tensor([1] * len(hypos), device=device), + ) # [beam_width, 1, 1, num_tokens] + joined_out = torch.nn.functional.log_softmax(joined_out / self.temperature, dim=3) + + if self.blank_penalty is not None: + # assumes blank is last + # joined_out[:, :, :, self.blank] -= self.blank_penalty.to(device=joined_out.device) + joined_out[:, :, :, self.blank] -= self.blank_penalty + + return joined_out[:, 0, 0] + + def forward_semi_batched( + self, input: torch.Tensor, length: torch.Tensor, beam_width: int + ) -> List[List[Hypothesis]]: + r"""Performs beam search for the given input sequence. + + T: number of frames; + D: feature dimension of each frame. + + Args: + input (torch.Tensor): sequence of input frames, with shape (B, T, D). + length (torch.Tensor): number of valid frames in input + sequence, (B,). + beam_width (int): beam size to use during search. + + Returns: + List[Hypothesis]: top-``beam_width`` hypotheses found by beam search. + """ + if input.dim() != 3: + raise ValueError("input must be of shape (B, T, D)") + + if length.dim() != 1: + raise ValueError("length must be of shape (B,)") + + enc_out_batched, _ = self.model.transcribe(input, length) + + search_outputs = [] + for enc_out in enc_out_batched: + search_outputs.append(self._search(enc_out.unsqueeze(0), None, beam_width)) + + return search_outputs diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/specaugment.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/specaugment.py new file mode 100644 index 000000000..bff395505 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/pytorch_networks/specaugment.py @@ -0,0 +1,81 @@ +import torch + + +def _mask(tensor, batch_axis, axis, pos, max_amount): + batch_dim = tensor.shape[batch_axis] + dim = tensor.shape[axis] + amount = torch.randint(low=1, high=max_amount + 1, size=(batch_dim,), dtype=torch.int32).to(device=tensor.device) + pos2 = torch.min(pos + amount, torch.tensor([dim] * batch_dim).to(device=tensor.device)) + idxs = torch.arange(0, dim).to(device=tensor.device).unsqueeze(0) # [1,dim] + pos_bc = pos.unsqueeze(1) # [B,1] + pos2_bc = pos2.unsqueeze(1) # [B,1] + cond = torch.logical_and(torch.greater_equal(idxs, pos_bc), torch.less(idxs, pos2_bc)) # [B,dim] + if batch_axis > axis: + cond = cond.transpose(0, 1) # [dim,B] + cond = torch.reshape( + cond, shape=[tensor.shape[i] if i in (batch_axis, axis) else 1 for i in range(len(tensor.shape))] + ) + tensor = torch.where(cond, 0.0, tensor) + return tensor + + +def _random_mask(tensor, batch_axis, axis, min_num, max_num, max_dims): + batch_dim = tensor.shape[batch_axis] + if min_num >= max_num: + num_masks = 
torch.ones((batch_dim,), dtype=torch.int64) * min_num + else: + num_masks = torch.randint(min_num, max_num, size=(batch_dim,)) # [B] + max_num_masks = num_masks.max().item() + z = -torch.log(-torch.log(torch.rand((batch_dim, tensor.shape[axis])).to(device=tensor.device))) # [B,dim] + _, indices = torch.topk(z, max_num_masks, dim=1) + + # Make num_masks broadcastable to shape of tensor for torch.where. + for i in range(tensor.dim() - 1): + if i < batch_axis: + num_masks = num_masks.unsqueeze(0) + else: + num_masks = num_masks.unsqueeze(-1) + + num_masks = num_masks.to(device=tensor.device) + + for i in range(max_num_masks): + tensor = torch.where(i < num_masks, _mask(tensor, batch_axis, axis, indices[:, i], max_dims), tensor) + + return tensor + + +def returnn_specaugment(tensor: torch.Tensor, time_num_masks, time_mask_max_size, freq_num_masks, freq_mask_max_size): + """ + Returnn like specaugment from legacy rossenbach/zeineldeen attention setups (usually called specaugment_v2 or so) + + :param tensor: + :param time_num_masks: + :param time_mask_max_size: + :param freq_num_masks: + :param freq_mask_max_size: + :return: + """ + assert len(tensor.shape) == 3 + tensor = _random_mask(tensor, 0, 1, 2, time_num_masks, time_mask_max_size) # time masking + tensor = _random_mask(tensor, 0, 2, 2, freq_num_masks, freq_mask_max_size) # freq masking + return tensor + + +def returnn_specaugment_by_length(audio_features, repeat_per_n_frames, max_dim_time, num_repeat_feat, max_dim_feat): + """ + like returnn_specaugment, but with length adaptive num of time masks + + :param audio_features: + :param repeat_per_n_frames: + :param max_dim_time: + :param num_repeat_feat: + :param max_dim_feat: + :return: + """ + return returnn_specaugment( + audio_features, + time_num_masks=audio_features.size(1) // repeat_per_n_frames, + time_mask_max_size=max_dim_time, + freq_num_masks=num_repeat_feat, + freq_mask_max_size=max_dim_feat, + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/__init__.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/config.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/config.py new file mode 100644 index 000000000..8eda0fd9e --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/config.py @@ -0,0 +1,151 @@ +import copy +from typing import Any, Dict + +from i6_core.returnn.config import ReturnnConfig, CodeWrapper + +from i6_experiments.common.setups.returnn_pytorch.serialization import ( + Collection as TorchCollection, +) +from i6_experiments.common.setups.serialization import Import +from ..data import TrainingDatasets +from ..flashlight_phon_ctc.serializer import get_pytorch_serializer_v3, PACKAGE + + +def get_training_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + use_speed_perturbation=False, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: 
RETURNN training config + """ + + # changing these does not change the hash + post_config = { + "cleanup_old_models": True, + "stop_on_nonfinite_train_score": True, # this might break now with True + "num_workers_per_gpu": 2, + } + + base_config = { + "max_seqs": 60, + ############# + "train": copy.deepcopy(training_datasets.train.as_returnn_opts()), + "dev": training_datasets.cv.as_returnn_opts(), + "eval_datasets": {"devtrain": training_datasets.devtrain.as_returnn_opts()}, + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, net_args=net_args, debug=debug, use_custom_engine=use_custom_engine + ) + python_prolog = None + if use_speed_perturbation: + prolog_serializer = TorchCollection( + serializer_objects=[ + Import( + code_object_path=PACKAGE + ".dataset_code.speed_perturbation.legacy_speed_perturbation", + unhashed_package_root=PACKAGE, + ) + ] + ) + python_prolog = [prolog_serializer] + config["train"]["datasets"]["zip_dataset"]["audio"]["pre_process"] = CodeWrapper("legacy_speed_perturbation") + + returnn_config = ReturnnConfig( + config=config, post_config=post_config, python_prolog=python_prolog, python_epilog=[serializer] + ) + return returnn_config + + +def get_prior_config( + training_datasets: TrainingDatasets, + network_module: str, + net_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 50000 * 160, + "max_seqs": 60, + ############# + "forward": training_datasets.prior.as_returnn_opts(), + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + use_custom_engine=use_custom_engine, + prior=True, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config + + +def get_search_config( + network_module: str, + net_args: Dict[str, Any], + decoder: [str], + decoder_args: Dict[str, Any], + config: Dict[str, Any], + debug: bool = False, + use_custom_engine=False, + **kwargs, +): + """ + Returns the RETURNN config serialized by :class:`ReturnnCommonSerializer` in returnn_common for the ctc_aligner + :param returnn_common_root: returnn_common version to be used, usually output of CloneGitRepositoryJob + :param training_datasets: datasets for training + :param kwargs: arguments to be passed to the network construction + :return: RETURNN training config + """ + + # changing these does not change the hash + post_config = {} + + base_config = { + ############# + "batch_size": 24000 * 160, + "max_seqs": 60, + ############# + # dataset is added later in the pipeline during search_single + } + config = {**base_config, **copy.deepcopy(config)} + post_config["backend"] = "torch" + + serializer = get_pytorch_serializer_v3( + network_module=network_module, + net_args=net_args, + debug=debug, + 
use_custom_engine=use_custom_engine, + decoder=decoder, + decoder_args=decoder_args, + ) + returnn_config = ReturnnConfig(config=config, post_config=post_config, python_epilog=[serializer]) + return returnn_config diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/data.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/data.py new file mode 100644 index 000000000..35d50f268 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/data.py @@ -0,0 +1,92 @@ +""" +The new version of data.py for the 2023 Slurm and Rescale/NeuroSys setups +""" +from sisyphus import tk +from dataclasses import dataclass +from functools import lru_cache +from typing import Dict, List, Optional, Tuple + +from i6_core.returnn import CodeWrapper + +from i6_experiments.common.datasets.tedlium2.corpus import get_ogg_zip_dict +from i6_experiments.common.datasets.tedlium2.vocab import get_subword_nmt_bpe_v2 +from i6_experiments.common.datasets.tedlium2.lexicon import get_bliss_lexicon +from i6_experiments.common.helpers.text_labels.subword_nmt_bpe import get_returnn_subword_nmt + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import BpeDatastream +from i6_experiments.users.rossenbach.lexicon.bpe_lexicon import CreateBPELexiconJob + +from returnn_common.datasets import Dataset, OggZipDataset, MetaDataset + +from ..data import build_training_datasets, TrainingDatasetSettings, TrainingDatasets + +from ..default_tools import MINI_RETURNN_ROOT, RETURNN_EXE + + +from ..data import DATA_PREFIX + + +def get_lexicon(bpe_size: int) -> tk.Path: + subword_nmt_repo = get_returnn_subword_nmt( + commit_hash="5015a45e28a958f800ef1c50e7880c0c9ef414cf", output_prefix=DATA_PREFIX + ) + subword_nmt_repo.hash_overwrite = "I6_SUBWORD_NMT_V2" + + bpe_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + bpe_lexicon = CreateBPELexiconJob( + base_lexicon_path=get_bliss_lexicon( + add_unknown_phoneme_and_mapping=False, add_silence=False, output_prefix="tedliumv2_datasets" + ), + bpe_codes=bpe_datastream.codes, + bpe_vocab=bpe_datastream.vocab, + subword_nmt_repo=subword_nmt_repo, + unk_label="", + ).out_lexicon + + return bpe_lexicon + + +def get_text_lexicon(bpe_size: int) -> tk.Path: + """ + + :return: + """ + bliss_lex = get_lexicon(bpe_size=bpe_size) + from i6_experiments.users.rossenbach.lexicon.conversion import BlissLexiconToWordLexicon + + word_lexicon = BlissLexiconToWordLexicon(bliss_lex).out_lexicon + return word_lexicon + + +def get_bpe_datastream(bpe_size: int, is_recog: bool) -> BpeDatastream: + """ + Returns the datastream for the bpe labels + + Uses the legacy BPE setup that is compatible with old LM models + + :param librispeech_key: + :param bpe_size: size for the bpe labels + :param is_recog: removes the UNK label when not in training + :param use_v2: subword_nmt had a bug where it would not find python, use corrected version which changes hash + """ + bpe_settings = get_subword_nmt_bpe_v2(bpe_size=bpe_size, unk_label="") + bpe_targets = BpeDatastream(available_for_inference=False, bpe_settings=bpe_settings, use_unk_label=is_recog) + return bpe_targets + + +def build_bpe_training_datasets( + bpe_size: int, + settings: TrainingDatasetSettings, +) -> TrainingDatasets: + """ + :param settings: configuration object for the dataset pipeline + """ + label_datastream = get_bpe_datastream(bpe_size=bpe_size, is_recog=False) + + ogg_zip_dict = 
get_ogg_zip_dict(returnn_python_exe=RETURNN_EXE, returnn_root=MINI_RETURNN_ROOT) + train_ogg = ogg_zip_dict["train"] + dev_ogg = ogg_zip_dict["dev"] + + return build_training_datasets( + settings=settings, train_ogg=train_ogg, dev_ogg=dev_ogg, label_datastream=label_datastream + ) diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_baseline.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_baseline.py new file mode 100644 index 000000000..9177609e8 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_baseline.py @@ -0,0 +1,706 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream +from i6_core.report.report import _Report_Type + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + +def flash_bpe_rnnt_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + +def conformer_rnnt_baseline(): + prefix_name = "experiments/rescale/tedliumv2/torchaudio_bpe_rnnt/baseline/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + num_epochs=250, + decoder="rnnt.decoder.experimental_rnnt_decoder", + with_prior=False, + evaluate_epoch=None, + eval_best=True, + ): + training_name = 
"/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if not evaluate_epoch: + evaluate_epoch = num_epochs + search_job_ls = [] + report = {} + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % evaluate_epoch, + returnn_search_config, + train_job.out_checkpoints[evaluate_epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + use_gpu=search_args.get("use_gpu", False), + ) + search_job_ls += search_jobs + report.update(values_report) + + from i6_core.returnn import GetBestPtCheckpointJob + if eval_best: + best_job = GetBestPtCheckpointJob(train_job.out_model_dir, train_job.out_learning_rates, key="dev_loss_rnnt") + best_job.add_alias(ft_name + "/get_best_job") + format_string_report, values_report, search_jobs = search( + ft_name + "/best_chkpt", + returnn_search_config, + best_job.out_checkpoint, + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_rnnt_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + + train_args_adamw03_accum2_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 180 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": True, + } + + default_search_args = { + "lexicon": get_text_lexicon(bpe_size=BPE_SIZE), # TODO: cleanup + "returnn_vocab": label_datastream.vocab, + "beam_size": 1024, + "arpa_lm": arpa_ted_lm, + "beam_threshold": 14, + } + + #### New experiments with corrected FF-Dim + + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v4_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + 
final_dropout=0.2, + specauc_start_epoch=10, + ) + + model_config_sub6 = copy.deepcopy(model_config) + model_config_sub6.frontend_config.pool1_stride = (3, 1) + model_config_sub6.frontend_config.pool1_kernel_size = (3, 1) + + model_config_sub6_later = copy.deepcopy(model_config_sub6) + model_config_sub6_later.specauc_start_epoch = 40 + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v4_transparent_latepredictor", + "net_args": {"model_config_dict": asdict(model_config_sub6_later)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_transparent_latepredictor/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 14.9 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v4_JJLR_sub6_transparent_latepredictor/bs12" + ) + del results + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + PredictorConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(3, 1), + pool1_stride=(3, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=1024, + lstm_dropout=0.3, + ) + model_config_v5_sub6 = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12", + datasets=train_data, + train_args=train_args, + 
search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 11.3 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12" + ) + del results + + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12_ep134", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + evaluate_epoch=134, + ) + results.update(wer_values) + del wer_values + generate_report( # 13.5 + results=results, exp_name=prefix_name +"conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_transparent/bs12_ep134", + ) + del results + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(3, 1), + pool1_stride=(3, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + num_lstm_layers=1, + lstm_hidden_dim=512, + lstm_dropout=0.3, + ) + model_config_v5_sub6_512lstm = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v5_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.4 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v5_JJLR_sub6_start20_lstm512_transparent/bs12" + ) + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + PredictorConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(3, 1), + pool1_stride=(3, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=384, + 
activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + emebdding_dropout=0.1, + num_lstm_layers=1, + lstm_hidden_dim=512, + lstm_dropout=0.3, + ) + model_config_v5_sub6_512lstm = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + joiner_dropout=0.1, + ) + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12" + ) + del results + + results = {} + for beam_size in [1, 2, 4, 8, 12, 16, 20, 24, 32, 64, 128]: + search_args_gpu = { + "beam_size": beam_size, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs%u_gpu" + % beam_size, + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/base" + ) + del results + + search_args_gpu = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + "batched_encoder_decoding": True, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12_gpu_batched", + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs12_gpu_batched" + ) + del results + + results = {} + for blank_log_penalty in [0.1, 0.2, 0.3]: + search_args_gpu = { + "beam_size": 16, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + "blank_log_penalty": blank_log_penalty, + } + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs16_bp%.1f_gpu" + % blank_log_penalty, + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + results.update(wer_values) + del wer_values + 
generate_report( # 10.0 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_JJLR_sub6_start20_lstm512_transparent/bs_16_penalty" + ) + del results + + train_args_const20 = copy.deepcopy(train_args) + train_args_const20["config"]["learning_rates"] = ( + list(np.linspace(1e-4, 1e-4, 20)) + + list(np.linspace(1e-4, 7e-4, 90)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)) + ) + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_const20_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args_const20, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 10.1 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v6_const20_sub6_start20_lstm512_transparent/bs12" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + _, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_transparent/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + results.update(wer_values) + del wer_values + generate_report( # 9.8 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_transparent/bs12" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + train_job, _, _, wer_values= run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 9.6 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512/bs12" + ) + del results + # TODO: This here above is the best baseline with 9.3%, with the accum step 3 setting also runnable on 11GB GPU + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["learning_rates"] = ( + list(np.linspace(7e-6, 7e-4, 220)) + + list(np.linspace(7e-4, 7e-5, 220)) + + list(np.linspace(7e-5, 1e-8, 60))) + train_args["config"]["batch_size"] = 120 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 3 + search_args = { + "beam_size": 12, + "returnn_vocab": 
label_datastream.vocab, + } + results = {} + train_job, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_longer/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + num_epochs=500 + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 9.6 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_longer/bs12" + ) + del results + + train_args = { + **copy.deepcopy(train_args_adamw03_accum2_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7", + "net_args": {"model_config_dict": asdict(model_config_v5_sub6_512lstm)}, + } + train_args["config"]["batch_size"] = 180 * 16000 + train_args["config"]["accum_grad_multiple_step"] = 2 + search_args = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + } + results = {} + train_job, _, _, wer_values = run_exp( + prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_r2/bs12", + datasets=train_data, + train_args=train_args, + search_args=search_args, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + results.update(wer_values) + del wer_values + generate_report( # 9.5 + results=results, exp_name=prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_JJLR_sub6_start20_lstm512_r2/bs12" + ) + del results diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_espnet_like.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_espnet_like.py new file mode 100644 index 000000000..9b7cb61d0 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_espnet_like.py @@ -0,0 +1,252 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + + +def conformer_rnnt_espnet_like(): + """ + + ESPNet like means BPE 500 and subsampling 4 + + :return: + """ + + prefix_name = "experiments/rescale/tedliumv2/torchaudio_bpe_rnnt/espnet_like" + + BPE_SIZE = 500 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + 
lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + num_epochs=250, + decoder="rnnt.decoder.experimental_rnnt_decoder", + with_prior=False, + evaluate_epoch=None, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if not evaluate_epoch: + evaluate_epoch = num_epochs + + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + _, _, search_jobs = search( + ft_name + "/default_%i" % evaluate_epoch, + returnn_search_config, + train_job.out_checkpoints[evaluate_epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + use_gpu=search_args.get("use_gpu", False), + ) + + return train_job, search_jobs + + from ..pytorch_networks.rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v6_cfg import ( + SpecaugConfig, + VGG4LayerActFrontendV1Config_mod, + ModelConfig, + PredictorConfig, + ) + + specaug_config = SpecaugConfig( + repeat_per_n_frames=25, + max_dim_time=20, + max_dim_feat=16, + num_repeat_feat=5, + ) + frontend_config = VGG4LayerActFrontendV1Config_mod( + in_features=80, + conv1_channels=32, + conv2_channels=64, + conv3_channels=64, + conv4_channels=32, + conv_kernel_size=(3, 3), + conv_padding=None, + pool1_kernel_size=(2, 1), + pool1_stride=(2, 1), + pool1_padding=None, + pool2_kernel_size=(2, 1), + pool2_stride=(2, 1), + pool2_padding=None, + activation_str="ReLU", + out_features=256, + activation=None, + ) + predictor_config = PredictorConfig( + symbol_embedding_dim=256, + emebdding_dropout=0.2, + num_lstm_layers=1, + lstm_hidden_dim=256, + lstm_dropout=0.1, + ) + model_config = ModelConfig( + frontend_config=frontend_config, + specaug_config=specaug_config, + predictor_config=predictor_config, + label_target_size=vocab_size_without_blank, + conformer_size=256, + num_layers=12, + num_heads=4, + ff_dim=1024, + att_weights_dropout=0.1, + conv_dropout=0.1, + ff_dropout=0.1, + mhsa_dropout=0.1, + conv_kernel_size=31, + final_dropout=0.1, + specauc_start_epoch=10, + joiner_dim=320, + joiner_activation="tanh", + joiner_dropout=0.1, + ) + + train_args_adamw03_24gb_jjlr = { + "config": { + "optimizer": {"class": "adamw", "epsilon": 1e-16, "weight_decay": 1e-3}, + "learning_rates": list(np.linspace(7e-6, 7e-4, 110)) + + list(np.linspace(7e-4, 7e-5, 110)) + + list(np.linspace(7e-5, 1e-8, 30)), + ############# + "batch_size": 200 * 16000, + "max_seq_length": {"audio_features": 35 * 16000}, + "accum_grad_multiple_step": 2, + }, + "debug": True, + } + + train_args = { + **copy.deepcopy(train_args_adamw03_24gb_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config)}, + } + + search_args_gpu = { + "beam_size": 12, + "returnn_vocab": label_datastream.vocab, + "use_gpu": True, # also for new hash + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_small_bs200ac2/bs12_gpu", + datasets=train_data, + train_args=train_args, 
+ search_args=search_args_gpu, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + + model_config_ff2048 = copy.deepcopy(model_config) + model_config_ff2048.ff_dim = 2048 + train_args_ff2048 = { + **copy.deepcopy(train_args_adamw03_24gb_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config_ff2048)}, + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_small_bs200ac2_ff2048/bs12_gpu", + datasets=train_data, + train_args=train_args_ff2048, + search_args=search_args_gpu, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 + + # TODO: Does not fit + # train_args_bs300ac1 = copy.deepcopy(train_args) + # train_args_bs300ac1["config"]["batch_size"] = 300 * 16000 + # train_args_bs300ac1["config"]["accum_grad_multiple_step"] = 1 + # train_job, _ = run_exp( + # prefix_name + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_small_bs300ac1/bs12_gpu", + # datasets=train_data, train_args=train_args_bs300ac1, search_args=search_args_gpu, with_prior=False) + # train_job.rqmt["gpu_mem"] = 24 + + # Do it large instead + model_config_v5_enc384_dec512 = ModelConfig( + frontend_config=copy.deepcopy(frontend_config), + specaug_config=specaug_config, + predictor_config=copy.deepcopy(predictor_config), + label_target_size=vocab_size_without_blank, + conformer_size=384, + num_layers=12, + num_heads=4, + ff_dim=1536, + att_weights_dropout=0.2, + conv_dropout=0.2, + ff_dropout=0.2, + mhsa_dropout=0.2, + conv_kernel_size=31, + final_dropout=0.2, + specauc_start_epoch=20, + joiner_dim=512, + joiner_activation="relu", + joiner_dropout=0.1, + ) + model_config_v5_enc384_dec512.predictor_config.lstm_hidden_dim = 512 + model_config_v5_enc384_dec512.predictor_config.lstm_dropout = 0.3 + model_config_v5_enc384_dec512.predictor_config.emebdding_dropout = 0.1 + model_config_v5_enc384_dec512.frontend_config.out_features = 384 + + train_args = { + **copy.deepcopy(train_args_adamw03_24gb_jjlr), + "network_module": "rnnt.conformer_1023.i6modelsV1_VGG4LayerActFrontendV1_v7_transparent", + "net_args": {"model_config_dict": asdict(model_config_v5_enc384_dec512)}, + } + train_job, _ = run_exp( + prefix_name + + "conformer_1023/i6modelsV1_VGG4LayerActFrontendV1_v7_transparent_JJLR_sub4_enc384_dec512/bs12_gpu", + datasets=train_data, + train_args=train_args, + search_args=search_args_gpu, + with_prior=False, + ) + train_job.rqmt["gpu_mem"] = 24 diff --git a/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_pretrained.py b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_pretrained.py new file mode 100644 index 000000000..edb789284 --- /dev/null +++ b/users/hilmes/experiments/nick_setups/tedlium2_standalone_2023/torchaudio_bpe_rnnt/exp_pretrained.py @@ -0,0 +1,180 @@ +from sisyphus import tk + +import copy +from dataclasses import asdict +import numpy as np +from typing import cast + +from i6_experiments.users.rossenbach.common_setups.returnn.datastreams.vocabulary import LabelDatastream +from i6_core.report.report import _Report_Type + +from .data import build_bpe_training_datasets, TrainingDatasetSettings, get_text_lexicon +from ..data import build_test_dataset +from ..default_tools import RETURNN_EXE, MINI_RETURNN_ROOT + +from ..pipeline import training, search, compute_prior + +from .config import get_training_config, get_search_config, get_prior_config + 
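+# Fine-tuning of a pretrained HuBERT model (HubertConfig name="base-ls960",
+# presumably the 960h-LibriSpeech base checkpoint) within the BPE RNN-T setup,
+# via the rnnt.conformer_1023.hubert_pretrain_v1 network module.
+# flash_bpe_rnnt_report_format below is duplicated from exp_baseline.py.
+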
+def flash_bpe_rnnt_report_format(report: _Report_Type) -> str: + extra_ls = [] + out = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if not any(extra in recog for extra in extra_ls)] + out = sorted(out, key=lambda x: float(x[1])) + best_ls = [out[0]] + for extra in extra_ls: + out2 = [(" ".join(recog.split("/")[-3:]), str(report[recog])) for recog in report if extra in recog] + out2 = sorted(out2, key=lambda x: float(x[1])) + if len(out2) > 0: + out.append((extra, "")) + out.extend(out2) + best_ls.append(out2[0]) + best_ls = sorted(best_ls, key=lambda x: float(x[1])) + out.append(("Best Results", "")) + out.extend(best_ls) + return "\n".join([f"{pair[0]}: {str(pair[1])}" for pair in out]) + + +def pretrained_rnnt(): + prefix_name = "experiments/rescale/tedliumv2/torchaudio_bpe_rnnt/" + + BPE_SIZE = 1000 + + train_settings = TrainingDatasetSettings( + custom_processing_function=None, partition_epoch=5, epoch_wise_filters=[], seq_ordering="laplace:.1000" + ) + + train_settings_retrain = copy.deepcopy(train_settings) + train_settings_retrain.epoch_wise_filters = [] + + # build the training datasets object containing train, cv, dev-train and the extern_data dict + train_data = build_bpe_training_datasets( + bpe_size=BPE_SIZE, + settings=train_settings, + ) + label_datastream = cast(LabelDatastream, train_data.datastreams["labels"]) + vocab_size_without_blank = label_datastream.vocab_size + + # build testing datasets + test_dataset_tuples = {} + # for testset in ["dev", "test"]: + for testset in ["dev"]: + test_dataset_tuples[testset] = build_test_dataset( + dataset_key=testset, + ) + from i6_experiments.common.baselines.tedlium2.lm.ngram_config import run_tedlium2_ngram_lm + + lms_system = run_tedlium2_ngram_lm(add_unknown_phoneme_and_mapping=False) + lm = lms_system.interpolated_lms["dev-pruned"]["4gram"] + arpa_ted_lm = lm.ngram_lm + + # ---------------------------------------------------------------------------------------------------------------- # + + def run_exp( + ft_name, + datasets, + train_args, + search_args=None, + num_epochs=250, + decoder="rnnt.decoder.experimental_rnnt_decoder", + with_prior=False, + evaluate_epoch=None, + ): + training_name = "/".join(ft_name.split("/")[:-1]) + search_args = search_args if search_args is not None else {} + + returnn_config = get_training_config(training_datasets=datasets, **train_args) + train_job = training(training_name, returnn_config, RETURNN_EXE, MINI_RETURNN_ROOT, num_epochs=num_epochs) + + if not evaluate_epoch: + evaluate_epoch = num_epochs + search_job_ls = [] + report = {} + returnn_search_config = get_search_config(**train_args, decoder_args=search_args, decoder=decoder) + format_string_report, values_report, search_jobs = search( + ft_name + "/default_%i" % evaluate_epoch, + returnn_search_config, + train_job.out_checkpoints[evaluate_epoch], + test_dataset_tuples, + RETURNN_EXE, + MINI_RETURNN_ROOT, + use_gpu=search_args.get("use_gpu", False), + ) + search_job_ls += search_jobs + report.update(values_report) + + return train_job, search_job_ls, format_string_report, report + + def generate_report(results, exp_name): + from i6_core.report import GenerateReportStringJob, MailJob + + report = GenerateReportStringJob(report_values=results, report_template=flash_bpe_rnnt_report_format) + report.add_alias(f"report/report/{exp_name}") + mail = MailJob(report.out_report, send_contents=True, subject=exp_name) + mail.add_alias(f"report/mail/{exp_name}") + tk.register_output("mail/" + exp_name, mail.out_status) + 
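+    # Setup below: finetune_layer=2 presumably limits training to the top two
+    # transformer layers of the pretrained HuBERT encoder (the exact semantics
+    # are defined in the rnnt.conformer_1023.hubert_pretrain_v1 network module),
+    # and specauc_start_epoch=0 applies SpecAugment from the first epoch on.
+    # With max_seqs=3 and accum_grad_multiple_step=25, each update covers at
+    # most 3 * 25 = 75 sequences.
+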
+    from ..pytorch_networks.rnnt.conformer_1023 import hubert_pretrain_v1_cfg
+
+    predictor_config = hubert_pretrain_v1_cfg.PredictorConfig(
+        symbol_embedding_dim=256,
+        emebdding_dropout=0.1,
+        num_lstm_layers=1,
+        lstm_hidden_dim=512,
+        lstm_dropout=0.3,
+    )
+
+    hubert_cfg_2 = hubert_pretrain_v1_cfg.HubertConfig(
+        finetune_layer=2,
+        name="base-ls960",
+    )
+    model_config_hubert_2 = hubert_pretrain_v1_cfg.ModelConfig(
+        specauc_start_epoch=0,
+        label_target_size=vocab_size_without_blank,
+        final_dropout=0.2,
+        hubert_cfg=hubert_cfg_2,
+        predictor_config=predictor_config,
+        joiner_dim=512,
+        joiner_activation="relu",
+        joiner_dropout=0.1,
+    )
+
+    train_args_hubert_adam_accum25_jjlr = {
+        "config": {
+            "optimizer": {"class": "adam", "epsilon": 1e-08, "betas": (0.9, 0.98)},
+            "learning_rates": list(np.linspace(7e-6, 7e-4, 110))
+            + list(np.linspace(7e-4, 7e-5, 110))
+            + list(np.linspace(7e-5, 1e-8, 30)),
+            #############
+            "batch_size": 180 * 16000,
+            "max_seq_length": {"audio_features": 35 * 16000},
+            "max_seqs": 3,
+            "accum_grad_multiple_step": 25,
+        },
+        "debug": False,
+    }
+    eval_epochs = [5, 10, 20, 30, 50, 75, 100, 150, 200, 250]
+    train_args = {
+        **copy.deepcopy(train_args_hubert_adam_accum25_jjlr),
+        "network_module": "rnnt.conformer_1023.hubert_pretrain_v1",
+        "net_args": {"model_config_dict": asdict(model_config_hubert_2)},
+    }
+    search_args = {
+        "beam_size": 12,
+        "returnn_vocab": label_datastream.vocab,
+    }
+    results = {}
+    train_job, _, _, wer_values = run_exp(
+        prefix_name + "conformer_1023/hubert_pretrain_v3_base_tune2_jjlr/bs12",
+        datasets=train_data,
+        train_args=train_args,
+        search_args=search_args,
+        with_prior=False,
+    )
+    train_job.rqmt["gpu_mem"] = 24
+    results.update(wer_values)
+    del wer_values
+    generate_report(
+        results=results, exp_name=prefix_name + "conformer_1023/hubert_pretrain_v3_base_tune2_jjlr"
+    )
+    del results
diff --git a/users/hilmes/tools/onnx.py b/users/hilmes/tools/onnx.py
new file mode 100644
index 000000000..50a8de527
--- /dev/null
+++ b/users/hilmes/tools/onnx.py
@@ -0,0 +1,237 @@
+import sys
+import os
+from sisyphus import Job, Task, tk
+from typing import Any, Dict, Optional, Tuple, List, Union
+import logging
+
+from i6_core.returnn.config import ReturnnConfig
+from i6_core.returnn.training import PtCheckpoint
+from onnxruntime.quantization import quant_pre_process, quantize_static, CalibrationDataReader, CalibrationMethod, QuantType, QuantFormat
+from onnxruntime import InferenceSession, SessionOptions
+from returnn.datasets import Dataset, init_dataset
+from returnn.datasets.meta import MetaDataset
+import numpy as np
+
+
+class ExportPyTorchModelToOnnxJob(Job):
+    """
+    Experimental exporter job
+
+    JUST FOR DEBUGGING, THIS FUNCTIONALITY SHOULD BE IN RETURNN ITSELF
+    """
+
+    __sis_hash_exclude__ = {"quantize_dynamic": False, "quantize_static": False}
+
+    def __init__(self, pytorch_checkpoint: PtCheckpoint, returnn_config: ReturnnConfig, returnn_root: tk.Path, quantize_dynamic: bool = False):
+
+        self.pytorch_checkpoint = pytorch_checkpoint
+        self.returnn_config = returnn_config
+        self.returnn_root = returnn_root
+        self.quantize_dynamic = quantize_dynamic
+
+        self.out_onnx_model = self.output_path("model.onnx")
+        self.rqmt = {"time": 2, "cpu": 4, "mem": 16}
+
+    def tasks(self):
+        yield Task("run", rqmt=self.rqmt)
+
+    def run(self):
+        sys.path.insert(0, self.returnn_root.get())
+        import torch
+        from returnn.config import Config
+
+        config = Config()
+        self.returnn_config.write("returnn.config")
+        config.load_file("returnn.config")
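+
+        # The checkpoint may either be a plain state dict or, as written by the RETURNN
+        # PyTorch engine, a dict holding "epoch", "step" and "model"; the "get_model" and
+        # "export" callables have to be provided by the given ReturnnConfig.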
+        model_state = torch.load(str(self.pytorch_checkpoint), map_location=torch.device("cpu"))
+        if isinstance(model_state, dict):
+            epoch = model_state["epoch"]
+            step = model_state["step"]
+            model_state = model_state["model"]
+        else:
+            epoch = 1
+            step = 0
+
+        get_model_func = config.typed_value("get_model")
+        assert get_model_func, "get_model not defined"
+        model = get_model_func(epoch=epoch, step=step)
+        assert isinstance(model, torch.nn.Module)
+
+        model.load_state_dict(model_state)
+
+        export_func = config.typed_value("export")
+        assert export_func
+        if self.quantize_dynamic:
+            import onnx
+            from onnxruntime.quantization import quantize_dynamic
+
+            model_fp32 = 'tmp_model.onnx'
+            export_func(model=model, model_filename=model_fp32)
+            quantized_model = quantize_dynamic(model_fp32, self.out_onnx_model.get())
+        else:
+            export_func(model=model, model_filename=self.out_onnx_model.get())
+
+
+class ModelQuantizeStaticJob(Job):
+
+    __sis_hash_exclude__ = {
+        "moving_average": False,
+        "smoothing_factor": 0.0,
+        "symmetric": False,
+        "activation_type": QuantType.QInt8,
+        "quant_format": QuantFormat.QDQ,
+        "weight_type": QuantType.QInt8,
+        "final_skip": (None, None),
+        "ops_to_quant": None,
+        "smooth_quant": False,
+    }
+
+    def __init__(self,
+        model: tk.Path,
+        dataset: Dict[str, Any],
+        num_seqs: int = 10,
+        num_parallel_seqs: int = 25,
+        calibrate_method: CalibrationMethod = CalibrationMethod.MinMax,
+        moving_average: bool = False,
+        smoothing_factor: float = 0.0,
+        symmetric: bool = False,
+        activation_type=QuantType.QInt8,
+        quant_format=QuantFormat.QDQ,
+        weight_type=QuantType.QInt8,
+        final_skip: Tuple[Optional[int], Optional[int]] = (None, None),
+        ops_to_quant: Optional[List[str]] = None,
+        smooth_quant: bool = False,
+    ):
+        """
+        :param model: ONNX model to be quantized
+        :param dataset: RETURNN dataset dict from which the calibration sequences are drawn
+        :param num_seqs: number of sequences used for calibration
+        :param num_parallel_seqs: passed as CalibMaxIntermediateOutputs to the calibrator
+        :param moving_average: whether to use moving average for MinMax or Symmetry for Entropy
+        """
+        self.model = model
+        self.dataset = dataset
+        self.num_seqs = num_seqs
+        self.num_parallel_seqs = num_parallel_seqs
+        self.moving_average = moving_average
+        self.activation_type = activation_type
+        self.quant_format = quant_format
+        self.weight_type = weight_type
+
+        self.out_model = self.output_path("model.onnx")
+        if num_seqs >= 5000:
+            time = 12
+        elif num_seqs >= 2500:
+            time = 6
+        elif num_seqs >= 1000:
+            time = 4
+        else:
+            time = 1
+        if not calibrate_method == CalibrationMethod.MinMax:
+            time *= 2
+
+        self.rqmt = {
+            "cpu": 8 if num_seqs > 100 else 4,
+            "mem": 16.0 if calibrate_method == CalibrationMethod.MinMax else 48,
+            "time": time,
+        }
+        self.calibration_method = calibrate_method
+        self.smoothing_factor = smoothing_factor
+        self.symmetric = symmetric
+        self.final_skip = final_skip
+        self.smooth_quant = smooth_quant
+        self.ops_to_quant = ops_to_quant
+        self.out_dev_log = self.output_path("dev_log")
+
+    def tasks(self):
+        yield Task("run", rqmt=self.rqmt)
+
+    def convert_to_str(self, dataset: Dict):
+        res = {}
+        for x in dataset:
+            if isinstance(dataset[x], dict):
+                res[x] = self.convert_to_str(dataset[x])
+            elif isinstance(dataset[x], tk.Path):
+                res[x] = str(dataset[x])
+            else:
+                res[x] = dataset[x]
+        return res
+
+    def run(self):
+        print("Start")
+        quant_pre_process(
+            input_model_path=self.model.get_path(),
+            output_model_path="model_prep.onnx",
+        )
+
+        class DummyDataReader(CalibrationDataReader):
+
+            def __init__(self, model_str: str, data: Union[Dataset, MetaDataset], max_seqs: int, final_skip: Optional[Tuple[int, int]] = (None, None)):
+
+                self.max_seqs = max_seqs
+                self.data = data
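+                # The InferenceSession below is only opened to read off the two input names
+                # of the prepped model (features and their lengths); the calibration batches
+                # are later fed under exactly these names, one sequence at a time.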
+                self.idx: int = 0
+                sess_option = SessionOptions()
+                logging.info(f"Data Loading {os.getenv('SLURM_CPUS_PER_TASK')}")
+                # assumes execution under SLURM: SLURM_CPUS_PER_TASK has to be set
+                sess_option.intra_op_num_threads = int(os.getenv('SLURM_CPUS_PER_TASK'))
+                session = InferenceSession(model_str, sess_option)
+                self.input_name_1 = session.get_inputs()[0].name
+                self.input_name_2 = session.get_inputs()[1].name
+                self.final_skip_step = final_skip[0]
+                self.final_skip_count = final_skip[1]
+
+            def get_next(self):
+                init_dataset(self.data)
+                key = "data" if "data" in self.data.data_keys else "raw_audio"  # hack to make it compatible with both setups for now
+                if not self.data.is_less_than_num_seqs(self.idx) or self.idx >= self.max_seqs:
+                    if self.final_skip_step is not None and self.idx < self.max_seqs + self.final_skip_step * self.final_skip_count:
+                        self.idx += self.final_skip_step
+                        logging.info(f"Skipping to Seq {self.idx}")
+                        self.data.load_seqs(self.idx, self.idx + 1)
+                        seq_len: np.ndarray = self.data.get_seq_length(self.idx)[key]
+                        data: np.ndarray = self.data.get_data(self.idx, key)
+                        seq_len = np.array([seq_len], dtype=np.int32)
+                        data = np.expand_dims(data, axis=0)
+                        return {self.input_name_1: data, self.input_name_2: seq_len}
+                    else:
+                        return None
+                self.data.load_seqs(self.idx, self.idx + 1)
+                seq_len: np.ndarray = self.data.get_seq_length(self.idx)[key]
+                data: np.ndarray = self.data.get_data(self.idx, key)
+                if self.idx % 10 == 0:
+                    logging.info(f"{self.idx} seqs seen")
+                seq_len = np.array([seq_len], dtype=np.int32)
+                data = np.expand_dims(data, axis=0)
+                self.idx += 1
+                return {self.input_name_1: data, self.input_name_2: seq_len}
+
+            def __iter__(self):
+                # materialize all calibration batches once, then yield them
+                data = []
+                x = self.get_next()
+                while x is not None:
+                    data.append(x)
+                    x = self.get_next()
+                for x in data:
+                    yield x
+
+        self.dataset = self.convert_to_str(self.dataset)
+        dataset: Dataset = init_dataset(self.dataset)
+        dataset.init_seq_order(1)
+        y = DummyDataReader(model_str="model_prep.onnx", data=dataset, max_seqs=self.num_seqs, final_skip=self.final_skip)
+        quant_options = {
+            "CalibMaxIntermediateOutputs": self.num_parallel_seqs,
+            "CalibMovingAverage": self.moving_average,
+            "CalibTensorRangeSymmetric": self.symmetric,
+        }
+        if self.smoothing_factor > 0.0:
+            quant_options["CalibSmoothRange"] = self.smoothing_factor
+        if self.smooth_quant:
+            quant_options["SmoothQuant"] = True
+        quantize_static(
+            model_input="model_prep.onnx",
+            model_output=self.out_model.get_path(),
+            calibration_data_reader=y,
+            calibrate_method=self.calibration_method,
+            extra_options=quant_options,
+            quant_format=self.quant_format,
+            activation_type=self.activation_type,
+            weight_type=self.weight_type,
+            op_types_to_quantize=self.ops_to_quant,
+        )
+
+        import shutil
+
+        if self.final_skip[0] or self.final_skip[1]:
+            shutil.move("calibrate_tensors_dev", self.out_dev_log)
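+
+
+# Example of chaining the two jobs above in a recipe (sketch only; the checkpoint path,
+# the forward ReturnnConfig and the calibration dataset dict are placeholders):
+#
+#   onnx_job = ExportPyTorchModelToOnnxJob(
+#       pytorch_checkpoint=PtCheckpoint(tk.Path("/path/to/epoch.250.pt")),
+#       returnn_config=forward_returnn_config,
+#       returnn_root=tk.Path("/path/to/returnn"),
+#   )
+#   quant_job = ModelQuantizeStaticJob(
+#       model=onnx_job.out_onnx_model,
+#       dataset=calibration_dataset_dict,
+#       num_seqs=100,
+#   )
+#   tk.register_output("onnx/model_int8.onnx", quant_job.out_model)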