recipes/CommonVoice/ASR/seq2seq/hparams/train_en.yaml

# ################################
# Model: VGG2 + LSTM + time pooling
# Augmentation: SpecAugment
# Authors: Titouan Parcollet, Mirco Ravanelli, Peter Plantinga, Ju-Chieh Chou,
# and Abdel HEBA 2020
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/CRDNN_it/<seed>
test_wer_file: !ref <output_folder>/wer_test.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
data_folder: !PLACEHOLDER  # e.g, /localscratch/cv-corpus-5.1-2020-06-22/fr
train_tsv_file: !ref <data_folder>/train.tsv  # Standard CommonVoice .tsv files
dev_tsv_file: !ref <data_folder>/dev.tsv  # Standard CommonVoice .tsv files
test_tsv_file: !ref <data_folder>/test.tsv  # Standard CommonVoice .tsv files
accented_letters: False
language: en # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for english
train_csv: !ref <save_folder>/train.csv
valid_csv: !ref <save_folder>/dev.csv
test_csv: !ref <save_folder>/test.csv
skip_prep: False # Skip data preparation

# We remove utterance slonger than 10s in the train/dev/test sets as
# longer sentences certainly correspond to "open microphones".
avoid_if_longer_than: 10.0

####################### Training Parameters ####################################

number_of_epochs: 25
number_of_ctc_epochs: 10
lr: 1.0
ctc_weight: 0.3
sorting: ascending
precision: fp32 # bf16, fp16 or fp32

# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 6 per GPU to fit 16GB of VRAM
batch_size: 12
test_batch_size: 6

dataloader_options:
    batch_size: !ref <batch_size>
    num_workers: 6
test_dataloader_options:
    batch_size: !ref <test_batch_size>
    num_workers: 6

# BPE parameters
token_type: unigram  # ["unigram", "bpe", "char"]
character_coverage: 1.0
label_smoothing: 0.1

# Feature parameters (FBANKS etc)
sample_rate: 16000
n_fft: 400
n_mels: 80

####################### Model Parameters #######################################
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 3
cnn_channels: (128, 200, 256)
inter_layer_pooling_size: (2, 2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 5
rnn_neurons: 1024
rnn_bidirectional: True
dnn_blocks: 2
dnn_neurons: 1024
emb_size: 128
dec_neurons: 1024
dec_hidden_size: !ref <dec_neurons>
dec_attn_dim: !ref <dec_neurons>

# Outputs
output_neurons: 500  # BPE size, index(blank/eos/bos) = 0

# Decoding parameters
# Be sure that the bos and eos index match with the BPEs ones
blank_index: 0
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 140
# ctc_weight_decode: 0.0
temperature: 1.50

#
# Functions and classes
#
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

############################## Augmentations ###################################

 # Time Drop
time_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
    drop_length_low: 15
    drop_length_high: 25
    drop_count_low: 5
    drop_count_high: 5

# Frequency Drop
freq_drop: !new:speechbrain.augment.freq_domain.SpectrogramDrop
    drop_length_low: 25
    drop_length_high: 35
    drop_count_low: 2
    drop_count_high: 2
    dim: 2

# Time warp
time_warp: !new:speechbrain.augment.freq_domain.Warping

fea_augment: !new:speechbrain.augment.augmenter.Augmenter
    min_augmentations: 3
    max_augmentations: 3
    augment_prob: 1.0
    augmentations: [
        !ref <time_drop>,
        !ref <freq_drop>,
        !ref <time_warp>]

normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

############################## Models ##########################################

enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
    input_shape: [null, null, !ref <n_mels>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
    time_pooling: True
    using_2d_pooling: False
    time_pooling_size: !ref <time_pooling_size>
    rnn_class: !ref <rnn_class>
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    rnn_re_init: True
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>

emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <dec_neurons>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_hidden_size>
    attn_dim: !ref <dec_attn_dim>
    num_layers: 1
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
    blank_index: !ref <blank_index>

seq_cost: !name:speechbrain.nnet.losses.nll_loss
    label_smoothing: !ref <label_smoothing>

modules:
    enc: !ref <enc>
    emb: !ref <emb>
    dec: !ref <dec>
    ctc_lin: !ref <ctc_lin>
    seq_lin: !ref <seq_lin>
    normalize: !ref <normalize>

model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]

opt_class: !name:torch.optim.Adadelta
    lr: !ref <lr>
    rho: 0.95
    eps: 1.e-8

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        scheduler: !ref <lr_annealing>
        normalizer: !ref <normalize>
        counter: !ref <epoch_counter>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: True