recipes/Google-speech-commands/hparams/xvect_leaf.yaml

# ################################
# Model: Classification with xvector and LEAF
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
#          Further adapted for LEAF by Sarthak Yadav 2022
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/xvect_leaf_legacy_complex_mvnorm_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <save_folder>/train.csv
valid_annotation: !ref <save_folder>/valid.csv
test_annotation: !ref <save_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 100
batch_size: 32
lr: 0.001
lr_final: 0.0001

sample_rate: 16000
shuffle: True


# Feature parameters
n_features: 24

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 4
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
compute_features: !new:speechbrain.lobes.features.Leaf
    out_channels: !ref <n_features>
    in_channels: 1
    min_freq: 0.
    n_fft: 400
    use_legacy_complex: True

embedding_model: !new:speechbrain.lobes.models.Xvector.Xvector
    in_channels: !ref <n_features>
    activation: !name:torch.nn.LeakyReLU
    tdnn_blocks: 5
    tdnn_channels: [512, 512, 512, 512, 1500]
    tdnn_kernel_sizes: [5, 3, 3, 1, 1]
    tdnn_dilations: [1, 2, 3, 1, 1]
    lin_neurons: 512

classifier: !new:speechbrain.lobes.models.Xvector.Classifier
    input_shape: [null, null, 512]
    activation: !name:torch.nn.LeakyReLU
    lin_blocks: 1
    lin_neurons: 512
    out_neurons: !ref <out_n_neurons>

softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: True

modules:
    compute_features: !ref <compute_features>
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    softmax: !ref <softmax>
    mean_var_norm: !ref <mean_var_norm>


# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
    initial_value: !ref <lr>
    final_value: !ref <lr_final>
    epoch_count: !ref <number_of_epochs>

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        normalizer: !ref <mean_var_norm>
        counter: !ref <epoch_counter>