# conv2d.yaml (from a fork of speechbrain/speechbrain)
# #################################
# Basic training parameters for sound classification using the ESC50 dataset.
# This recipe uses a conv2d backbone for classification.
#
# Authors:
# * Cem Subakan 2022, 2023
# * Francesco Paissan 2022, 2023
# (based on the SpeechBrain UrbanSound8k recipe)
# #################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1234
# Applies torch.manual_seed to the seed above while this file is being loaded.
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]
# Set up folders for reading from and writing to
# data_folder has no default and must be supplied by the user.
data_folder: !PLACEHOLDER # e.g., /localscratch/ESC-50-master
audio_data_folder: !ref <data_folder>/audio
# wham_audio_folder is derived from wham_folder, so it is only meaningful
# once wham_folder has been set to a real path.
wham_folder: null # Set it if add_wham_noise is True
wham_audio_folder: !ref <wham_folder>/tr
# All outputs are grouped under ./results/<experiment_name>/<seed>.
experiment_name: conv2dv2_classifier-16k
output_folder: !ref ./results/<experiment_name>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
# NOTE(review): presumably skips training and only runs evaluation when True;
# confirm against the training script.
test_only: False
# Tensorboard logs
use_tensorboard: False
tensorboard_logs_folder: !ref <output_folder>/tb_logs/
# Path where data manifest files will be stored
train_annotation: !ref <data_folder>/manifest/train.json
valid_annotation: !ref <data_folder>/manifest/valid.json
test_annotation: !ref <data_folder>/manifest/test.json
# To standardize results, ESC50 samples come pre-assigned to folds.
# Folds 1-3 are used for training, fold 4 for validation and fold 5 for testing.
train_fold_nums: [1, 2, 3]
valid_fold_nums: [4]
test_fold_nums: [5]
# NOTE(review): presumably reuses already-existing manifest files when True;
# confirm against the data preparation script.
skip_manifest_creation: False
ckpt_interval_minutes: 15 # save checkpoint every N min
# Training parameters
number_of_epochs: 200
batch_size: 32
lr: 0.00002
# base_lr, max_lr and step_size are the arguments passed to lr_annealing.
base_lr: 0.000002
max_lr: !ref <lr>
step_size: 65000
sample_rate: 16000
signal_length_s: 5
add_wham_noise: False

# Feature parameters
n_mels: 80

# Number of classes
out_n_neurons: 50

# Data loading. The entries of dataloader_options are nested so that each
# !ref resolves to the top-level value of the same name instead of
# redefining (and referencing) itself.
shuffle: True
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: 0
# Model / feature selection flags
use_pretrained: True
use_melspectra: False
use_log1p_mel: False

# Embedding model. Indented keys are constructor arguments of the class
# named in the !new: tag.
embedding_model: !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
    dim: 256

# Classification head.
# NOTE(review): input_size presumably has to match the embedding model's
# dim above; confirm before changing either value.
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: 256
    out_neurons: !ref <out_n_neurons>
    lin_blocks: 1

# Alternative (disabled): a plain linear classification head.
# classifier: !new:torch.nn.Linear
#     in_features: 256
#     out_features: !ref <out_n_neurons>

# Epoch counter limited to number_of_epochs.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>

# Input normalization, configured per sentence with std normalization off.
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False
# pre-processing
n_fft: 1024
spec_mag_power: 0.5
# hop_length and win_length are expressed in milliseconds, which is the unit
# speechbrain.processing.features.STFT expects for these arguments.
hop_length: 11.6099
win_length: 23.2199

# The indented keys are STFT constructor arguments; each !ref points at the
# top-level scalar of the same name.
compute_stft: !new:speechbrain.processing.features.STFT
    n_fft: !ref <n_fft>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    sample_rate: !ref <sample_rate>

# Mel filterbank built with the same n_fft and sample_rate as the STFT.
compute_fbank: !new:speechbrain.processing.features.Filterbank
    n_mels: !ref <n_mels>
    n_fft: !ref <n_fft>
    sample_rate: !ref <sample_rate>
# Mapping of module names to objects. Every entry references an object
# defined at the top level of this file, so the entries must stay nested
# under `modules` to avoid shadowing those definitions.
modules:
    compute_stft: !ref <compute_stft>
    compute_fbank: !ref <compute_fbank>
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    mean_var_norm: !ref <mean_var_norm>
# Loss: LogSoftmaxWrapper receives an AdditiveAngularMargin instance as its
# loss_fn; margin and scale are arguments of AdditiveAngularMargin, hence
# the second level of nesting.
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
    loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
        margin: 0.2
        scale: 30

# Optimizer. !name: yields the Adam callable with lr and weight_decay
# pre-filled rather than an instance.
opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

# Cyclic learning-rate scheduler configured from base_lr, max_lr and
# step_size.
lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
    base_lr: !ref <base_lr>
    max_lr: !ref <max_lr>
    step_size: !ref <step_size>
# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# Error metric factory. `reduction` is an argument of classification_error
# (the metric), not of MetricStats, so it is nested one level deeper.
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

# Checkpointer writing to save_folder; `recoverables` maps checkpoint names
# to the objects defined at the top level of this file.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        normalizer: !ref <mean_var_norm>
        counter: !ref <epoch_counter>
# If you do not want to use the pretrained encoder you can simply delete the
# pretrained_encoder field.
# NOTE(review): the use_pretrained flag defined earlier in this file
# presumably gates the use of this object as well; confirm against the
# training script before deleting the field.
embedding_model_path: "speechbrain/PIQ-ESC50/embedding_model.ckpt"

# Pretrainer that collects files into save_folder. `loadables` names the
# object to load into and `paths` gives the source for the same name, so
# the two sub-mappings must share their keys.
pretrained_encoder: !new:speechbrain.utils.parameter_transfer.Pretrainer
    collect_in: !ref <save_folder>
    loadables:
        embedding_model: !ref <embedding_model>
    paths:
        embedding_model: !ref <embedding_model_path>