From c88bf2a43d4f3750bcdac3d42ca665281b316efa Mon Sep 17 00:00:00 2001 From: yaozengwei Date: Wed, 20 Jul 2022 18:55:22 +0800 Subject: [PATCH 1/6] add stats about duration and padding proportion --- .../ASR/pruned_transducer_stateless4/train.py | 8 ++++++ icefall/utils.py | 27 ++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py index 48c0e683df..66123e718a 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py @@ -603,6 +603,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() diff --git a/icefall/utils.py b/icefall/utils.py index b38574f0c0..b495d6b5a3 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -521,13 +521,20 @@ def __mul__(self, alpha: float) -> "MetricsTracker": return ans def __str__(self) -> str: - ans = "" + ans_frames = "" + ans_utterances = "" for k, v in self.norm_items(): norm_value = "%.4g" % v - ans += str(k) + "=" + str(norm_value) + ", " + if "utt_" not in k: + ans_frames += str(k) + "=" + str(norm_value) + ", " + else: + ans_utterances += str(k) + "=" + str(norm_value) + ", " frames = "%.2f" % self["frames"] - ans += "over " + str(frames) + " frames." - return ans + ans_frames += "over " + str(frames) + " frames; " + utterances = "%.2f" % self["utterances"] + ans_utterances += "over " + str(utterances) + " utterances." 
+ + return ans_frames + ans_utterances def norm_items(self) -> List[Tuple[str, float]]: """ @@ -535,11 +542,17 @@ def norm_items(self) -> List[Tuple[str, float]]: [('ctc_loss', 0.1), ('att_loss', 0.07)] """ num_frames = self["frames"] if "frames" in self else 1 + num_utterances = self["utterances"] if "utterances" in self else 1 ans = [] for k, v in self.items(): - if k != "frames": - norm_value = float(v) / num_frames - ans.append((k, norm_value)) + if k == "frames" or k == "utterances": + continue + norm_value = ( + float(v) / num_frames + if "utt_" not in k + else float(v) / num_utterances + ) + ans.append((k, norm_value)) return ans def reduce(self, device): From 2daf4fec4cba994119d8c227afd57f82418d84f8 Mon Sep 17 00:00:00 2001 From: yaozengwei Date: Wed, 20 Jul 2022 20:05:34 +0800 Subject: [PATCH 2/6] add for utt_duration --- icefall/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/icefall/utils.py b/icefall/utils.py index b495d6b5a3..0271dd37c3 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -528,7 +528,11 @@ def __str__(self) -> str: if "utt_" not in k: ans_frames += str(k) + "=" + str(norm_value) + ", " else: - ans_utterances += str(k) + "=" + str(norm_value) + ", " + ans_utterances += str(k) + "=" + str(norm_value) + if k == "utt_duration": + ans_utterances += " frames, " + else: + ans_utterances += ", " frames = "%.2f" % self["frames"] ans_frames += "over " + str(frames) + " frames; " utterances = "%.2f" % self["utterances"] From fdb8371c8cd90d7b03f4e4b54d37ee94919421bd Mon Sep 17 00:00:00 2001 From: yaozengwei Date: Sun, 24 Jul 2022 19:41:09 +0800 Subject: [PATCH 3/6] add stats for other recipes --- .../ASR/conv_emformer_transducer_stateless/train.py | 8 ++++++++ egs/librispeech/ASR/pruned_transducer_stateless/train.py | 8 ++++++++ egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 8 ++++++++ egs/librispeech/ASR/pruned_transducer_stateless3/train.py | 8 ++++++++ egs/librispeech/ASR/pruned_transducer_stateless5/train.py | 8 ++++++++ egs/librispeech/ASR/pruned_transducer_stateless6/train.py | 8 ++++++++ 6 files changed, 48 insertions(+) diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py index 106f3e5110..6c3e01fe8b 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py @@ -686,6 +686,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. 
info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py index 4484197599..6267938155 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py @@ -504,6 +504,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index 36ee7ca749..32e7cf5b49 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -573,6 +573,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py index 92eae78d1e..4f7ccf111f 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py @@ -612,6 +612,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py index e77eb19ff2..03754f65a2 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py @@ -644,6 +644,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. 
info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py index 315c01c8e8..136b20b6fb 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py @@ -657,6 +657,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() From 9576b2cb2c280bb2a059fed501c3758fdf7ae2ba Mon Sep 17 00:00:00 2001 From: yaozengwei Date: Sun, 24 Jul 2022 19:47:30 +0800 Subject: [PATCH 4/6] add stats for other 2 recipes --- .../ASR/conv_emformer_transducer_stateless2/train.py | 8 ++++++++ .../ASR/pruned_stateless_emformer_rnnt2/train.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py index dfe1b61362..4706074b18 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py @@ -686,6 +686,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() diff --git a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py index cd62787fa7..87fb71e1d5 100755 --- a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py +++ b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py @@ -603,6 +603,14 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) + info["utterances"] = feature.size(0) + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utt_duration"] = feature_lens.sum().item() + # padding proportion of each utterance + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + # Note: We use reduction=sum while computing the loss. 
info["loss"] = loss.detach().cpu().item() info["simple_loss"] = simple_loss.detach().cpu().item() From 0adfe6595d15fbbf80da8273db7a260909eb3364 Mon Sep 17 00:00:00 2001 From: yaozengwei Date: Sun, 24 Jul 2022 20:00:57 +0800 Subject: [PATCH 5/6] modify doc --- .../ASR/conv_emformer_transducer_stateless/train.py | 5 +++-- .../ASR/conv_emformer_transducer_stateless2/train.py | 5 +++-- egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py | 5 +++-- egs/librispeech/ASR/pruned_transducer_stateless/train.py | 5 +++-- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 5 +++-- egs/librispeech/ASR/pruned_transducer_stateless3/train.py | 5 +++-- egs/librispeech/ASR/pruned_transducer_stateless4/train.py | 5 +++-- egs/librispeech/ASR/pruned_transducer_stateless5/train.py | 5 +++-- egs/librispeech/ASR/pruned_transducer_stateless6/train.py | 5 +++-- 9 files changed, 27 insertions(+), 18 deletions(-) diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py index b30d5e443b..c07d8f76b0 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py @@ -686,10 +686,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py index 4706074b18..2bbc45d780 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py @@ -686,10 +686,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py index 87fb71e1d5..dd23309b39 100755 --- a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py +++ b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py @@ -603,10 +603,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git 
a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py index b558c93182..b625ed3ff2 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py @@ -559,10 +559,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index 4ffc15be8e..46d2cb86d4 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -627,10 +627,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py index e16279217b..371bf21d91 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py @@ -652,10 +652,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py index ca6cb462dd..893a6a749a 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py @@ -657,10 +657,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py index 8ccaba9092..8f20eedc95 100755 --- 
a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py @@ -644,10 +644,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py index fb9eacc842..596f8f7d93 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py @@ -661,10 +661,11 @@ def compute_loss( (feature_lens // params.subsampling_factor).sum().item() ) - info["utterances"] = feature.size(0) # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances info["utt_duration"] = feature_lens.sum().item() - # padding proportion of each utterance + # averaged padding proportion over utterances info["utt_pad_proportion"] = ( ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() ) From 691deaf730ebf8483c1c673adbdc814b111eed4d Mon Sep 17 00:00:00 2001 From: yaozengwei Date: Mon, 25 Jul 2022 16:31:05 +0800 Subject: [PATCH 6/6] minor change --- icefall/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/icefall/utils.py b/icefall/utils.py index b1ac331973..417ca17660 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -539,8 +539,10 @@ def __str__(self) -> str: ans_utterances += str(k) + "=" + str(norm_value) if k == "utt_duration": ans_utterances += " frames, " - else: + elif k == "utt_pad_proportion": ans_utterances += ", " + else: + raise ValueError(f"Unexpected key: {k}") frames = "%.2f" % self["frames"] ans_frames += "over " + str(frames) + " frames; " utterances = "%.2f" % self["utterances"]
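Note on the new statistics (a standalone sketch, not part of the patches above): `compute_loss` stores raw sums -- the utterance count, the summed number of valid input frames, and the summed padding fraction -- and `MetricsTracker.norm_items` later divides every key containing `utt_` by `utterances`. Storing sums rather than per-batch means lets the tracker accumulate values across batches (and across workers in `reduce`) before normalizing. The snippet below reproduces the computation on a made-up padded batch; the tensor shapes and values are purely illustrative.

    import torch

    # A padded feature batch of shape (N, T, C); shapes/values are illustrative only.
    feature = torch.zeros(3, 1000, 80)
    # Number of valid (un-padded) frames in each utterance.
    feature_lens = torch.tensor([1000, 800, 500])

    info = {}
    info["utterances"] = feature.size(0)
    # Summed valid frames; normalized by `utterances` it becomes the
    # average utterance duration in frames.
    info["utt_duration"] = feature_lens.sum().item()
    # Summed padding fraction; normalized by `utterances` it becomes the
    # average proportion of padded frames per utterance.
    info["utt_pad_proportion"] = (
        ((feature.size(1) - feature_lens) / feature.size(1)).sum().item()
    )

    # What MetricsTracker.norm_items does for keys containing "utt_":
    num_utterances = info["utterances"]
    print(info["utt_duration"] / num_utterances)        # ~766.67 frames per utterance
    print(info["utt_pad_proportion"] / num_utterances)  # ~0.23 of each utterance is padding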