
[Ready] [Recipes] add aishell2 #465

Merged · 14 commits · Jul 14, 2022
2 changes: 1 addition & 1 deletion egs/aishell2/ASR/local/compute_fbank_aishell2.py
@@ -17,7 +17,7 @@


"""
This file computes fbank features of the aishell dataset.
This file computes fbank features of the aishell2 dataset.
It looks for manifests in the directory data/manifests.

The generated fbank features are saved in data/fbank.
168 changes: 10 additions & 158 deletions egs/aishell2/ASR/local/display_manifest_statistics.py
@@ -29,165 +29,17 @@


def main():
    # path = "./data/fbank/aishell2_cuts_train.jsonl.gz"
    # path = "./data/fbank/aishell2_cuts_test.jsonl.gz"
    path = "./data/fbank/aishell2_cuts_dev.jsonl.gz"
    paths = [
        "./data/fbank/aishell2_cuts_train.jsonl.gz",
        "./data/fbank/aishell2_cuts_dev.jsonl.gz",
        "./data/fbank/aishell2_cuts_test.jsonl.gz",
    ]

    cuts = load_manifest_lazy(path)
    cuts.describe()
    for path in paths:
        print(f"Displaying statistics for {path}")
        cuts = load_manifest_lazy(path)
        cuts.describe()


if __name__ == "__main__":
    main()

"""
## train (after speed perturb)
Cuts count: 360294
Total duration (hours): 455.6
Speech duration (hours): 455.6 (100.0%)
***
Duration statistics (seconds):
mean 4.6
std 1.4
min 1.1
0.1% 1.8
0.5% 2.2
1% 2.3
5% 2.7
10% 3.0
25% 3.5
50% 4.3
75% 5.4
90% 6.5
95% 7.2
99% 8.8
99.5% 9.4
99.9% 10.9
max 16.1

## test
Cuts count: 7176
Total duration (hours): 10.0
Speech duration (hours): 10.0 (100.0%)
***
Duration statistics (seconds):
mean 5.0
std 1.6
min 1.9
0.1% 2.2
0.5% 2.4
1% 2.6
5% 3.0
10% 3.2
25% 3.8
50% 4.7
75% 5.9
90% 7.3
95% 8.2
99% 9.9
99.5% 10.7
99.9% 11.9
max 14.7

## dev
Cuts count: 14326
Total duration (hours): 18.1
Speech duration (hours): 18.1 (100.0%)
***
Duration statistics (seconds):
mean 4.5
std 1.3
min 1.6
0.1% 2.1
0.5% 2.3
1% 2.4
5% 2.9
10% 3.1
25% 3.5
50% 4.3
75% 5.4
90% 6.4
95% 7.0
99% 8.4
99.5% 8.9
99.9% 10.3
max 12.5

## aidatatang_200zh (train)
Cuts count: 164905
Total duration (hours): 139.9
Speech duration (hours): 139.9 (100.0%)
***
Duration statistics (seconds):
mean 3.1
std 1.1
min 1.1
0.1% 1.5
0.5% 1.7
1% 1.8
5% 2.0
10% 2.1
25% 2.3
50% 2.7
75% 3.4
90% 4.6
95% 5.4
99% 7.1
99.5% 7.8
99.9% 9.1
max 16.3

## aidatatang_200zh (test)
Cuts count: 48144
Total duration (hours): 40.2
Speech duration (hours): 40.2 (100.0%)
***
Duration statistics (seconds):
mean 3.0
std 1.1
min 0.9
0.1% 1.5
0.5% 1.8
1% 1.8
5% 2.0
10% 2.1
25% 2.3
50% 2.6
75% 3.4
90% 4.4
95% 5.2
99% 6.9
99.5% 7.5
99.9% 9.0
max 21.8

## aidatatang_200zh (dev)
Cuts count: 24216
Total duration (hours): 20.2
Speech duration (hours): 20.2 (100.0%)
***
Duration statistics (seconds):
mean 3.0
std 1.0
min 1.2
0.1% 1.6
0.5% 1.7
1% 1.8
5% 2.0
10% 2.1
25% 2.3
50% 2.7
75% 3.4
90% 4.4
95% 5.1
99% 6.7
99.5% 7.3
99.9% 8.8
max 11.3
"""
main()
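The loop in this file relies on lhotse's `load_manifest_lazy` and `CutSet.describe`. For readers without lhotse installed, a rough stdlib-only sketch of the same idea follows. The path layout and the per-line `"duration"` field are assumptions based on the lhotse cut-manifest schema; treat this as an approximation of `describe()`, not the recipe's actual code.

```python
import gzip
import json
import statistics


def describe_durations(path):
    # Each line of a lhotse cut manifest is assumed to be a JSON object
    # with a "duration" field in seconds.
    with gzip.open(path, "rt", encoding="utf-8") as f:
        durations = [json.loads(line)["duration"] for line in f if line.strip()]
    count = len(durations)
    hours = sum(durations) / 3600
    mean = statistics.mean(durations)
    print(f"Cuts count: {count}")
    print(f"Total duration (hours): {hours:.1f}")
    print(
        f"Duration mean/min/max (seconds): "
        f"{mean:.1f}/{min(durations):.1f}/{max(durations):.1f}"
    )
    return count, hours, mean
```

Calling this on each of the three manifest paths reproduces the count and duration figures in the statistics above, without lhotse's percentile breakdown.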
8 changes: 4 additions & 4 deletions egs/aishell2/ASR/prepare.sh
@@ -3,8 +3,8 @@
set -eou pipefail

nj=30
stage=3
stop_stage=3
stage=0
stop_stage=5
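The hunk above widens the default stage range from 3..3 to 0..5. Every stage in prepare.sh is guarded by the same two-sided test, so stage N runs only when `stage <= N <= stop_stage`. A minimal self-contained sketch of that gating pattern (the stage numbers and labels here are illustrative, not the recipe's actual stages):

```shell
#!/usr/bin/env bash
stage=0
stop_stage=5

# Run a stage only if it falls inside [stage, stop_stage].
run_stage() {
  local n=$1
  shift
  if [ $stage -le $n ] && [ $stop_stage -ge $n ]; then
    echo "Stage $n: $*"
  fi
}

run_stage 0 "download data"
run_stage 1 "prepare manifests"
run_stage 5 "prepare char-based lang dir"
```

With `stage=3` and `stop_stage=3` (the old defaults), only stage 3 would fire; the new defaults run the whole pipeline.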

# We assume dl_dir (download dir) contains the following
# directories and files. If not, you need to apply aishell2 through
@@ -117,12 +117,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
mkdir -p $lang_char_dir

# Prepare text.
grep "\"text\":" data/manifests/aishell2_supervisions_train.json \
grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \
| sed -e 's/["text:\t ]*//g' | sed 's/,//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text

# Prepare words.txt
grep "\"text\":" data/manifests/aishell2_supervisions_train.json \
grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \
| sed -e 's/["text:\t]*//g' | sed 's/,//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text_words
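Worth noting when reading this hunk: grep operates on plain text, so matching inside a gzip-compressed `.jsonl.gz` manifest typically requires decompressing first (e.g. `gunzip -c file.jsonl.gz | grep ...`). A stdlib Python sketch of the same extraction, assuming lhotse-style supervision lines with a `"text"` field (the field name and manifest layout are assumptions based on the lhotse schema, not the recipe's actual code):

```python
import gzip
import json


def extract_texts(path):
    # Collect the "text" field from every line of a gzipped JSONL
    # supervision manifest.
    texts = []
    with gzip.open(path, "rt", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                texts.append(json.loads(line)["text"])
    return texts
```

Parsing the JSON directly also sidesteps the fragile `sed` field-stripping used above, which depends on the exact key/value formatting of the manifest.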
