Moved Idlak examples to their own directory #3

Merged
merged 5 commits on Apr 4, 2018
29 changes: 28 additions & 1 deletion .gitignore
@@ -72,11 +72,28 @@ GSYMS
/src/kaldi.mk.tmp
/src/kaldi.mk.bak

# /egs/
# /egs/ & /idlak-egs/
/egs/*/s*/mfcc
/egs/*/s*/plp
/egs/*/s*/exp
/egs/*/s*/data
/idlak-egs/*/s*/mfcc
/idlak-egs/*/s*/plp
/idlak-egs/*/s*/exp
/idlak-egs/*/s*/data
/idlak-egs/*/s*/rawaudio
/idlak-egs/*/s*/labels
/idlak-egs/*/s*/exp-align
/idlak-egs/*/s*/f0data
/idlak-egs/*/s*/durdata
/idlak-egs/*/s*/exp_dnn
/idlak-egs/*/s*/lbldata
/idlak-egs/*/s*/lbldurdata
/idlak-egs/*/s*/lblf0data
/idlak-egs/*/s*/slt_pmdl
/idlak-egs/*/s*/bdl_pmdl
/idlak-egs/*/s*/jmk_pmdl
/idlak-egs/tts_tangle_lj/s2/LJ_pmdl/

# /tools/
/tools/ATLAS/
@@ -140,4 +157,14 @@ GSYMS
/tools/phonetisaurus-g2p/
/tools/mmseg-1.3.0.tar.gz
/tools/mmseg-1.3.0/
/tools/SPTK-3.9.tar.gz
/tools/SPTK-3.9/
/tools/SPTK/
/tools/expat-2.1.0.tar.gz
/tools/expat-2.1.0/
/tools/pcre-8.20.tar.bz2
/tools/pcre-8.20/
/tools/pugixml-1.2.tar.gz
/tools/pugixml-1.2/

/kaldiwin_vs*
13 changes: 13 additions & 0 deletions egs/aishell/README.txt
@@ -0,0 +1,13 @@
Aishell is an open Mandarin Chinese speech database published by Beijing Shell Shell Technology Co., Ltd.

400 speakers from different accent areas in China were invited to take part in the recording, which was conducted in a quiet indoor environment using a high-fidelity microphone, with the audio downsampled to 16 kHz. Through professional speech annotation and strict quality inspection, the manual transcription accuracy is above 95%. The data is free for academic use. The corpus contains 170 hours of speech and is divided into training (85%), development (10%) and test (5%) sets. The development set is used to tune the hyperparameters during training.

The database can be downloaded from openslr:
http://www.openslr.org/33/

This folder contains two subfolders:
s5: a speech recognition recipe
v1: a speaker recognition recipe

For more details, please visit:
http://www.aishelltech.com/kysjcp
8 changes: 8 additions & 0 deletions egs/aishell/s5/RESULTS
@@ -0,0 +1,8 @@
%WER 33.82 [ 35432 / 104765, 743 ins, 3991 del, 30698 sub ] exp/mono/decode_test/cer_12_0.0
%WER 19.39 [ 20310 / 104765, 903 ins, 1452 del, 17955 sub ] exp/tri1/decode_test/cer_13_0.5
%WER 19.23 [ 20147 / 104765, 910 ins, 1287 del, 17950 sub ] exp/tri2/decode_test/cer_14_0.5
%WER 17.14 [ 17961 / 104765, 812 ins, 1024 del, 16125 sub ] exp/tri3a/decode_test/cer_14_0.0
%WER 13.64 [ 14294 / 104765, 669 ins, 736 del, 12889 sub ] exp/tri4a/decode_test/cer_14_0.5
%WER 12.23 [ 12809 / 104765, 656 ins, 580 del, 11573 sub ] exp/tri5a/decode_test/cer_13_1.0
%WER 8.45 [ 8849 / 104765, 312 ins, 538 del, 7999 sub ] exp/nnet3/tdnn_sp/decode_test/cer_13_1.0
%WER 7.46 [ 7813 / 104765, 287 ins, 472 del, 7054 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_1.0
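(A reading note, stated as an assumption about this recipe's scoring setup: the %WER prefix is simply the output format of Kaldi's compute-wer, but the cer_* paths indicate these are character error rates, and a suffix such as cer_13_1.0 encodes the LM weight (13) and word-insertion penalty (1.0) selected on the scoring grid.)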
15 changes: 15 additions & 0 deletions egs/aishell/s5/cmd.sh
@@ -0,0 +1,15 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export train_cmd="queue.pl --mem 2G"
export decode_cmd="queue.pl --mem 4G"
export mkgraph_cmd="queue.pl --mem 8G"
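If you have no grid at all, a minimal local-machine sketch (an assumption, not part of this recipe: run.pl simply runs jobs on the current host, so keep the number of parallel jobs small):

export train_cmd=run.pl
export decode_cmd=run.pl
export mkgraph_cmd=run.pl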
5 changes: 5 additions & 0 deletions egs/aishell/s5/conf/decode.config
@@ -0,0 +1,5 @@
beam=11.0 # beam for decoding. Was 13.0 in the scripts.
first_beam=8.0 # beam for 1st-pass decoding in SAT.



2 changes: 2 additions & 0 deletions egs/aishell/s5/conf/mfcc.conf
@@ -0,0 +1,2 @@
--use-energy=false # only non-default option.
--sample-frequency=16000
10 changes: 10 additions & 0 deletions egs/aishell/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=16000 # AISHELL is sampled at 16kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to the Nyquist of 8000 (=7800)
1 change: 1 addition & 0 deletions egs/aishell/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.
1 change: 1 addition & 0 deletions egs/aishell/s5/conf/pitch.conf
@@ -0,0 +1 @@
--sample-frequency=16000
68 changes: 68 additions & 0 deletions egs/aishell/s5/local/aishell_data_prep.sh
@@ -0,0 +1,68 @@
#!/bin/bash

# Copyright 2017 Xingyu Na
# Apache 2.0

. ./path.sh || exit 1;

if [ $# != 2 ]; then
  echo "Usage: $0 <audio-path> <text-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript"
  exit 1;
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt

train_dir=data/local/train
dev_dir=data/local/dev
test_dir=data/local/test
tmp_dir=data/local/tmp

mkdir -p $train_dir
mkdir -p $dev_dir
mkdir -p $test_dir
mkdir -p $tmp_dir

# data directory check
if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
  echo "Error: $0 requires an audio directory and a transcript file"
  exit 1;
fi

# find wav audio files for the train, dev and test sets, respectively.
find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
n=`cat $tmp_dir/wav.flist | wc -l`
[ $n -ne 141925 ] && \
  echo "Warning: expected 141925 data files, found $n"

grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;

rm -r $tmp_dir

# Prepare transcriptions for each set
for dir in $train_dir $dev_dir $test_dir; do
  echo Preparing $dir transcriptions
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all
  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
  utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk
  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
  sort -u $dir/transcripts.txt > $dir/text
  utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt
done

mkdir -p data/train data/dev data/test

for f in spk2utt utt2spk wav.scp text; do
  cp $train_dir/$f data/train/$f || exit 1;
  cp $dev_dir/$f data/dev/$f || exit 1;
  cp $test_dir/$f data/test/$f || exit 1;
done

echo "$0: AISHELL data preparation succeeded"
exit 0;
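For orientation, a hypothetical view of the index files this script produces, assuming the usual AISHELL layout in which utterance BAC009S0002W0122 belongs to speaker S0002 (the wav path reuses the example path from the usage message above):

head -n 1 data/train/wav.scp data/train/utt2spk data/train/spk2utt
# wav.scp : BAC009S0002W0122 /export/a05/xna/data/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
# utt2spk : BAC009S0002W0122 S0002
# spk2utt : S0002 BAC009S0002W0122 BAC009S0002W0124 ...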
48 changes: 48 additions & 0 deletions egs/aishell/s5/local/aishell_prepare_dict.sh
@@ -0,0 +1,48 @@
#!/bin/bash

# Copyright 2017 Xingyu Na
# Apache 2.0

# prepare dict resources

. ./path.sh

[ $# != 1 ] && echo "Usage: $0 <resource-path>" && exit 1;

res_dir=$1
dict_dir=data/local/dict
mkdir -p $dict_dir
cp $res_dir/lexicon.txt $dict_dir

cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \
  sort -u | \
  perl -e '
    my %ph_cl;
    while (<STDIN>) {
      chomp;
      $phone = $_;
      next if ($phone eq "sil");
      if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_); }
      else { $ph_cl{$phone} = [$_]; }
    }
    foreach $key ( keys %ph_cl ) {
      print "@{ $ph_cl{$key} }\n";
    }
  ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;

echo sil > $dict_dir/silence_phones.txt

echo sil > $dict_dir/optional_silence.txt

# No "extra questions" in the input to this setup, as we don't
# have stress or tone

cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
  $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
  >> $dict_dir/extra_questions.txt || exit 1;

echo "$0: AISHELL dict preparation succeeded"
exit 0;
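A hypothetical sketch of the two outputs, assuming lexicon.txt uses tone-numbered pinyin phones (the phone names below are illustrative, not taken from the actual lexicon):

# nonsilence_phones.txt -- one phone per line:
#   a1
#   a2
#   ii
# extra_questions.txt -- the silence phones on one line, then one line per
# tone digit grouping all nonsilence phones that share it:
#   sil
#   a1 e1 i1 ...
#   a2 e2 i2 ...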
88 changes: 88 additions & 0 deletions egs/aishell/s5/local/aishell_train_lms.sh
@@ -0,0 +1,88 @@
#!/bin/bash


# To be run from one directory above this script.
. ./path.sh

text=data/local/train/text
lexicon=data/local/dict/lexicon.txt

for f in "$text" "$lexicon"; do
  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done

# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/train/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir

kaldi_lm=`which train_lm.sh`
if [ -z "$kaldi_lm" ]; then
  echo "$0: train_lm.sh is not found. That might mean it is not installed,"
  echo "$0: or that it is not on your PATH."
  echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it."
  exit 1
fi

cleantext=$dir/text.no_oov

cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;

cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
  sort -nr > $dir/word.counts || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;

# note: we probably won't really make use of <SPOKEN_NOISE> as there aren't any OOVs
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<SPOKEN_NOISE>" > $dir/word_map \
  || exit 1;

# note: ignore 1st field of train.txt, it's the utterance-id.
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
  { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
  || exit 1;

train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;

# LM is small enough that we don't need to prune it (only about 0.7M N-grams).
# Perplexity over 128254.000000 words is 90.446690

# note: output is
# data/local/lm/3gram-mincount/lm_unpruned.gz

exit 0


# From here on are some commands to build a baseline with SRILM (assuming
# you have it installed).
heldout_sent=10000 # Don't change this if you want results to be comparable
                   # with the kaldi_lm results.
sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
mkdir -p $sdir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $sdir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  tail -n +$heldout_sent > $sdir/train

cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist


ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
  -map-unk "<SPOKEN_NOISE>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482

# Note: the perplexity SRILM gives to the kaldi_lm model is the same as kaldi_lm
# reports above. The difference seen in WSJ must have been due to different
# treatment of <SPOKEN_NOISE>.
ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout
# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379
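To make the <SPOKEN_NOISE> mapping near the top of this script concrete, a hypothetical before/after (the utterance id and words are invented): any token absent from lexicon.txt is rewritten, including the utterance id itself, which is why the later awk steps skip field 1.

# line in data/local/train/text : BAC009S0002W0122 广州 房地产 somenonword
# same line in $cleantext       : <SPOKEN_NOISE> 广州 房地产 <SPOKEN_NOISE>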
1 change: 1 addition & 0 deletions egs/aishell/s5/local/chain/run_tdnn.sh