kaldi-asr · danpovey · Mar 16, 2019 · Jun 4, 2018 · Jun 4, 2018 · Jun 4, 2018
diff --git a/egs/formosa/README.txt b/egs/formosa/README.txt
@@ -0,0 +1,22 @@
+### Welcome to the demo recipe of the Formosa Speech in the Wild (FSW) Project ###
+
+The language habits of Taiwanese people differ from those of other Mandarin speakers (in both accent and culture) [1]. In particular, Taiwanese people use traditional Chinese characters (i.e., 繁體中文). To address this issue, a Taiwanese speech corpus collection project, "Formosa Speech in the Wild (FSW)", was initiated in 2017 to improve the development of Taiwanese-specific speech recognition techniques.
+
+The FSW corpus will be a large-scale database of real-life, multi-genre Taiwanese spontaneous speech collected and transcribed from various sources (radio, TV, open courses, etc.). To demonstrate that this database is a reasonable data resource for Taiwanese spontaneous speech recognition research, a baseline recipe is provided here for everybody, especially students, to develop their own systems easily and quickly.
+
+This recipe is based on the "NER-Trs-Vol1" corpus (about 150 hours of broadcast radio speech selected from FSW). For more details, please visit:
+* Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw)
+
+If you want to apply for the NER-Trs-Vol1 corpus, please contact Yuan-Fu Liao (廖元甫) via "yfliao@mail.ntut.edu.tw". This corpus is only for non-commercial research/education use and will be distributed via our GitLab server at https://speech.nchc.org.tw.
+
+Bug reports, corrections, comments and suggestions are very welcome.
+
+Yuan-Fu Liao (廖元甫)
+Associate Professor
+Department of Electronic Engineering,
+National Taipei University of Technology
+http://www.ntut.edu.tw/~yfliao
+yfliao@mail.ntut.edu.tw
+
+............
+[1] The languages of Taiwan consist of several varieties of languages under families of the Austronesian languages and the Sino-Tibetan languages. Taiwanese Mandarin, Hokkien, Hakka and Formosan languages are used by 83.5%, 81.9%, 6.6% and 1.4% of the population respectively (2010). Given the prevalent use of Taiwanese Hokkien, the Mandarin spoken in Taiwan has been to a great extent influenced by it.
diff --git a/egs/formosa/s5/RESULTS b/egs/formosa/s5/RESULTS
@@ -0,0 +1,46 @@
+#
+# Reference results
+#
+# Experimental settings:
+#
+# training set:	show CS, BG, DA, QG, SR, SY and WK,	in total 18977 utt., 1,088,948 words
+# test set:	show JZ, GJ, KX and YX,			in total  2112 utt.,   135,972 words
+#
+# lexicon: 274,036 words
+# phones (IPA):  196 (tonal)
+#
+# tdnn:	6 layers * 850 Relu neurons
+#		Features: 43-dim MFCCs * 5 frames + 100-dim  ivector (with LDA)
+# chain:	6 layers * 625 Relu neurons
+#		Features: 43-dim MFCCs * 3 frames + 100-dim  ivector (with LDA)
+#
+
+#
+# WER:
+#
+
+%WER 61.32 [ 83373 / 135972, 5458 ins, 19156 del, 58759 sub ] exp/mono/decode_test/wer_11_0.0
+%WER 41.00 [ 55742 / 135972, 6725 ins, 12763 del, 36254 sub ] exp/tri1/decode_test/wer_15_0.0
+%WER 40.41 [ 54948 / 135972, 7366 ins, 11505 del, 36077 sub ] exp/tri2/decode_test/wer_14_0.0
+%WER 38.67 [ 52574 / 135972, 6855 ins, 11250 del, 34469 sub ] exp/tri3a/decode_test/wer_15_0.0
+%WER 35.70 [ 48546 / 135972, 7197 ins,  9717 del, 31632 sub ] exp/tri4a/decode_test/wer_17_0.0
+%WER 32.11 [ 43661 / 135972, 6112 ins, 10185 del, 27364 sub ] exp/tri5a/decode_test/wer_17_0.5
+%WER 31.36 [ 42639 / 135972, 6846 ins, 8860 del, 26933 sub ] exp/tri5a_cleaned/decode_test/wer_17_0.5
+%WER 24.43 [ 33218 / 135972, 5524 ins,  7583 del, 20111 sub ] exp/nnet3/tdnn_sp/decode_test/wer_12_0.0
+%WER 23.95 [ 32568 / 135972, 4457 ins, 10271 del, 17840 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0
+%WER 23.54 [ 32006 / 135972, 4717 ins, 8644 del, 18645 sub ] exp/chain/tdnn_1b_sp/decode_test/wer_10_0.0
+
+#
+# CER:
+#
+
+%WER 54.09 [ 116688 / 215718, 4747 ins, 24510 del, 87431 sub ] exp/mono/decode_test/cer_10_0.0
+%WER 32.61 [  70336 / 215718, 5866 ins, 16282 del, 48188 sub ] exp/tri1/decode_test/cer_13_0.0
+%WER 32.10 [  69238 / 215718, 6186 ins, 15772 del, 47280 sub ] exp/tri2/decode_test/cer_13_0.0
+%WER 30.40 [  65583 / 215718, 6729 ins, 13115 del, 45739 sub ] exp/tri3a/decode_test/cer_12_0.0
+%WER 27.53 [  59389 / 215718, 6311 ins, 13008 del, 40070 sub ] exp/tri4a/decode_test/cer_15_0.0
+%WER 24.21 [  52232 / 215718, 6425 ins, 11543 del, 34264 sub ] exp/tri5a/decode_test/cer_15_0.0
+%WER 23.41 [ 50492 / 215718, 6645 ins, 10997 del, 32850 sub ] exp/tri5a_cleaned/decode_test/cer_17_0.0
+%WER 17.07 [  36829 / 215718, 4734 ins,  9938 del, 22157 sub ] exp/nnet3/tdnn_sp/decode_test/cer_12_0.0
+%WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0
+%WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0
diff --git a/egs/formosa/s5/cmd.sh b/egs/formosa/s5/cmd.sh
@@ -0,0 +1,20 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="run.pl --mem 2G"
+export decode_cmd="run.pl --mem 4G"
+export mkgraph_cmd="run.pl --mem 8G"
+
+#export train_cmd="queue.pl --mem 2G"
+#export decode_cmd="queue.pl --mem 4G"
+#export mkgraph_cmd="queue.pl --mem 8G"
+
diff --git a/egs/formosa/s5/conf/decode.config b/egs/formosa/s5/conf/decode.config
@@ -0,0 +1,5 @@
+beam=11.0 # beam for decoding.  Was 13.0 in the scripts.
+first_beam=8.0 # beam for 1st-pass decoding in SAT.
+
+
+
diff --git a/egs/formosa/s5/conf/mfcc.conf b/egs/formosa/s5/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false   # only non-default option.
+--sample-frequency=16000
diff --git a/egs/formosa/s5/conf/mfcc_hires.conf b/egs/formosa/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=16000 # this recipe uses 16 kHz audio (same rate as conf/mfcc.conf and conf/pitch.conf)
+--num-mel-bins=40     # similar to Google's setup.
+--num-ceps=40     # there is no dimensionality reduction.
+--low-freq=40    # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to Nyquist of 8000 (=7800)
diff --git a/egs/formosa/s5/conf/online_cmvn.conf b/egs/formosa/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.
diff --git a/egs/formosa/s5/conf/pitch.conf b/egs/formosa/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/formosa/s5/eval.sh b/egs/formosa/s5/eval.sh
@@ -0,0 +1,145 @@
+#!/bin/bash
+#
+# Copyright 2018, Yuan-Fu Liao, National Taipei University of Technology, yfliao@mail.ntut.edu.tw
+#
+# Before you run this recipe, please apply for and download the corpus, then place it (or a symbolic link to it) under this folder (folder name: "NER-Trs-Vol1-Eval").
+# For more detail, please check:
+# 1. Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw/home/corpus)
+# 2. Formosa Speech Recognition Challenge (FSW) 2018 (https://sites.google.com/speech.ntut.edu.tw/fsw/home/challenge)
+stage=-2
+train_stage=-10
+num_jobs=20
+
+# shell options
+set -e -o pipefail
+
+. ./cmd.sh
+. ./utils/parse_options.sh
+
+# configure number of jobs running in parallel, you should adjust these numbers according to your machines
+# data preparation
+if [ "$stage" -le -2 ]; then
+  # Stage -2: run the evaluation-set preparation script.
+  echo "$0: Data Preparation"
+  local/prepare_eval_data.sh || exit 1
+fi
+
+# Now make MFCC plus pitch features.
+# mfccdir should be some place with a largish disk where you
+# want to store MFCC features.
+mfccdir=mfcc
+
+# mfcc
+if [ $stage -le -1 ]; then
+
+  echo "$0: making mfccs"
+  for x in eval; do
+    steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
+    utils/fix_data_dir.sh data/$x || exit 1;
+  done
+
+fi
+
+# mono
+if [ $stage -le 0 ]; then
+
+  # Monophone decoding
+  (
+  steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
+    exp/mono/graph data/eval exp/mono/decode_eval
+  )
+
+fi
+
+# tri1
+if [ $stage -le 1 ]; then
+
+  # decode tri1
+  (
+  steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
+    exp/tri1/graph data/eval exp/tri1/decode_eval
+  )
+
+fi
+
+# tri2
+if [ $stage -le 2 ]; then
+
+  # decode tri2
+  (
+  steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \
+    exp/tri2/graph data/eval exp/tri2/decode_eval
+  )
+
+fi
+
+# tri3a
+if [ $stage -le 3 ]; then
+
+  # decode tri3a
+  (
+  steps/decode.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
+    exp/tri3a/graph data/eval exp/tri3a/decode_eval
+  )
+
+fi
+
+# tri4
+if [ $stage -le 4 ]; then
+
+  # decode tri4a
+  (
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
+    exp/tri4a/graph data/eval exp/tri4a/decode_eval
+  )
+
+fi
+
+# tri5
+if [ $stage -le 5 ]; then
+
+  # decode tri5
+  (
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \
+     exp/tri5a/graph data/eval exp/tri5a/decode_eval || exit 1;
+  )
+
+fi
+
+# nnet3 tdnn models
+# commented out by default, since the chain model is usually faster and better
+if [ "$stage" -le 6 ]; then
+
+  # An "if" body must contain at least one command. With only the
+  # commented-out lines below, bash stops with a syntax error at "fi" and the
+  # later stages never run. The ":" no-op keeps this stage parseable while
+  # the nnet3 evaluation stays disabled.
+  :
+#  train_stage=99
+#  echo "$0: evaluate nnet3 model"
+#  local/nnet3/run_tdnn.sh --stage $train_stage
+
+fi
+
+# chain model
+if [ "$stage" -le 7 ]; then
+  # train_stage is forced to 99 here, whatever value it had before, and is
+  # handed to the chain script as --stage.
+  # NOTE(review): presumably 99 lies past the training stages so that only
+  # decoding runs -- confirm in local/chain/run_tdnn.sh.
+  train_stage=99
+  echo "$0: evaluate chain model"
+  local/chain/run_tdnn.sh --stage "$train_stage"
+fi
+
+# getting results (see RESULTS file)
+if [ "$stage" -le 10 ]; then
+
+  echo "$0: extract the results"
+  results=eval-decoding-results.log
+  # Start from an empty results file.
+  rm -f "$results"
+  touch "$results"
+  # Decode log dirs one level below exp/ come first, then those two levels
+  # below (e.g. exp/chain/<model>/decode_eval/log).
+  for x in exp/*/decode_eval/log exp/*/*/decode_eval/log; do
+    # A glob that matches nothing is left as the literal pattern; skip it.
+    [ -d "$x" ] || continue
+    # Collect the lines containing "NER", dropping LOG and WARNING lines.
+    # grep exits 1 when nothing matches; with "set -e -o pipefail" that would
+    # abort the script here, so an empty match is tolerated explicitly.
+    { grep NER "$x"/*.log | grep -v LOG | grep -v WARNING || true; } >> "$results"
+  done
+
+fi
+
+# finish
+echo "$0: all done"
+
+exit 0;
diff --git a/egs/formosa/s5/local/chain/run_tdnn.sh b/egs/formosa/s5/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1b.sh
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,5 @@
		beam=11.0 # beam for decoding. Was 13.0 in the scripts.
		first_beam=8.0 # beam for 1st-pass decoding in SAT.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		--use-energy=false # only non-default option.
		--sample-frequency=16000
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster.