add scripts for bpe nbest #27

Open · wants to merge 1 commit into master
190 changes: 190 additions & 0 deletions egs/swbd/s5c/local/rnnlm/train_bpe_nbest.sh
@@ -0,0 +1,190 @@
#!/bin/bash

# Copyright 2012  Johns Hopkins University (author: Daniel Povey)
#           2015  Guoguo Chen
#           2017  Hainan Xu
#           2017  Xiaohui Zhang
#           2018  Dongji Gao

# This script trains a subword-level (BPE) RNNLM on the swbd LM-training data
# and uses it for n-best rescoring.

# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration.
# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.9 / 50.0.
# Train objf: -5.07 -4.43 -4.25 -4.17 -4.12 -4.07 -4.04 -4.01 -3.99 -3.98 -3.96 -3.94 -3.92 -3.90 -3.88 -3.87 -3.86 -3.85 -3.84 -3.83 -3.82 -3.81 -3.80 -3.79 -3.78 -3.78 -3.77 -3.77 -3.76 -3.75 -3.74 -3.73 -3.73 -3.72 -3.71
# Dev objf: -10.32 -4.68 -4.43 -4.31 -4.24 -4.19 -4.15 -4.13 -4.10 -4.09 -4.05 -4.03 -4.02 -4.00 -3.99 -3.98 -3.98 -3.97 -3.96 -3.96 -3.95 -3.94 -3.94 -3.94 -3.93 -3.93 -3.93 -3.92 -3.92 -3.92 -3.92 -3.91 -3.91 -3.91 -3.91

# %WER 11.1 | 1831 21395 | 89.9 6.4 3.7 1.0 11.1 46.3 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped/score_13_0.0/eval2000_hires.ctm.swbd.filt.sys
# %WER 9.9 | 1831 21395 | 91.0 5.8 3.2 0.9 9.9 43.2 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys
# %WER 9.9 | 1831 21395 | 91.0 5.8 3.2 0.9 9.9 42.9 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e_nbest/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys

# %WER 15.9 | 4459 42989 | 85.7 9.7 4.6 1.6 15.9 51.6 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped/score_10_0.0/eval2000_hires.ctm.filt.sys
# %WER 14.4 | 4459 42989 | 87.0 8.7 4.3 1.5 14.4 49.4 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e/score_11_0.0/eval2000_hires.ctm.filt.sys
# %WER 14.4 | 4459 42989 | 87.1 8.7 4.2 1.5 14.4 49.0 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e_nbest/score_10_0.0/eval2000_hires.ctm.filt.sys

# Begin configuration section.

dir=exp/rnnlm_bpe_nbest
embedding_dim=1024
lstm_rpd=256
lstm_nrpd=256
epochs=10
stage=-10
train_stage=-10

# variables for lattice rescoring
run_lat_rescore=false
run_nbest_rescore=true
run_backward_rnnlm=false

ac_model_dir=exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp
decode_dir_suffix=bpe
ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
              # if it's set, it merges histories in the lattice if they share
              # the same ngram history and this prevents the lattice from
              # exploding exponentially
pruned_rescore=true

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

text=data/train_nodev/text
fisher_text=data/local/lm/fisher/text1.gz
lexicon=data/local/dict_nosp/lexiconp.txt # note: the lexicon does not seem to be used in this script (Dongji)
text_dir=data/rnnlm/text_bpe_nbest
wordlist=data/lang_nosp/words_sub.txt

# parameters for byte pair encoding
num_operations=1500
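# num_operations is the number of BPE merge operations learn_bpe.py performs;
# more merges produce longer subword units and a larger subword vocabulary
# (1500 keeps the subword vocabulary small; not tuned here).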

mkdir -p $dir/config
set -e

for f in $text; do
  [ ! -f $f ] && \
    echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1
done

if [ $stage -le 0 ]; then
  mkdir -p $text_dir
  echo -n >$text_dir/dev.txt
  # hold out one in every 50 lines as dev data.
  cat $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%50 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt

  # apply byte pair encoding
  utils/lang/bpe/learn_bpe.py -s $num_operations < ${text_dir}/swbd.txt > ${dir}/pair_code.txt
  utils/lang/bpe/apply_bpe.py -c ${dir}/pair_code.txt < ${text_dir}/swbd.txt > ${text_dir}/swbd_sub.txt
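  # Illustrative sketch (actual splits depend on the learned pair_code.txt):
  # apply_bpe.py segments rare words into subword units joined by the "@@ "
  # separator, e.g. "telephone" -> "tele@@ phone", while frequent words are
  # kept whole.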

  # get vocabulary (wordlist)
  rnnlm/get_wordlist.py ${text_dir}/swbd_sub.txt $wordlist

  # create a marker file so that scripts/rnnlm/compute_sentence_scores.sh
  # (modified in this PR) runs in BPE mode
  echo -n >$dir/bpe.mode

  rm ${text_dir}/swbd.txt
  # cat > $dir/config/hesitation_mapping.txt <<EOF
  #hmm hum
  #mmm um
  #mm um
  #mhm um-hum
  #EOF
  # gunzip -c $fisher_text | awk 'NR==FNR{a[$1]=$2;next}{for (n=1;n<=NF;n++) if ($n in a) $n=a[$n];print $0}' \
  #   $dir/config/hesitation_mapping.txt - > $text_dir/fisher.txt
fi

if [ $stage -le 1 ]; then
  cp $wordlist $dir/config/words.txt
  n=$(cat $dir/config/words.txt | wc -l)
  echo "<brk> $n" >> $dir/config/words.txt

  # words that appear in the training or dev data but are not present in
  # words.txt will be mapped to <unk> during training.
  echo "<unk>" >$dir/config/oov.txt

  cat > $dir/config/data_weights.txt <<EOF
swbd_sub 1 1.0
EOF

  rnnlm/get_unigram_probs.py --vocab-file=$dir/config/words.txt \
                             --unk-word="<unk>" \
                             --data-weights-file=$dir/config/data_weights.txt \
                             $text_dir | awk 'NF==2' >$dir/config/unigram_probs.txt
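  # (awk 'NF==2' keeps only well-formed two-field "word prob" lines)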

  # choose features
  rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \
                           --use-constant-feature=true \
                           --special-words='<s>,</s>,<brk>,<unk>,[noise],[laughter],[vocalized-noise]' \
                           $dir/config/words.txt > $dir/config/features.txt

  # cat >$dir/config/xconfig <<EOF
  #input dim=$embedding_dim name=input
  #relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
  #fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
  #relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
  #fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
  #relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
  #output-layer name=output include-log-softmax=false dim=$embedding_dim
  #EOF

  cat >$dir/config/xconfig <<EOF
input dim=$embedding_dim name=input
lstm-layer name=lstm1 cell-dim=$embedding_dim
relu-renorm-layer name=tdnn dim=$embedding_dim input=Append(0, IfDefined(-1))
lstm-layer name=lstm2 cell-dim=$embedding_dim
output-layer name=output include-log-softmax=false dim=$embedding_dim
EOF
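  # My reading of the xconfig above: a 2-layer unprojected LSTM with a spliced
  # relu-renorm layer in between; embedding, cell, and output dims all equal
  # $embedding_dim (1024), so lstm_rpd/lstm_nrpd are only used by the
  # commented-out projected-LSTM config.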
  rnnlm/validate_config_dir.sh $text_dir $dir/config
fi

if [ $stage -le 2 ]; then
  rnnlm/prepare_rnnlm_dir.sh $text_dir $dir/config $dir
fi

if [ $stage -le 3 ]; then
  rnnlm/train_rnnlm.sh --stage $train_stage --num-epochs $epochs --cmd "$train_cmd" $dir
fi

LM=sw1_fsh_fg # using the 4-gram const arpa file as old lm
if [ $stage -le 4 ] && $run_lat_rescore; then
  echo "$0: Perform lattice-rescoring on $ac_model_dir"
  # LM=sw1_tg # if using the original 3-gram G.fst as old lm
  pruned=
  if $pruned_rescore; then
    pruned=_pruned
  fi
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_${LM}_looped

    # Lattice rescoring
    rnnlm/lmrescore$pruned.sh \
      --cmd "$decode_cmd --mem 4G" \
      --weight 0.45 --max-ngram-order $ngram_order \
      data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}_0.45
  done
fi

if [ $stage -le 5 ] && $run_nbest_rescore; then
  echo "$0: Perform n-best rescoring with the BPE RNNLM on $ac_model_dir"
  for decode_set in eval2000; do
    decode_dir=${ac_model_dir}/decode_${decode_set}_${LM}_looped

    # nbest rescoring
    rnnlm/lmrescore_nbest.sh \
      --cmd "$decode_cmd --mem 4G" --N 20 \
      0.8 data/lang_$LM $dir \
      data/${decode_set}_hires ${decode_dir} \
      ${decode_dir}_${decode_dir_suffix}_nbest
  done
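  # The positional 0.8 above is the RNNLM interpolation weight expected by
  # rnnlm/lmrescore_nbest.sh, and --N 20 limits rescoring to the 20 best
  # hypotheses per utterance (my reading of that script's interface).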

  [ -f $dir/bpe.mode ] && rm $dir/bpe.mode && echo "clear bpe.mode"
fi

# Running a backward RNNLM further improves WERs by combining it with the
# forward RNNLM trained in this script.
if [ $stage -le 6 ] && $run_backward_rnnlm; then
  local/rnnlm/run_tdnn_lstm_back.sh
fi

exit 0
10 changes: 9 additions & 1 deletion scripts/rnnlm/compute_sentence_scores.sh
@@ -25,6 +25,7 @@ tempdir=$2
text_in=$3
scores_out=$4


if [ -f $dir/word_embedding.final.mat ]; then
  word_embedding=$dir/word_embedding.final.mat
else
@@ -41,7 +42,14 @@ for x in final.raw config/words.txt; do
done

mkdir -p $tempdir
cat $text_in | sym2int.pl -f 2- $dir/config/words.txt > $tempdir/text.int

if [ -f $dir/bpe.mode ]; then
  cat $text_in | cut -d ' ' -f2- > ${text_in}.temp
  utils/lang/bpe/apply_bpe.py -c $dir/pair_code.txt < ${text_in}.temp > ${text_in}.sub
  cat ${text_in}.sub | sym2int.pl $dir/config/words.txt > $tempdir/text.int
else
  cat $text_in | sym2int.pl -f 2- $dir/config/words.txt > $tempdir/text.int
fi
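# Note on the BPE branch above: cut -d ' ' -f2- strips the utterance ids before
# BPE is applied, so sym2int.pl runs without -f 2- and text.int holds bare
# (id-less) word sequences; the caller is assumed to match scores by line order.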

special_symbol_opts=$(cat $dir/special_symbol_opts.txt)

40 changes: 40 additions & 0 deletions scripts/rnnlm/get_wordlist.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python

import argparse

parser = argparse.ArgumentParser(description="Get the wordlist (vocabulary) for the BPE scripts",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("input_file", type=str, help="input text file")
parser.add_argument("wordlist", type=str, help="output wordlist (vocabulary)")

args = parser.parse_args()

def get_wordlist(text, wordlist):
    word_set = set()
    index = 0

    # special symbols placed at the start and end of the wordlist
    special_symbol_1 = ["<eps>", "!sil", "<unk>"]
    special_symbol_2 = ["#0", "<s>", "</s>"]

    with open(wordlist, "w") as wordlist_out:
        # leading special symbols get the first integer ids (starting at 0)
        for word in special_symbol_1:
            wordlist_out.write("{} {}\n".format(word, index))
            index += 1

        # assign an id to each distinct word, in order of first appearance
        with open(text, "r") as text_in:
            for line in text_in:
                for word in line.split():
                    if word not in word_set:
                        word_set.add(word)
                        wordlist_out.write("{} {}\n".format(word, index))
                        index += 1

        # trailing special symbols (disambiguation and sentence markers)
        for word in special_symbol_2:
            wordlist_out.write("{} {}\n".format(word, index))
            index += 1

if __name__ == "__main__":
    get_wordlist(args.input_file, args.wordlist)
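# Example invocation (paths as used by local/rnnlm/train_bpe_nbest.sh above):
#   rnnlm/get_wordlist.py data/rnnlm/text_bpe_nbest/swbd_sub.txt \
#                         data/lang_nosp/words_sub.txt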