Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

some small changes for aidatatang_200zh #542

Merged
merged 8 commits on Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@


def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
-src_dir = Path("data/manifests")
+src_dir = Path("data/manifests/aidatatang_200zh")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())

Expand Down
58 changes: 28 additions & 30 deletions egs/aidatatang_200zh/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,69 +50,67 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-log "Stage 2: Process aidatatang_200zh"
-if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
-mkdir -p data/fbank/aidatatang_200zh
-lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
-touch data/fbank/aidatatang_200zh/.fbank.done
-fi
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-log "Stage 3: Prepare musan manifest"
+log "Stage 2: Prepare musan manifest"
# We assume that you have downloaded the musan corpus
# to data/musan
-if [ ! -f data/manifests/.musan_manifests.done ]; then
+if [ ! -f data/manifests/.manifests.done ]; then
log "It may take 6 minutes"
-mkdir -p data/manifests
-lhotse prepare musan $dl_dir/musan data/manifests
-touch data/manifests/.musan_manifests.done
+mkdir -p data/manifests/
+lhotse prepare musan $dl_dir/musan data/manifests/
+touch data/manifests/.manifests.done
fi
fi

-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-log "Stage 4: Compute fbank for musan"
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+log "Stage 3: Compute fbank for musan"
if [ ! -f data/fbank/.msuan.done ]; then
mkdir -p data/fbank
./local/compute_fbank_musan.py
touch data/fbank/.msuan.done
fi
fi

-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-log "Stage 5: Compute fbank for aidatatang_200zh"
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+log "Stage 4: Compute fbank for aidatatang_200zh"
if [ ! -f data/fbank/.aidatatang_200zh.done ]; then
mkdir -p data/fbank
./local/compute_fbank_aidatatang_200zh.py
touch data/fbank/.aidatatang_200zh.done
fi
fi

-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-log "Stage 6: Prepare char based lang"
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+log "Stage 5: Prepare char based lang"
lang_char_dir=data/lang_char
mkdir -p $lang_char_dir

# Prepare text.
-grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \
-| sed -e 's/["text:\t ]*//g' | sed 's/,//g' \
-| ./local/text2token.py -t "char" > $lang_char_dir/text

+# Note: in Linux, you can install jq with the following command:
+# 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
+# 2. chmod +x ./jq
+# 3. cp jq /usr/bin
+if [ ! -f $lang_char_dir/text ]; then
+gunzip -c data/manifests/aidatatang_200zh/aidatatang_supervisions_train.jsonl.gz \
+|jq '.text' |sed -e 's/["text:\t ]*//g' | sed 's/"//g' \
+| ./local/text2token.py -t "char" > $lang_char_dir/text
+fi
# Prepare words.txt
-grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \
-| sed -e 's/["text:\t]*//g' | sed 's/,//g' \
-| ./local/text2token.py -t "char" > $lang_char_dir/text_words
+if [ ! -f $lang_char_dir/text_words ]; then
+gunzip -c data/manifests/aidatatang_200zh/aidatatang_supervisions_train.jsonl.gz \
+| jq '.text' | sed -e 's/["text:\t]*//g' | sed 's/"//g' \
+| ./local/text2token.py -t "char" > $lang_char_dir/text_words
+fi

cat $lang_char_dir/text_words | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
| uniq > $lang_char_dir/words_no_ids.txt

if [ ! -f $lang_char_dir/words.txt ]; then
./local/prepare_words.py \
---input-file $lang_char_dir/words_no_ids.txt
---output-file $lang_char_dir/words.txt
+--input-file $lang_char_dir/words_no_ids.txt \
+--output-file $lang_char_dir/words.txt
fi

if [ ! -f $lang_char_dir/L_disambig.pt ]; then
./local/prepare_char.py
fi
fi

57 changes: 4 additions & 53 deletions egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,63 +522,14 @@ def main():
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")

-# Note: Please use "pip install webdataset==0.1.103"
-# for installing the webdataset.
-import glob
-import os
-
-from lhotse import CutSet
-from lhotse.dataset.webdataset import export_to_webdataset
-
# we need cut ids to display recognition results.
args.return_cuts = True
aidatatang_200zh = Aidatatang_200zhAsrDataModule(args)

-dev = "dev"
-test = "test"
-
-if not os.path.exists(f"{dev}/shared-0.tar"):
-os.makedirs(dev)
-dev_cuts = aidatatang_200zh.valid_cuts()
-export_to_webdataset(
-dev_cuts,
-output_path=f"{dev}/shared-%d.tar",
-shard_size=300,
-)
-
-if not os.path.exists(f"{test}/shared-0.tar"):
-os.makedirs(test)
-test_cuts = aidatatang_200zh.test_cuts()
-export_to_webdataset(
-test_cuts,
-output_path=f"{test}/shared-%d.tar",
-shard_size=300,
-)
-
-dev_shards = [
-str(path)
-for path in sorted(glob.glob(os.path.join(dev, "shared-*.tar")))
-]
-cuts_dev_webdataset = CutSet.from_webdataset(
-dev_shards,
-split_by_worker=True,
-split_by_node=True,
-shuffle_shards=True,
-)
-
-test_shards = [
-str(path)
-for path in sorted(glob.glob(os.path.join(test, "shared-*.tar")))
-]
-cuts_test_webdataset = CutSet.from_webdataset(
-test_shards,
-split_by_worker=True,
-split_by_node=True,
-shuffle_shards=True,
-)
-
-dev_dl = aidatatang_200zh.valid_dataloaders(cuts_dev_webdataset)
-test_dl = aidatatang_200zh.test_dataloaders(cuts_test_webdataset)
+dev_cuts = aidatatang_200zh.valid_cuts()
+test_cuts = aidatatang_200zh.test_cuts()
+dev_dl = aidatatang_200zh.valid_dataloaders(dev_cuts)
+test_dl = aidatatang_200zh.test_dataloaders(test_cuts)

test_sets = ["dev", "test"]
test_dl = [dev_dl, test_dl]
Expand Down