Skip to content

Commit

Permalink
fix the case when segments has different #lines from wav.scp (#413)
Browse files Browse the repository at this point in the history
  • Loading branch information
kan-bayashi authored Aug 15, 2023
1 parent aca7c2c commit c68b459
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 16 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,41 @@ jobs:
name: artifacts-${{ matrix.config }}
path: egs/yesno/voc1

# Integration test for the "segments" code path: runs the yesno recipe with
# --use_fake_segments true so that data preparation also writes a segments
# file next to wav.scp.
integration_segments:
  runs-on: ubuntu-20.04
  strategy:
    max-parallel: 10
    matrix:
      # Quoted so the versions stay strings (an unquoted 3.10 would be read
      # as the float 3.1).
      python-version: ["3.9"]
      pytorch-version: ["1.13.1"]
  steps:
    - uses: actions/checkout@master
    - uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
        architecture: 'x64'
    - uses: actions/cache@v2
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
        restore-keys: |
          ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
    - name: Install dependencies
      run: |
        # NOTE(review): egs/yesno/voc1/local/data_prep.sh calls `soxi` when
        # --use_fake_segments is true; confirm sox is available on the runner.
        # Explicit -y so the install never waits for a confirmation prompt.
        sudo apt-get install -y libsndfile-dev jq
        # make python env
        cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
        source venv/bin/activate
        pip install torch-yin
    - name: Integration
      run: |
        cd egs/yesno/voc1 && ./run.sh --use_fake_segments true
    # Keep the recipe directory for debugging, only when the job failed.
    - uses: actions/upload-artifact@v1
      if: failure()
      with:
        # This job's matrix has no `config` entry, so `matrix.config` would
        # expand to an empty string; build the name from keys that exist.
        name: artifacts-segments-py${{ matrix.python-version }}-pt${{ matrix.pytorch-version }}
        path: egs/yesno/voc1

integration_vq:
runs-on: ubuntu-20.04
strategy:
Expand Down
12 changes: 12 additions & 0 deletions egs/yesno/voc1/local/data_prep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ train_set="train_nodev"
dev_set="dev"
eval_set="eval"
shuffle=false
use_fake_segments=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;
Expand All @@ -31,6 +32,7 @@ if [ $# != 2 ]; then
echo " --dev_set: name of dev set (default=dev)."
echo " --eval_set: name of eval set (default=eval)."
echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
echo " --use_fake_segments: whether to use fake segments (default=false)."
exit 1
fi

Expand All @@ -40,18 +42,28 @@ set -euo pipefail

# set filenames
scp="${data_dir}/all/wav.scp"
segments="${data_dir}/all/segments"

# check file existence
[ -e "${scp}" ] && rm "${scp}"
[ -e "${segments}" ] && rm "${segments}"

# make all scp
# For every *.wav found under db_root (symlinks followed, sorted by path),
# append "<id> <path>" to wav.scp, where <id> is the file's basename with its
# last extension removed.
find "${db_root}" -follow -name "*.wav" | sort | while read -r filename; do
    id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
    echo "${id} ${filename}" >> "${scp}"
    # NOTE(kan-bayashi): for integration test
    # Emit two segments per recording, both spanning the whole file, so that
    # segments ends up with twice as many lines as wav.scp.
    if "${use_fake_segments}"; then
        # Query the duration once, via an assignment: unlike a command
        # substitution inside echo's arguments, a failing soxi now aborts the
        # script (set -e / pipefail) instead of writing an empty end time.
        duration=$(soxi -D "${filename}")
        {
            echo "${id}_1 ${id} 0.0 ${duration}"
            echo "${id}_2 ${id} 0.0 ${duration}"
        } >> "${segments}"
    fi
done

# split
num_all=$(wc -l < "${scp}")
if "${use_fake_segments}"; then
num_all=$(wc -l < "${segments}")
fi
num_deveval=$((num_dev + num_eval))
num_train=$((num_all - num_deveval))
utils/split_data.sh \
Expand Down
4 changes: 4 additions & 0 deletions egs/yesno/voc1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ conf=conf/parallel_wavegan.v1.debug.yaml
download_dir=downloads # directory to save downloaded files
dumpdir=dump # directory to dump features

# data setting
use_fake_segments=false # for testing

# training related setting
tag="" # tag for directory to save model
resume="" # checkpoint path to resume training
Expand Down Expand Up @@ -47,6 +50,7 @@ fi
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
echo "Stage 0: Data preparation"
local/data_prep.sh \
--use_fake_segments "${use_fake_segments}" \
--train_set "${train_set}" \
--dev_set "${dev_set}" \
--eval_set "${eval_set}" \
Expand Down
20 changes: 6 additions & 14 deletions utils/make_subset_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,21 @@ num_split=$2
dst_dir=$3

src_scp=${src_dir}/wav.scp
num_src_utts=$(wc -l < "${src_scp}")
has_utt2spk=false
has_segments=false

if [ -e "${src_dir}/segments" ]; then
has_segments=true
src_segments=${src_dir}/segments
else
has_segments=false
num_src_utts=$(wc -l < "${src_segments}")
fi

if [ -e "${src_dir}/utt2spk" ]; then
has_utt2spk=true
src_utt2spk=${src_dir}/utt2spk
else
has_utt2spk=false
fi
src_scp=${src_dir}/wav.scp
num_src_utts=$(wc -l < "${src_scp}")

# NOTE: We assume that wav.scp and segments has the same number of lines
if ${has_segments}; then
num_src_segments=$(wc -l < "${src_segments}")
if [ "${num_src_segments}" -ne "${num_src_utts}" ]; then
echo "ERROR: wav.scp and segments has different #lines (${num_src_utts} vs ${num_src_segments})." >&2
exit 1;
fi
fi
if ${has_utt2spk}; then
num_src_utt2spk=$(wc -l < "${src_utt2spk}")
if [ "${num_src_utt2spk}" -ne "${num_src_utts}" ]; then
Expand Down
3 changes: 1 addition & 2 deletions utils/split_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ if [ $# -ne 3 ]; then
exit 1
fi

set -eu
set -eux

src_dir=$1
first_dist_dir=$2
Expand All @@ -49,7 +49,6 @@ if [ -e "${src_dir}/utt2spk" ]; then
else
has_utt2spk=false
fi
src_scp=${src_dir}/wav.scp

if ${has_utt2spk}; then
num_src_utt2spk=$(wc -l < "${src_utt2spk}")
Expand Down

0 comments on commit c68b459

Please sign in to comment.