Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

CSJ Data Preparation #617

Merged
merged 18 commits into from
Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions egs/csj/ASR/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
librispeech_*.*
todelete*
lang*
telegramlogging.py
results.ipynb
notify_tg.py
finetune_*
misc.ini
19 changes: 19 additions & 0 deletions egs/csj/ASR/.vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"args": [
"--debug"
]
}
]
}
8 changes: 8 additions & 0 deletions egs/csj/ASR/.vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"files.watcherExclude": {
"**.json": true,
"**.jsonl.gz": true,
"**.pt": true
},
"python.analysis.typeCheckingMode": "off"
}
140 changes: 140 additions & 0 deletions egs/csj/ASR/local/compute_fbank_csj.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import argparse
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you follow
https://k2-fsa.github.io/icefall/contributing/code-style.html
to fix your code style issues?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the link. I have just done that.

There is one file where I declared a long list of variables. It seems black and flake8 can't agree on whether to add a newline.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To by-pass the check for some line, you can append # noqa at the end of that line. For instance

this_is_a_very_long_name = 1  # noqa

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The above one is for flake8.

To also ignore that line for black, you can use

# fmt: off
this_is_a_very_long_name = 1  # noqa
# fmt: on

from itertools import islice
import logging
import os
from pathlib import Path
from random import Random
from typing import List, Tuple

import torch
from lhotse import (
CutSet,
RecordingSet,
SupervisionSet,
Fbank,
FbankConfig,
ChunkedLilcomHdf5Writer,
)

# Shown verbatim as the --help description of this script (see get_args()).
ARGPARSE_DESCRIPTION = """
This script follows the espnet method of splitting the remaining core+noncore utterances
into valid and train cutsets at an index which is by default 4000.

In other words, the core+noncore utterances are shuffled, where 4000 utterances of the
shuffled set go to the `valid` cutset and are not subjected to speed perturbation. The
remaining utterances become the `train` cutset and are speed-perturbed (0.9x, 1.0x, 1.1x).

"""

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

# Fixed seed for the shuffle in make_cutset_blueprints, so the
# valid/train split is reproducible across runs.
RNG_SEED = 42


def make_cutset_blueprints(
    manifest_dir: Path,
    split: int,
) -> List[Tuple[str, CutSet]]:
    """Build the CSJ cut sets (without computing any features yet).

    The core+noncore recordings/supervisions are merged, trimmed to
    supervision boundaries, and shuffled with a fixed seed; the first
    `split` cuts become the `valid` set, and the remainder — augmented
    with 0.9x and 1.1x speed perturbation — become the `train` set.
    Each of the three eval partitions becomes its own cut set.

    Args:
        manifest_dir: Directory containing the csj_recordings_* and
            csj_supervisions_* manifests.
        split: Number of shuffled core+noncore cuts assigned to `valid`.

    Returns:
        A list of (partition_name, CutSet) pairs in the order
        valid, train, eval1, eval2, eval3.
    """
    # Create train and valid cuts.
    logging.info("Loading, trimming, and shuffling the remaining core+noncore cuts.")
    recording_set = RecordingSet.from_file(
        manifest_dir / "csj_recordings_core.jsonl.gz"
    ) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
    supervision_set = SupervisionSet.from_file(
        manifest_dir / "csj_supervisions_core.jsonl.gz"
    ) + SupervisionSet.from_file(manifest_dir / "csj_supervisions_noncore.jsonl.gz")

    cut_set = CutSet.from_manifests(
        recordings=recording_set,
        supervisions=supervision_set,
    )
    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
    # Deterministic shuffle so valid/train membership is reproducible.
    cut_set = cut_set.shuffle(Random(RNG_SEED))

    logging.info(
        f"Creating valid and train cuts from core and noncore, split at {split}."
    )
    valid_set = CutSet.from_cuts(islice(cut_set, 0, split))

    train_set = CutSet.from_cuts(islice(cut_set, split, None))
    # Speed perturbation is applied to train only; valid stays unperturbed.
    train_set = (
        train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
    )

    cut_sets = [("valid", valid_set), ("train", train_set)]

    # Create eval datasets.
    logging.info("Creating eval cuts.")
    for i in range(1, 4):
        cut_set = CutSet.from_manifests(
            recordings=RecordingSet.from_file(
                manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
            ),
            supervisions=SupervisionSet.from_file(
                manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
            ),
        )
        cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
        cut_sets.append((f"eval{i}", cut_set))

    return cut_sets


def get_args():
    """Parse and return this script's command-line arguments."""
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--manifest-dir",
        type=Path,
        help="Path to save manifests",
    )
    parser.add_argument(
        "--fbank-dir",
        type=Path,
        help="Path to save fbank features",
    )
    parser.add_argument(
        "--split",
        type=int,
        default=4000,
        help="Split at this index",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Use hardcoded parameters",
    )
    return parser.parse_args()


def main():
    """Compute 80-dim fbank features for every CSJ partition.

    If a `.done` marker already exists in --fbank-dir the script exits
    early; otherwise it builds the cut sets via make_cutset_blueprints,
    extracts features with up to 16 parallel jobs, writes the feature
    archives to --fbank-dir, and saves each partition's cut manifest to
    --manifest-dir.
    """
    args = get_args()

    if args.debug:
        # NOTE(review): hard-coded developer paths, active only with
        # --debug; the fbank path is machine-specific.
        args.manifest_dir = Path("data/manifests")
        args.fbank_dir = Path("/mnt/minami_data_server/t2131178/corpus/CSJ/fbank_new")
        args.split = 4000

    extractor = Fbank(FbankConfig(num_mel_bins=80))
    num_jobs = min(16, os.cpu_count())

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)

    if (args.fbank_dir / ".done").exists():
        logging.info(
            "Previous fbank computed for CSJ found. "
            f"Delete {args.fbank_dir / '.done'} to allow recomputing fbank."
        )
        return
    else:
        cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
        for part, cut_set in cut_sets:
            # Progress information, as requested in code review.
            logging.info(f"Processing {part}.")
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                num_jobs=num_jobs,
                storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
                storage_type=ChunkedLilcomHdf5Writer,
            )
            cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")

    logging.info("All fbank computed for CSJ.")
    # Marker file that lets subsequent runs skip recomputation.
    (args.fbank_dir / ".done").touch()


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions egs/csj/ASR/local/compute_fbank_musan.py
Loading