Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

CSJ Data Preparation #617

Merged
merged 18 commits into from
Oct 18, 2022
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions egs/csj/ASR/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
librispeech_*.*
todelete*
lang*
telegramlogging.py
results.ipynb
notify_tg.py
finetune_*
misc.ini
19 changes: 19 additions & 0 deletions egs/csj/ASR/.vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true,
"args": [
"--debug"
]
}
]
}
8 changes: 8 additions & 0 deletions egs/csj/ASR/.vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"files.watcherExclude": {
"**.json": true,
"**.jsonl.gz": true,
"**.pt": true
},
"python.analysis.typeCheckingMode": "off"
}
140 changes: 140 additions & 0 deletions egs/csj/ASR/local/compute_fbank_csj.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import argparse
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you follow
https://k2-fsa.github.io/icefall/contributing/code-style.html
to fix your code style issues?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the link. I have just done that.

There is one file where I declared a long list of variables. It seems black and flake8 can't agree on whether to add a newline.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To by-pass the check for some line, you can append # noqa at the end of that line. For instance

this_is_a_very_long_name = 1  # noqa

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The above one is for flake8.

To also ignore that line for black, you can use

# fmt: off
this_is_a_very_long_name = 1  # noqa
# fmt: on

from itertools import islice
import logging
import os
from pathlib import Path
from random import Random
from typing import List, Tuple

import torch
from lhotse import (
CutSet,
RecordingSet,
SupervisionSet,
Fbank,
FbankConfig,
ChunkedLilcomHdf5Writer,
)

# Shown verbatim as the --help description of this script (see get_args()).
ARGPARSE_DESCRIPTION = """
This script follows the espnet method of splitting the remaining core+noncore utterances
into valid and train cutsets at an index which is by default 4000.

In other words, the core+noncore utterances are shuffled, where 4000 utterances of the
shuffled set go to the `valid` cutset and are not subjected to speed perturbation. The
remaining utterances become the `train` cutset and are speed-perturbed (0.9x, 1.0x, 1.1x).

"""

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

# Fixed seed for the shuffle in make_cutset_blueprints, so the
# valid/train split is reproducible across runs.
RNG_SEED = 42


def make_cutset_blueprints(
    manifest_dir: Path,
    split: int,
) -> List[Tuple[str, CutSet]]:
    """Build the CSJ cut sets (without computing any features yet).

    The core+noncore recordings/supervisions are merged, trimmed to
    supervision boundaries, and shuffled with a fixed seed; the first
    `split` cuts become the `valid` set, and the remainder — augmented
    with 0.9x and 1.1x speed perturbation — become the `train` set.
    Each of the three eval partitions becomes its own cut set.

    Args:
        manifest_dir: Directory containing the csj_recordings_* and
            csj_supervisions_* manifests.
        split: Number of shuffled core+noncore cuts assigned to `valid`.

    Returns:
        A list of (partition_name, CutSet) pairs in the order
        valid, train, eval1, eval2, eval3.
    """
    # Create train and valid cuts.
    logging.info("Loading, trimming, and shuffling the remaining core+noncore cuts.")
    recording_set = RecordingSet.from_file(
        manifest_dir / "csj_recordings_core.jsonl.gz"
    ) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
    supervision_set = SupervisionSet.from_file(
        manifest_dir / "csj_supervisions_core.jsonl.gz"
    ) + SupervisionSet.from_file(manifest_dir / "csj_supervisions_noncore.jsonl.gz")

    cut_set = CutSet.from_manifests(
        recordings=recording_set,
        supervisions=supervision_set,
    )
    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
    # Deterministic shuffle so valid/train membership is reproducible.
    cut_set = cut_set.shuffle(Random(RNG_SEED))

    logging.info(
        f"Creating valid and train cuts from core and noncore, split at {split}."
    )
    valid_set = CutSet.from_cuts(islice(cut_set, 0, split))

    train_set = CutSet.from_cuts(islice(cut_set, split, None))
    # Speed perturbation is applied to train only; valid stays unperturbed.
    train_set = (
        train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
    )

    cut_sets = [("valid", valid_set), ("train", train_set)]

    # Create eval datasets.
    logging.info("Creating eval cuts.")
    for i in range(1, 4):
        cut_set = CutSet.from_manifests(
            recordings=RecordingSet.from_file(
                manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
            ),
            supervisions=SupervisionSet.from_file(
                manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
            ),
        )
        cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
        cut_sets.append((f"eval{i}", cut_set))

    return cut_sets


def get_args():
    """Parse and return this script's command-line arguments."""
    parser = argparse.ArgumentParser(
        description=ARGPARSE_DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--manifest-dir",
        type=Path,
        help="Path to save manifests",
    )
    parser.add_argument(
        "--fbank-dir",
        type=Path,
        help="Path to save fbank features",
    )
    parser.add_argument(
        "--split",
        type=int,
        default=4000,
        help="Split at this index",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Use hardcoded parameters",
    )
    return parser.parse_args()


def main():
    """Compute 80-dim fbank features for every CSJ partition.

    If a `.done` marker already exists in --fbank-dir the script exits
    early; otherwise it builds the cut sets via make_cutset_blueprints,
    extracts features with up to 16 parallel jobs, writes the feature
    archives to --fbank-dir, and saves each partition's cut manifest to
    --manifest-dir.
    """
    args = get_args()

    if args.debug:
        # NOTE(review): hard-coded developer paths, active only with
        # --debug; the fbank path is machine-specific.
        args.manifest_dir = Path("data/manifests")
        args.fbank_dir = Path("/mnt/minami_data_server/t2131178/corpus/CSJ/fbank_new")
        args.split = 4000

    extractor = Fbank(FbankConfig(num_mel_bins=80))
    num_jobs = min(16, os.cpu_count())

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)

    if (args.fbank_dir / ".done").exists():
        logging.info(
            "Previous fbank computed for CSJ found. "
            f"Delete {args.fbank_dir / '.done'} to allow recomputing fbank."
        )
        return
    else:
        cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
        for part, cut_set in cut_sets:
            # Progress information, as requested in code review.
            logging.info(f"Processing {part}.")
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                num_jobs=num_jobs,
                storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
                storage_type=ChunkedLilcomHdf5Writer,
            )
            cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")

    logging.info("All fbank computed for CSJ.")
    # Marker file that lets subsequent runs skip recomputation.
    (args.fbank_dir / ".done").touch()


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions egs/csj/ASR/local/compute_fbank_musan.py
Loading