From edce489b535e74e27d9fc083f30fcb01617fdc19 Mon Sep 17 00:00:00 2001
From: Benjamin Van Niekerk
Date: Mon, 9 May 2022 11:47:26 +0200
Subject: [PATCH] Initial commit

---
 README.md          | 24 +++--------------
 discretize.py      | 42 ------------------------------
 encode.py          | 15 ++---------
 hubert/__init__.py |  9 ++++++-
 hubert/model.py    | 64 +++++++++++++++++++++++++++++++---------------
 5 files changed, 58 insertions(+), 96 deletions(-)
 delete mode 100644 discretize.py

diff --git a/README.md b/README.md
index e3afa3b..245c9e7 100644
--- a/README.md
+++ b/README.md
@@ -56,10 +56,10 @@ units = kmeans.predict(x.squeeze().cpu().numpy())
 
 **Step 1**: Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus.
 
-**Step 2**: Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script (setting `--layer=7`):
+**Step 2**: Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script:
 
 ```
-usage: encode.py [-h] [--extension EXTENSION] [--model {hubert_soft,hubert_discrete}] [--layer LAYER] in-dir out-dir
+usage: encode.py [-h] [--extension EXTENSION] [--model {hubert_soft,hubert_discrete}] in-dir out-dir
 
 Encode an audio dataset.
 
@@ -73,31 +73,15 @@ optional arguments:
                         extension of the audio files.
   --model {hubert_soft,hubert_discrete}
                         available models
-  --layer LAYER         the selected transformer layer (defaults to the last layer)
 ```
 
 for example:
 
 ```
-python encode.py path/to/LibriSpeech path/to/LibriSpeech/
+python encode.py path/to/LibriSpeech/wavs path/to/LibriSpeech/units --model hubert_discrete
 ```
 
-**Step 3**: Discretize the extracted features using the k-means checkpoint and `discretize.py` script:
-
-```
-usage: discretize.py [-h] in-dir out-dir
-
-Discretize HuBERT features.
-
-positional arguments:
-  in-dir      path to the dataset directory.
-  out-dir     path to the output directory.
-
-optional arguments:
-  -h, --help  show this help message and exit
-```
-
-**Step 5**: Train the HuBERT-Soft model using the `train.py` script:
+**Step 3**: Train the HuBERT-Soft model using the `train.py` script:
 
 ```
 usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir
diff --git a/discretize.py b/discretize.py
deleted file mode 100644
index 121fbf7..0000000
--- a/discretize.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import argparse
-import logging
-import numpy as np
-from pathlib import Path
-from tqdm import tqdm
-
-import torch
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def discretize_dataset(args):
-    logger.info("Loading k-means checkpoint")
-    kmeans = torch.hub.load("bshall/hubert:main", "kmeans100")
-
-    logger.info(f"Discretizing dataset at {args.in_dir}")
-    for in_path in tqdm(list(args.in_dir.rglob("*.npy"))):
-        x = np.load(in_path)
-        x = kmeans.predict(x)
-
-        out_path = args.out_dir / in_path.relative_to(args.in_dir)
-        out_path.parent.mkdir(parents=True, exist_ok=True)
-        np.save(out_path.with_suffix(".npy"), x)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Discretize HuBERT features.")
-    parser.add_argument(
-        "in_dir",
-        metavar="in-dir",
-        help="path to the dataset directory.",
-        type=Path,
-    )
-    parser.add_argument(
-        "out_dir",
-        metavar="out-dir",
-        help="path to the output directory.",
-        type=Path,
-    )
-    args = parser.parse_args()
-    discretize_dataset(args)
diff --git a/encode.py b/encode.py
index d81a24c..40df5c2 100644
--- a/encode.py
+++ b/encode.py
@@ -5,7 +5,6 @@
 from tqdm import tqdm
 
 import torch
-import torch.nn.functional as F
 import torchaudio
 from torchaudio.functional import resample
 
@@ -22,17 +21,13 @@ def encode_dataset(args):
         wav, sr = torchaudio.load(in_path)
         wav = resample(wav, sr, 16000)
         wav = wav.unsqueeze(0).cuda()
-        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
 
-        # Extract hubert features from the args.layer transformer layer
         with torch.inference_mode():
-            x, _ = hubert.encode(wav, layer=args.layer)
-            if args.layer is None:
-                x = hubert.proj(x)
+            units = hubert.units(wav)
 
         out_path = args.out_dir / in_path.relative_to(args.in_dir)
         out_path.parent.mkdir(parents=True, exist_ok=True)
-        np.save(out_path.with_suffix(".npy"), x.squeeze(0).cpu().numpy())
+        np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy())
 
 
 if __name__ == "__main__":
@@ -61,11 +56,5 @@ def encode_dataset(args):
         choices=["hubert_soft", "hubert_discrete"],
         default="hubert_soft",
     )
-    parser.add_argument(
-        "--layer",
-        help="the selected transformer layer (defaults to the last layer)",
-        default=None,
-        type=int,
-    )
     args = parser.parse_args()
     encode_dataset(args)
diff --git a/hubert/__init__.py b/hubert/__init__.py
index 9134e63..99727eb 100644
--- a/hubert/__init__.py
+++ b/hubert/__init__.py
@@ -1 +1,8 @@
-from .model import Hubert, hubert_discrete, hubert_soft, kmeans100
+from .model import (
+    Hubert,
+    HubertDiscrete,
+    HubertSoft,
+    hubert_discrete,
+    hubert_soft,
+    kmeans100,
+)
diff --git a/hubert/model.py b/hubert/model.py
index def6cd5..9f4be8f 100644
--- a/hubert/model.py
+++ b/hubert/model.py
@@ -17,7 +17,7 @@
 
 
 class Hubert(nn.Module):
-    def __init__(self, num_label_embeddings: int = 100, mask=True):
+    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
         super().__init__()
         self._mask = mask
         self.feature_extractor = FeatureExtractor()
@@ -69,6 +69,28 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         return logits, mask
 
 
+class HubertSoft(Hubert):
+    def __init__(self):
+        super().__init__()
+
+    def units(self, wav: torch.Tensor) -> torch.Tensor:
+        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav)
+        return self.proj(x)
+
+
+class HubertDiscrete(Hubert):
+    def __init__(self, kmeans):
+        super().__init__()
+        self.kmeans = kmeans
+
+    def units(self, wav: torch.Tensor) -> torch.LongTensor:
+        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav, layer=7)
+        x = self.kmeans.predict(x.squeeze().cpu().numpy())
+        return torch.tensor(x, dtype=torch.long, device=wav.device)
+
+
 class FeatureExtractor(nn.Module):
     def __init__(self):
         super().__init__()
@@ -204,43 +226,45 @@ def _compute_mask(
     return mask
 
 
-def _hubert(
-    name: str,
-    num_label_embeddings: int,
-    pretrained: bool = True,
-    progress: bool = True,
-) -> Hubert:
-    hubert = Hubert(num_label_embeddings)
-    if pretrained:
-        checkpoint = torch.hub.load_state_dict_from_url(URLS[name], progress=progress)
-        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
-        hubert.load_state_dict(checkpoint)
-    hubert.eval()
-    return hubert
-
-
 def hubert_discrete(
     pretrained: bool = True,
     progress: bool = True,
-) -> Hubert:
+) -> HubertDiscrete:
     r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
     Args:
         pretrained (bool): load pretrained weights into the model
         progress (bool): show progress bar when downloading model
     """
-    return _hubert("hubert-discrete", 504, pretrained, progress)
+    kmeans = kmeans100(pretrained=pretrained, progress=progress)
+    hubert = HubertDiscrete(kmeans)
+    if pretrained:
+        checkpoint = torch.hub.load_state_dict_from_url(
+            URLS["hubert-discrete"], progress=progress
+        )
+        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+        hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert
 
 
 def hubert_soft(
    pretrained: bool = True,
    progress: bool = True,
-) -> Hubert:
+) -> HubertSoft:
     r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
     Args:
         pretrained (bool): load pretrained weights into the model
         progress (bool): show progress bar when downloading model
     """
-    return _hubert("hubert-soft", 100, pretrained, progress)
+    hubert = HubertSoft()
+    if pretrained:
+        checkpoint = torch.hub.load_state_dict_from_url(
+            URLS["hubert-soft"], progress=progress
+        )
+        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+        hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert
 
 
 def _kmeans(
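
For reference, here is a minimal sketch of how the `units()` API introduced by this patch is meant to be called. It is an illustration, not part of the commit: the `torch.hub` entry points (`bshall/hubert:main`, `hubert_soft`, `hubert_discrete`) are inferred from the deleted `discretize.py` and the factory functions above, the wav path is a placeholder, and the preprocessing mirrors `encode.py`.

```python
# Usage sketch (assumptions: the torch.hub entry points exposed by this repo,
# a CUDA device, and a placeholder wav path).
import torch
import torchaudio
from torchaudio.functional import resample

# HubertSoft.units() pads the waveform, runs the encoder, and applies the
# final projection, so callers no longer pass --layer or call proj() manually.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cuda()

wav, sr = torchaudio.load("path/to/utterance.wav")
wav = resample(wav, sr, 16000)  # the models expect 16 kHz audio
wav = wav.unsqueeze(0).cuda()   # (batch, channels, samples), as in encode.py

with torch.inference_mode():
    soft = hubert.units(wav)  # FloatTensor of soft units, one row per frame

# HubertDiscrete.units() instead encodes at transformer layer 7 and applies
# the bundled k-means model, returning a LongTensor of cluster indices.
discrete = torch.hub.load("bshall/hubert:main", "hubert_discrete").cuda()
with torch.inference_mode():
    codes = discrete.units(wav)  # shape (frames,)
```

The `(400 - 320) // 2` padding inside both `units()` methods presumably centres each analysis window: the convolutional front-end has a 400-sample receptive field with a 320-sample (20 ms at 16 kHz) hop, so padding 40 samples per side keeps the output frames aligned with the waveform.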