From edce489b535e74e27d9fc083f30fcb01617fdc19 Mon Sep 17 00:00:00 2001
From: Benjamin Van Niekerk
Date: Mon, 9 May 2022 11:47:26 +0200
Subject: [PATCH] Initial commit

---
 README.md          | 24 +++--------------
 discretize.py      | 42 ------------------------------
 encode.py          | 15 ++---------
 hubert/__init__.py |  9 ++++++-
 hubert/model.py    | 64 +++++++++++++++++++++++++++++++---------------
 5 files changed, 58 insertions(+), 96 deletions(-)
 delete mode 100644 discretize.py

diff --git a/README.md b/README.md
index e3afa3b..245c9e7 100644
--- a/README.md
+++ b/README.md
@@ -56,10 +56,10 @@ units = kmeans.predict(x.squeeze().cpu().numpy())
 
 **Step 1**: Download and extract the [LibriSpeech](https://www.openslr.org/12) corpus.
 
-**Step 2**: Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script (setting `--layer=7`):
+**Step 2**: Encode LibriSpeech using the HuBERT-Discrete model and `encode.py` script:
 
 ```
-usage: encode.py [-h] [--extension EXTENSION] [--model {hubert_soft,hubert_discrete}] [--layer LAYER] in-dir out-dir
+usage: encode.py [-h] [--extension EXTENSION] [--model {hubert_soft,hubert_discrete}] in-dir out-dir
 
 Encode an audio dataset.
 
@@ -73,31 +73,15 @@ optional arguments:
                         extension of the audio files.
   --model {hubert_soft,hubert_discrete}
                         available models
-  --layer LAYER         the selected transformer layer (defaults to the last layer)
 ```
 
 for example:
 
 ```
-python encode.py path/to/LibriSpeech path/to/LibriSpeech/
+python encode.py path/to/LibriSpeech/wavs path/to/LibriSpeech/units --model hubert_discrete
 ```
 
-**Step 3**: Discretize the extracted features using the k-means checkpoint and `discretize.py` script:
-
-```
-usage: discretize.py [-h] in-dir out-dir
-
-Discretize HuBERT features.
-
-positional arguments:
-  in-dir      path to the dataset directory.
-  out-dir     path to the output directory.
-
-optional arguments:
-  -h, --help  show this help message and exit
-```
-
-**Step 5**: Train the HuBERT-Soft model using the `train.py` script:
+**Step 3**: Train the HuBERT-Soft model using the `train.py` script:
 
 ```
 usage: train.py [-h] [--resume RESUME] [--warmstart] [--mask] [--alpha ALPHA] dataset-dir checkpoint-dir
diff --git a/discretize.py b/discretize.py
deleted file mode 100644
index 121fbf7..0000000
--- a/discretize.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import argparse
-import logging
-import numpy as np
-from pathlib import Path
-from tqdm import tqdm
-
-import torch
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-def discretize_dataset(args):
-    logger.info("Loading k-means checkpoint")
-    kmeans = torch.hub.load("bshall/hubert:main", "kmeans100")
-
-    logger.info(f"Discretizing dataset at {args.in_dir}")
-    for in_path in tqdm(list(args.in_dir.rglob("*.npy"))):
-        x = np.load(in_path)
-        x = kmeans.predict(x)
-
-        out_path = args.out_dir / in_path.relative_to(args.in_dir)
-        out_path.parent.mkdir(parents=True, exist_ok=True)
-        np.save(out_path.with_suffix(".npy"), x)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Discretize HuBERT features.")
-    parser.add_argument(
-        "in_dir",
-        metavar="in-dir",
-        help="path to the dataset directory.",
-        type=Path,
-    )
-    parser.add_argument(
-        "out_dir",
-        metavar="out-dir",
-        help="path to the output directory.",
-        type=Path,
-    )
-    args = parser.parse_args()
-    discretize_dataset(args)
diff --git a/encode.py b/encode.py
index d81a24c..40df5c2 100644
--- a/encode.py
+++ b/encode.py
@@ -5,7 +5,6 @@
 from tqdm import tqdm
 
 import torch
-import torch.nn.functional as F
 import torchaudio
 from torchaudio.functional import resample
 
@@ -22,17 +21,13 @@ def encode_dataset(args):
         wav, sr = torchaudio.load(in_path)
         wav = resample(wav, sr, 16000)
         wav = wav.unsqueeze(0).cuda()
-        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
 
-        # Extract hubert features from the args.layer transformer layer
         with torch.inference_mode():
-            x, _ = hubert.encode(wav, layer=args.layer)
-            if args.layer is None:
-                x = hubert.proj(x)
+            units = hubert.units(wav)
 
         out_path = args.out_dir / in_path.relative_to(args.in_dir)
         out_path.parent.mkdir(parents=True, exist_ok=True)
-        np.save(out_path.with_suffix(".npy"), x.squeeze(0).cpu().numpy())
+        np.save(out_path.with_suffix(".npy"), units.squeeze().cpu().numpy())
 
 
 if __name__ == "__main__":
@@ -61,11 +56,5 @@ def encode_dataset(args):
         choices=["hubert_soft", "hubert_discrete"],
         default="hubert_soft",
     )
-    parser.add_argument(
-        "--layer",
-        help="the selected transformer layer (defaults to the last layer)",
-        default=None,
-        type=int,
-    )
     args = parser.parse_args()
     encode_dataset(args)
diff --git a/hubert/__init__.py b/hubert/__init__.py
index 9134e63..99727eb 100644
--- a/hubert/__init__.py
+++ b/hubert/__init__.py
@@ -1 +1,8 @@
-from .model import Hubert, hubert_discrete, hubert_soft, kmeans100
+from .model import (
+    Hubert,
+    HubertDiscrete,
+    HubertSoft,
+    hubert_discrete,
+    hubert_soft,
+    kmeans100,
+)
diff --git a/hubert/model.py b/hubert/model.py
index def6cd5..9f4be8f 100644
--- a/hubert/model.py
+++ b/hubert/model.py
@@ -17,7 +17,7 @@
 
 
 class Hubert(nn.Module):
-    def __init__(self, num_label_embeddings: int = 100, mask=True):
+    def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
         super().__init__()
         self._mask = mask
         self.feature_extractor = FeatureExtractor()
@@ -69,6 +69,28 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         return logits, mask
 
 
+class HubertSoft(Hubert):
+    def __init__(self):
+        super().__init__()
+
+    def units(self, wav: torch.Tensor) -> torch.Tensor:
+        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav)
+        return self.proj(x)
+
+
+class HubertDiscrete(Hubert):
+    def __init__(self, kmeans):
+        super().__init__()
+        self.kmeans = kmeans
+
+    def units(self, wav: torch.Tensor) -> torch.LongTensor:
+        wav = F.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
+        x, _ = self.encode(wav, layer=7)
+        x = self.kmeans.predict(x.squeeze().cpu().numpy())
+        return torch.tensor(x, dtype=torch.long, device=wav.device)
+
+
 class FeatureExtractor(nn.Module):
     def __init__(self):
         super().__init__()
@@ -204,43 +226,45 @@ def _compute_mask(
     return mask
 
 
-def _hubert(
-    name: str,
-    num_label_embeddings: int,
-    pretrained: bool = True,
-    progress: bool = True,
-) -> Hubert:
-    hubert = Hubert(num_label_embeddings)
-    if pretrained:
-        checkpoint = torch.hub.load_state_dict_from_url(URLS[name], progress=progress)
-        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
-        hubert.load_state_dict(checkpoint)
-    hubert.eval()
-    return hubert
-
-
 def hubert_discrete(
     pretrained: bool = True,
     progress: bool = True,
-) -> Hubert:
+) -> HubertDiscrete:
     r"""HuBERT-Discrete from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
     Args:
         pretrained (bool): load pretrained weights into the model
         progress (bool): show progress bar when downloading model
     """
-    return _hubert("hubert-discrete", 504, pretrained, progress)
+    kmeans = kmeans100(pretrained=pretrained, progress=progress)
+    hubert = HubertDiscrete(kmeans)
+    if pretrained:
+        checkpoint = torch.hub.load_state_dict_from_url(
+            URLS["hubert-discrete"], progress=progress
+        )
+        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+        hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert
 
 
 def hubert_soft(
    pretrained: bool = True,
    progress: bool = True,
-) -> Hubert:
+) -> HubertSoft:
     r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
     Args:
         pretrained (bool): load pretrained weights into the model
         progress (bool): show progress bar when downloading model
     """
-    return _hubert("hubert-soft", 100, pretrained, progress)
+    hubert = HubertSoft()
+    if pretrained:
+        checkpoint = torch.hub.load_state_dict_from_url(
+            URLS["hubert-soft"], progress=progress
+        )
+        consume_prefix_in_state_dict_if_present(checkpoint, "module.")
+        hubert.load_state_dict(checkpoint)
+    hubert.eval()
+    return hubert
 
 
 def _kmeans(
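
For reference, here is a minimal sketch of how the `units()` API introduced by this patch is meant to be called. It is an illustration, not part of the commit: the `torch.hub` entry points (`bshall/hubert:main`, `hubert_soft`, `hubert_discrete`) are inferred from the deleted `discretize.py` and the factory functions above, the wav path is a placeholder, and the preprocessing mirrors `encode.py`.

```python
# Usage sketch (assumptions: the torch.hub entry points exposed by this repo,
# a CUDA device, and a placeholder wav path).
import torch
import torchaudio
from torchaudio.functional import resample

# HubertSoft.units() pads the waveform, runs the encoder, and applies the
# final projection, so callers no longer pass --layer or call proj() manually.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").cuda()

wav, sr = torchaudio.load("path/to/utterance.wav")
wav = resample(wav, sr, 16000)  # the models expect 16 kHz audio
wav = wav.unsqueeze(0).cuda()   # (batch, channels, samples), as in encode.py

with torch.inference_mode():
    soft = hubert.units(wav)  # FloatTensor of soft units, one row per frame

# HubertDiscrete.units() instead encodes at transformer layer 7 and applies
# the bundled k-means model, returning a LongTensor of cluster indices.
discrete = torch.hub.load("bshall/hubert:main", "hubert_discrete").cuda()
with torch.inference_mode():
    codes = discrete.units(wav)  # shape (frames,)
```

The `(400 - 320) // 2` padding inside both `units()` methods presumably centres each analysis window: the convolutional front-end has a 400-sample receptive field with a 320-sample (20 ms at 16 kHz) hop, so padding 40 samples per side keeps the output frames aligned with the waveform.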