From 5d3f89a632b80dbd20b111dea5a26f2662d4779b Mon Sep 17 00:00:00 2001 From: Anupam Maurya Date: Thu, 28 Sep 2023 20:49:15 +0530 Subject: [PATCH 1/2] Delete notebooks/ExtractTTSpectrogram.ipynb --- notebooks/ExtractTTSpectrogram.ipynb | 372 --------------------------- 1 file changed, 372 deletions(-) delete mode 100644 notebooks/ExtractTTSpectrogram.ipynb diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb deleted file mode 100644 index a257b6bf25..0000000000 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ /dev/null @@ -1,372 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "import sys\n", - "import torch\n", - "import importlib\n", - "import numpy as np\n", - "from tqdm import tqdm as tqdm\n", - "from torch.utils.data import DataLoader\n", - "from TTS.tts.datasets.dataset import TTSDataset\n", - "from TTS.tts.layers.losses import L1LossMasked\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.config import load_config\n", - "from TTS.tts.utils.visual import plot_spectrogram\n", - "from TTS.tts.utils.helpers import sequence_mask\n", - "from TTS.tts.models import setup_model\n", - "from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n", - "\n", - "%matplotlib inline\n", - "\n", - "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='2'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def set_filename(wav_path, out_path):\n", - " wav_file = os.path.basename(wav_path)\n", - " file_name = wav_file.split('.')[0]\n", - " os.makedirs(os.path.join(out_path, \"quant\"), exist_ok=True)\n", - " os.makedirs(os.path.join(out_path, \"mel\"), exist_ok=True)\n", - " os.makedirs(os.path.join(out_path, \"wav_gl\"), exist_ok=True)\n", - " wavq_path = os.path.join(out_path, \"quant\", file_name)\n", - " mel_path = os.path.join(out_path, \"mel\", file_name)\n", - " wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n", - " return file_name, wavq_path, mel_path, wav_path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n", - "DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n", - "DATASET = \"ljspeech\"\n", - "METADATA_FILE = \"metadata.csv\"\n", - "CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n", - "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n", - "BATCH_SIZE = 32\n", - "\n", - "QUANTIZED_WAV = False\n", - "QUANTIZE_BIT = None\n", - "DRY_RUN = False # if False, does not generate output files, only computes loss and visuals.\n", - "\n", - "use_cuda = torch.cuda.is_available()\n", - "print(\" > CUDA enabled: \", use_cuda)\n", - "\n", - "C = load_config(CONFIG_PATH)\n", - "C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n", - "ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(C['r'])\n", - "# if the vocabulary was passed, replace the default\n", - "if 'characters' in C and C['characters']:\n", - " symbols, phonemes = make_symbols(**C.characters)\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", - "# TODO: multiple speaker\n", - "model = setup_model(C)\n", - "model.load_checkpoint(C, MODEL_FILE, eval=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n", - "preprocessor = getattr(preprocessor, DATASET.lower())\n", - "meta_data = preprocessor(DATA_PATH, METADATA_FILE)\n", - "dataset = TTSDataset(\n", - " checkpoint[\"config\"][\"r\"],\n", - " C.text_cleaner,\n", - " False,\n", - " ap,\n", - " meta_data,\n", - " characters=C.get('characters', None),\n", - " use_phonemes=C.use_phonemes,\n", - " phoneme_cache_path=C.phoneme_cache_path,\n", - " enable_eos_bos=C.enable_eos_bos_chars,\n", - ")\n", - "loader = DataLoader(\n", - " dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generate model outputs " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "\n", - "file_idxs = []\n", - "metadata = []\n", - "losses = []\n", - "postnet_losses = []\n", - "criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n", - "with torch.no_grad():\n", - " for data in tqdm(loader):\n", - " # setup input data\n", - " text_input = data[0]\n", - " text_lengths = data[1]\n", - " linear_input = data[3]\n", - " mel_input = data[4]\n", - " mel_lengths = data[5]\n", - " stop_targets = data[6]\n", - " item_idx = data[7]\n", - "\n", - " # dispatch data to GPU\n", - " if use_cuda:\n", - " text_input = text_input.cuda()\n", - " text_lengths = text_lengths.cuda()\n", - " mel_input = mel_input.cuda()\n", - " mel_lengths = mel_lengths.cuda()\n", - "\n", - " mask = sequence_mask(text_lengths)\n", - " mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)\n", - " \n", - " # compute loss\n", - " loss = criterion(mel_outputs, mel_input, mel_lengths)\n", - " loss_postnet = criterion(postnet_outputs, mel_input, mel_lengths)\n", - " losses.append(loss.item())\n", - " postnet_losses.append(loss_postnet.item())\n", - "\n", - " # compute mel specs from linear spec if model is Tacotron\n", - " if C.model == \"Tacotron\":\n", - " mel_specs = []\n", - " postnet_outputs = postnet_outputs.data.cpu().numpy()\n", - " for b in range(postnet_outputs.shape[0]):\n", - " postnet_output = postnet_outputs[b]\n", - " mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T).cuda())\n", - " postnet_outputs = torch.stack(mel_specs)\n", - " elif C.model == \"Tacotron2\":\n", - " postnet_outputs = postnet_outputs.detach().cpu().numpy()\n", - " alignments = alignments.detach().cpu().numpy()\n", - "\n", - " if not DRY_RUN:\n", - " for idx in range(text_input.shape[0]):\n", - " wav_file_path = item_idx[idx]\n", - " wav = ap.load_wav(wav_file_path)\n", - " file_name, wavq_path, mel_path, wav_path = set_filename(wav_file_path, OUT_PATH)\n", - " file_idxs.append(file_name)\n", - "\n", - " # quantize and save wav\n", - " if QUANTIZED_WAV:\n", - " wavq = ap.quantize(wav)\n", - " np.save(wavq_path, wavq)\n", - "\n", - " # save TTS mel\n", - " mel = postnet_outputs[idx]\n", - " mel_length = mel_lengths[idx]\n", - " mel = mel[:mel_length, :].T\n", - " np.save(mel_path, mel)\n", - "\n", - " metadata.append([wav_file_path, mel_path])\n", - "\n", - " # for wavernn\n", - " if not DRY_RUN:\n", - " pickle.dump(file_idxs, open(OUT_PATH+\"/dataset_ids.pkl\", \"wb\")) \n", - " \n", - " # for pwgan\n", - " with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n", - " for data in metadata:\n", - " f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")\n", - "\n", - " print(np.mean(losses))\n", - " print(np.mean(postnet_losses))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for pwgan\n", - "with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n", - " for data in metadata:\n", - " f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sanity Check" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "idx = 1\n", - "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import soundfile as sf\n", - "wav, sr = sf.read(item_idx[idx])\n", - "mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]\n", - "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n", - "mel_truth = ap.melspectrogram(wav)\n", - "print(mel_truth.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# plot posnet output\n", - "print(mel_postnet[:mel_lengths[idx], :].shape)\n", - "plot_spectrogram(mel_postnet, ap)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# plot decoder output\n", - "print(mel_decoder.shape)\n", - "plot_spectrogram(mel_decoder, ap)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# plot GT specgrogram\n", - "print(mel_truth.shape)\n", - "plot_spectrogram(mel_truth.T, ap)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# postnet, decoder diff\n", - "from matplotlib import pylab as plt\n", - "mel_diff = mel_decoder - mel_postnet\n", - "plt.figure(figsize=(16, 10))\n", - "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n", - "plt.colorbar()\n", - "plt.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# PLOT GT SPECTROGRAM diff\n", - "from matplotlib import pylab as plt\n", - "mel_diff2 = mel_truth.T - mel_decoder\n", - "plt.figure(figsize=(16, 10))\n", - "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", - "plt.colorbar()\n", - "plt.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# PLOT GT SPECTROGRAM diff\n", - "from matplotlib import pylab as plt\n", - "mel = postnet_outputs[idx]\n", - "mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]\n", - "plt.figure(figsize=(16, 10))\n", - "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", - "plt.colorbar()\n", - "plt.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "interpreter": { - "hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56" - }, - "kernelspec": { - "display_name": "Python 3.9.7 64-bit ('base': conda)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From c5a2dec6e4226dbfed023ac7ca4309d4359f6a91 Mon Sep 17 00:00:00 2001 From: Anupam Maurya Date: Thu, 28 Sep 2023 20:49:50 +0530 Subject: [PATCH 2/2] Add files via upload --- notebooks/ExtractTTSpectrogram.ipynb | 397 +++++++++++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 notebooks/ExtractTTSpectrogram.ipynb diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb new file mode 100644 index 0000000000..9acc9929fc --- /dev/null +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -0,0 +1,397 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a notebook to generate mel-spectrograms from a TTS model to be used in a Vocoder training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import torch\n", + "import importlib\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "from torch.utils.data import DataLoader\n", + "import soundfile as sf\n", + "import pickle\n", + "from TTS.tts.datasets.dataset import TTSDataset\n", + "from TTS.tts.layers.losses import L1LossMasked\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.config import load_config\n", + "from TTS.tts.utils.visual import plot_spectrogram\n", + "from TTS.tts.utils.helpers import sequence_mask\n", + "from TTS.tts.models import setup_model\n", + "from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n", + "\n", + "%matplotlib inline\n", + "\n", + "# Configure CUDA visibility\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = '2'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to create directories and file names\n", + "def set_filename(wav_path, out_path):\n", + " wav_file = os.path.basename(wav_path)\n", + " file_name = wav_file.split('.')[0]\n", + " os.makedirs(os.path.join(out_path, \"quant\"), exist_ok=True)\n", + " os.makedirs(os.path.join(out_path, \"mel\"), exist_ok=True)\n", + " os.makedirs(os.path.join(out_path, \"wav_gl\"), exist_ok=True)\n", + " wavq_path = os.path.join(out_path, \"quant\", file_name)\n", + " mel_path = os.path.join(out_path, \"mel\", file_name)\n", + " wav_path = os.path.join(out_path, \"wav_gl\", file_name)\n", + " return file_name, wavq_path, mel_path, wav_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Paths and configurations\n", + "OUT_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/specs2/\"\n", + "DATA_PATH = \"/home/ubuntu/TTS/recipes/ljspeech/LJSpeech-1.1/\"\n", + "DATASET = \"ljspeech\"\n", + "METADATA_FILE = \"metadata.csv\"\n", + "CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n", + "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n", + "BATCH_SIZE = 32\n", + "\n", + "QUANTIZED_WAV = False\n", + "QUANTIZE_BIT = None\n", + "DRY_RUN = False # if False, does not generate output files, only computes loss and visuals.\n", + "\n", + "# Check CUDA availability\n", + "use_cuda = torch.cuda.is_available()\n", + "print(\" > CUDA enabled: \", use_cuda)\n", + "\n", + "# Load the configuration\n", + "C = load_config(CONFIG_PATH)\n", + "C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n", + "ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)\n", + "print(C['r'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# If the vocabulary was passed, replace the default\n", + "if 'characters' in C and C['characters']:\n", + " symbols, phonemes = make_symbols(**C.characters)\n", + "\n", + "# Load the model\n", + "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", + "# TODO: multiple speakers\n", + "model = setup_model(C)\n", + "model.load_checkpoint(C, MODEL_FILE, eval=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the preprocessor based on the dataset\n", + "preprocessor = importlib.import_module(\"TTS.tts.datasets.formatters\")\n", + "preprocessor = getattr(preprocessor, DATASET.lower())\n", + "meta_data = preprocessor(DATA_PATH, METADATA_FILE)\n", + "dataset = TTSDataset(\n", + " C,\n", + " C.text_cleaner,\n", + " False,\n", + " ap,\n", + " meta_data,\n", + " characters=C.get('characters', None),\n", + " use_phonemes=C.use_phonemes,\n", + " phoneme_cache_path=C.phoneme_cache_path,\n", + " enable_eos_bos=C.enable_eos_bos_chars,\n", + ")\n", + "loader = DataLoader(\n", + " dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize lists for storing results\n", + "file_idxs = []\n", + "metadata = []\n", + "losses = []\n", + "postnet_losses = []\n", + "criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n", + "\n", + "# Create log file\n", + "log_file_path = os.path.join(OUT_PATH, \"log.txt\")\n", + "log_file = open(log_file_path, \"w\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate model outputs " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Start processing with a progress bar\n", + "with torch.no_grad():\n", + " for data in tqdm(loader, desc=\"Processing\"):\n", + " try:\n", + " # setup input data\n", + " text_input, text_lengths, _, linear_input, mel_input, mel_lengths, stop_targets, item_idx = data\n", + "\n", + " # dispatch data to GPU\n", + " if use_cuda:\n", + " text_input = text_input.cuda()\n", + " text_lengths = text_lengths.cuda()\n", + " mel_input = mel_input.cuda()\n", + " mel_lengths = mel_lengths.cuda()\n", + "\n", + " mask = sequence_mask(text_lengths)\n", + " mel_outputs, postnet_outputs, alignments, stop_tokens = model.forward(text_input, text_lengths, mel_input)\n", + "\n", + " # compute loss\n", + " loss = criterion(mel_outputs, mel_input, mel_lengths)\n", + " loss_postnet = criterion(postnet_outputs, mel_input, mel_lengths)\n", + " losses.append(loss.item())\n", + " postnet_losses.append(loss_postnet.item())\n", + "\n", + " # compute mel specs from linear spec if the model is Tacotron\n", + " if C.model == \"Tacotron\":\n", + " mel_specs = []\n", + " postnet_outputs = postnet_outputs.data.cpu().numpy()\n", + " for b in range(postnet_outputs.shape[0]):\n", + " postnet_output = postnet_outputs[b]\n", + " mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T).cuda())\n", + " postnet_outputs = torch.stack(mel_specs)\n", + " elif C.model == \"Tacotron2\":\n", + " postnet_outputs = postnet_outputs.detach().cpu().numpy()\n", + " alignments = alignments.detach().cpu().numpy()\n", + "\n", + " if not DRY_RUN:\n", + " for idx in range(text_input.shape[0]):\n", + " wav_file_path = item_idx[idx]\n", + " wav = ap.load_wav(wav_file_path)\n", + " file_name, wavq_path, mel_path, wav_path = set_filename(wav_file_path, OUT_PATH)\n", + " file_idxs.append(file_name)\n", + "\n", + " # quantize and save wav\n", + " if QUANTIZED_WAV:\n", + " wavq = ap.quantize(wav)\n", + " np.save(wavq_path, wavq)\n", + "\n", + " # save TTS mel\n", + " mel = postnet_outputs[idx]\n", + " mel_length = mel_lengths[idx]\n", + " mel = mel[:mel_length, :].T\n", + " np.save(mel_path, mel)\n", + "\n", + " metadata.append([wav_file_path, mel_path])\n", + "\n", + " except Exception as e:\n", + " log_file.write(f\"Error processing data: {str(e)}\\n\")\n", + "\n", + " # Calculate and log mean losses\n", + " mean_loss = np.mean(losses)\n", + " mean_postnet_loss = np.mean(postnet_losses)\n", + " log_file.write(f\"Mean Loss: {mean_loss}\\n\")\n", + " log_file.write(f\"Mean Postnet Loss: {mean_postnet_loss}\\n\")\n", + "\n", + "# Close the log file\n", + "log_file.close()\n", + "\n", + "# For wavernn\n", + "if not DRY_RUN:\n", + " pickle.dump(file_idxs, open(os.path.join(OUT_PATH, \"dataset_ids.pkl\"), \"wb\"))\n", + "\n", + "# For pwgan\n", + "with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n", + " for data in metadata:\n", + " f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")\n", + "\n", + "# Print mean losses\n", + "print(f\"Mean Loss: {mean_loss}\")\n", + "print(f\"Mean Postnet Loss: {mean_postnet_loss}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for pwgan\n", + "with open(os.path.join(OUT_PATH, \"metadata.txt\"), \"w\") as f:\n", + " for data in metadata:\n", + " f.write(f\"{data[0]}|{data[1]+'.npy'}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sanity Check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx = 1\n", + "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import soundfile as sf\n", + "wav, sr = sf.read(item_idx[idx])\n", + "mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]\n", + "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n", + "mel_truth = ap.melspectrogram(wav)\n", + "print(mel_truth.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot posnet output\n", + "print(mel_postnet[:mel_lengths[idx], :].shape)\n", + "plot_spectrogram(mel_postnet, ap)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot decoder output\n", + "print(mel_decoder.shape)\n", + "plot_spectrogram(mel_decoder, ap)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# plot GT specgrogram\n", + "print(mel_truth.shape)\n", + "plot_spectrogram(mel_truth.T, ap)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# postnet, decoder diff\n", + "from matplotlib import pylab as plt\n", + "mel_diff = mel_decoder - mel_postnet\n", + "plt.figure(figsize=(16, 10))\n", + "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n", + "plt.colorbar()\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# PLOT GT SPECTROGRAM diff\n", + "from matplotlib import pylab as plt\n", + "mel_diff2 = mel_truth.T - mel_decoder\n", + "plt.figure(figsize=(16, 10))\n", + "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", + "plt.colorbar()\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# PLOT GT SPECTROGRAM diff\n", + "from matplotlib import pylab as plt\n", + "mel = postnet_outputs[idx]\n", + "mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]\n", + "plt.figure(figsize=(16, 10))\n", + "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", + "plt.colorbar()\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "822ce188d9bce5372c4adbb11364eeb49293228c2224eb55307f4664778e7f56" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit ('base': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}