From cc9fdd24399c864e6905bf1057c76f468a028962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian-Robert=20St=C3=B6ter?=
Date: Sat, 11 May 2019 10:55:00 +0100
Subject: [PATCH 1/3] first attempt of mono downmix in the magnitude domain

---
Reviewer notes (placed after the three-dash line, so they are not part of
the commit message):
- Spectrogram() now returns the nn.Sequential it builds; the original hunk
  assigned it to `modules` and fell off the end of the function.
- add_module() is now called with a name as well as the module, which is
  the (name, module) form nn.Module.add_module expects.
- Line breaks and indentation of this series were reconstructed from a
  whitespace-collapsed copy; the blob ids on the `index` lines are the ones
  originally recorded and no longer match the edited content.
- NOTE(review): `tensor**2/power` parses as `(tensor**2)/power`; confirm
  whether `tensor ** (2 / power)` was intended. Left unchanged here because
  patch 2/3 removes the function.

 torchaudio_contrib/functional.py |  6 +++++-
 torchaudio_contrib/layers.py     | 26 +++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/torchaudio_contrib/functional.py b/torchaudio_contrib/functional.py
index f888996..cd4ae36 100644
--- a/torchaudio_contrib/functional.py
+++ b/torchaudio_contrib/functional.py
@@ -47,6 +47,10 @@ def stft(signal, fft_len, hop_len, window,
     return spect
 
 
+def spectral_downmix(tensor, power=1.0):
+    return torch.sum(tensor**2/power, 1, keepdim=True) ** (power/2)
+
+
 def complex_norm(tensor, power=1.0):
     """
     Normalize complex input.
@@ -255,4 +259,4 @@ def mu_law_decoding(x_mu, n_quantize=256):
     mu = torch.tensor(n_quantize - 1, dtype=x_mu.dtype, requires_grad=False)  # confused about dtype here..
     x = (x_mu / mu) * 2 - 1.
     x = x.sign() * (torch.exp(x.abs() * torch.log1p(mu)) - 1.) / mu
-    return x
\ No newline at end of file
+    return x
diff --git a/torchaudio_contrib/layers.py b/torchaudio_contrib/layers.py
index b7dfe48..717e265 100644
--- a/torchaudio_contrib/layers.py
+++ b/torchaudio_contrib/layers.py
@@ -2,7 +2,7 @@ import math
 
 import torch.nn as nn
 
-from .functional import stft, complex_norm, \
+from .functional import stft, complex_norm, spectral_downmix, \
     create_mel_filter, phase_vocoder, apply_filterbank, \
     amplitude_to_db, db_to_amplitude, \
     mu_law_encoding, mu_law_decoding
@@ -118,6 +118,22 @@ def __repr__(self):
         return self.__class__.__name__ + '(power={})'.format(self.power)
 
 
+class SpectralDownmix(nn.Module):
+    """
+    Wrap torchaudio_contrib.spectral_downmix in an nn.Module.
+    """
+
+    def __init__(self, power=1.0):
+        super(SpectralDownmix, self).__init__()
+        self.power = power
+
+    def forward(self, stft):
+        return spectral_downmix(stft, self.power)
+
+    def __repr__(self):
+        return self.__class__.__name__ + '(power={})'.format(self.power)
+
+
 class ApplyFilterbank(_ModuleNoStateBuffers):
     """
     Applies a filterbank transform.
@@ -269,7 +285,7 @@ def __repr__(self):
 
 
 def Spectrogram(fft_len=2048, hop_len=None, frame_len=None,
-                window=None, pad=0, pad_mode="reflect", power=1., **kwargs):
+                window=None, pad=0, pad_mode="reflect", power=1., mono=True, **kwargs):
     """
     Get spectrogram module.
 
@@ -285,9 +301,10 @@ def Spectrogram(fft_len=2048, hop_len=None, frame_len=None,
         pad_mode: padding method (see torch.nn.functional.pad).
             Defaults to "reflect".
         power (float): Exponent of the magnitude. Defaults to 1.
+        mono (bool): Downmix to mono.
         **kwargs: Other torch.stft parameters, see torch.stft for more details.
     """
-    return nn.Sequential(
+    modules = nn.Sequential(
         STFT(
             fft_len,
             hop_len,
@@ -297,6 +314,9 @@ def Spectrogram(fft_len=2048, hop_len=None, frame_len=None,
             pad_mode,
             **kwargs),
         ComplexNorm(power))
+    if mono:
+        modules.add_module('downmix', SpectralDownmix(power))
+    return modules
 
 
 def Melspectrogram(
From 3ac2761eac9b62a92c56c4d0198d7c642648d67e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian-Robert=20St=C3=B6ter?=
Date: Tue, 4 Jun 2019 17:10:56 +0200
Subject: [PATCH 2/3] adjusting naming, replace spectral downmix with mean op

---
Reviewer notes (not part of the commit message):
- The `Args:` entry of downmix_spectrum now uses the real parameter name
  `mag_specgram`.
- NOTE(review): this commit removes SpectralDownmix while Spectrogram()
  still references it (patch 3/3 switches it to DownmixSpectrum), and
  `mono` still defaults to True here, so Spectrogram() cannot be built
  with default arguments at this intermediate commit.

 torchaudio_contrib/functional.py | 25 ++++++++++++++++++++++--
 torchaudio_contrib/layers.py     | 33 +++++++++++++++++++++++---------
 2 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/torchaudio_contrib/functional.py b/torchaudio_contrib/functional.py
index cd4ae36..cae02da 100644
--- a/torchaudio_contrib/functional.py
+++ b/torchaudio_contrib/functional.py
@@ -1,6 +1,7 @@
 import torch
 import math
 import torch.nn.functional as F
+import torchaudio
 
 
 def stft(signal, fft_len, hop_len, window,
@@ -47,8 +48,28 @@ def stft(signal, fft_len, hop_len, window,
     return spect
 
 
-def spectral_downmix(tensor, power=1.0):
-    return torch.sum(tensor**2/power, 1, keepdim=True) ** (power/2)
+def downmix_waveform(waveform, ch_dim=1):
+    """
+    Args:
+        waveform (Tensor): (batch, channel, time)
+    Returns:
+        waveform (Tensor): (batch, 1, time)
+
+    """
+
+    return torchaudio.functional.downmix_mono(waveform, ch_dim=ch_dim)
+
+
+def downmix_spectrum(mag_specgram, ch_dim=1):
+    """
+    Args:
+        mag_specgram (Tensor): (batch, channel, num_bins, time)
+    Returns:
+        specgram (Tensor): (batch, 1, num_bins, time)
+
+    """
+
+    return torch.mean(mag_specgram, ch_dim, keepdim=True)
 
 
 def complex_norm(tensor, power=1.0):
diff --git a/torchaudio_contrib/layers.py b/torchaudio_contrib/layers.py
index 717e265..a6c04e2 100644
--- a/torchaudio_contrib/layers.py
+++ b/torchaudio_contrib/layers.py
@@ -2,7 +2,8 @@ import math
 
 import torch.nn as nn
 
-from .functional import stft, complex_norm, spectral_downmix, \
+from .functional import stft, complex_norm, \
+    downmix_spectrum, downmix_waveform, \
     create_mel_filter, phase_vocoder, apply_filterbank, \
     amplitude_to_db, db_to_amplitude, \
     mu_law_encoding, mu_law_decoding
@@ -118,20 +119,34 @@ def __repr__(self):
         return self.__class__.__name__ + '(power={})'.format(self.power)
 
 
-class SpectralDownmix(nn.Module):
+class DownmixWaveform(nn.Module):
     """
-    Wrap torchaudio_contrib.spectral_downmix in an nn.Module.
+    Wrap torchaudio_contrib.downmix_waveform in an nn.Module.
     """
 
-    def __init__(self, power=1.0):
-        super(SpectralDownmix, self).__init__()
-        self.power = power
+    def __init__(self):
+        super(DownmixWaveform, self).__init__()
 
-    def forward(self, stft):
-        return spectral_downmix(stft, self.power)
+    def forward(self, waveform):
+        return downmix_waveform(waveform)
 
     def __repr__(self):
-        return self.__class__.__name__ + '(power={})'.format(self.power)
+        return self.__class__.__name__
+
+
+class DownmixSpectrum(nn.Module):
+    """
+    Wrap torchaudio_contrib.downmix_spectrum in an nn.Module.
+    """
+
+    def __init__(self):
+        super(DownmixSpectrum, self).__init__()
+
+    def forward(self, mag_specgram):
+        return downmix_spectrum(mag_specgram)
+
+    def __repr__(self):
+        return self.__class__.__name__
 
 
 class ApplyFilterbank(_ModuleNoStateBuffers):
From 3087c38e7cd54c088e4bf68427911ae2a695dda5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabian-Robert=20St=C3=B6ter?=
Date: Tue, 4 Jun 2019 17:11:54 +0200
Subject: [PATCH 3/3] set mono default to false

---
 torchaudio_contrib/layers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchaudio_contrib/layers.py b/torchaudio_contrib/layers.py
index a6c04e2..752b5d5 100644
--- a/torchaudio_contrib/layers.py
+++ b/torchaudio_contrib/layers.py
@@ -300,7 +300,7 @@ def __repr__(self):
 
 
 def Spectrogram(fft_len=2048, hop_len=None, frame_len=None,
-                window=None, pad=0, pad_mode="reflect", power=1., mono=True, **kwargs):
+                window=None, pad=0, pad_mode="reflect", power=1., mono=False, **kwargs):
     """
     Get spectrogram module.
 
@@ -330,8 +330,8 @@ def Spectrogram(fft_len=2048, hop_len=None, frame_len=None,
             **kwargs),
         ComplexNorm(power))
     if mono:
-        modules.add_module('downmix', SpectralDownmix(power))
+        modules.add_module('downmix', DownmixSpectrum())
     return modules
 
 
 def Melspectrogram(