-
Notifications
You must be signed in to change notification settings - Fork 1
/
audio_processing.py
112 lines (88 loc) · 3.67 KB
/
audio_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import numpy as np
import torch
import warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore")
import torchaudio # we are using sox as backend
torchaudio.set_audio_backend("sox_io")
from torchaudio.transforms import GriffinLim, Resample, MelScale
from scipy import signal
import librosa
SAMPLING_RATE = 22050
N_FFT = 2048 # fft points (samples)
FRAME_SHIFT = 0.0125 # seconds
FRAME_LENGTH = 0.05 # seconds
HOP_LENGTH = int(SAMPLING_RATE * FRAME_SHIFT) # samples. =276.
WIN_LENGTH = int(SAMPLING_RATE * FRAME_LENGTH) # samples. =1102.
N_MELS = 80 # Number of Mel banks to generate
N_MAGS = 1 + N_FFT // 2
POWER = 1.5 # Exponent for amplifying the predicted magnitude
N_ITER = 50 # Number of inversion iterations
PREEMPHASIS = .97
MAX_DB = 100
REF_DB = 20
REDUCTION_FACTOR = 4
DB_TRIM_THRESHOLD = 70 # more than default to be sure no speech audio is cut
def get_spectrograms(fpath):
"""Parse the wave file in `fpath` and
Returns normalized melspectrogram and linear spectrogram.
Args:
fpath: A string. The full path of a sound file.
Returns:
mel: A 2d array of shape (n_mels, T) and dtype of float32.
mag: A 2d array of shape (n_mags, T) and dtype of float32.
"""
# Loading sound file
y, sr = librosa.load(fpath, sr=SAMPLING_RATE)
# removes silence at beginning and end of audio
y, _ = librosa.effects.trim(y, top_db=DB_TRIM_THRESHOLD, frame_length=WIN_LENGTH, hop_length=HOP_LENGTH)
# Preemphasis
y = np.append(y[0], y[1:] - PREEMPHASIS * y[:-1])
# stft
linear = librosa.stft(y=y, n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=WIN_LENGTH)
# magnitude spectrogram
mag = np.abs(linear) # (n_mags, T)
# mel spectrogram
mel_basis = librosa.filters.mel(SAMPLING_RATE, N_FFT, N_MELS) # (n_mels, n_mags)
mel = np.dot(mel_basis, mag) # (n_mels, t)
# to decibel
mel = 20 * np.log10(np.maximum(1e-5, mel))
mag = 20 * np.log10(np.maximum(1e-5, mag))
# normalize
mel = np.clip((mel - REF_DB + MAX_DB) / MAX_DB, 1e-8, 1)
mag = np.clip((mag - REF_DB + MAX_DB) / MAX_DB, 1e-8, 1)
return mel.astype(np.float32), mag.astype(np.float32)
def spectrogram2wav(mag):
""" Generate wave file from linear magnitude spectrogram
Args:
mag: A tensor of (n_mags, T)
Returns:
wav: A 1-D numpy array.
"""
# de-noramlize
mag = (torch.clamp(mag, 0, 1) * MAX_DB) - MAX_DB + REF_DB
# to amplitude
mag = torch.pow(10.0, mag * 0.05)
# griffin-lim
with warnings.catch_warnings():
warnings.simplefilter("ignore") # deprecation warnings depending on GriffinLim implementation
wav = GriffinLim(n_fft=N_FFT, win_length=WIN_LENGTH, hop_length=HOP_LENGTH, power=1/POWER, normalized=True)(mag)
# de-preemphasis
wav = signal.lfilter([1], [1, -PREEMPHASIS], wav.numpy()) # for some reason, this is much faster than
# torchaudio.functional.lfilter(wav, torch.tensor([1, -PREEMPHASIS], dtype=torch.float32),
# torch.tensor([1, 1], dtype=torch.float32))
# trim
wav, _ = librosa.effects.trim(wav, frame_length=WIN_LENGTH, hop_length=HOP_LENGTH)
return wav.astype(np.float32)
def load_spectrograms(fpath):
"""Read the wave file in `fpath`
and extracts spectrograms, mel spectrograms time dimention is reduced"""
mel, mag = get_spectrograms(fpath)
t = mel.shape[1]
# Marginal padding for reduction shape sync.
num_paddings = REDUCTION_FACTOR - (t % REDUCTION_FACTOR) if t % REDUCTION_FACTOR != 0 else 0
mel = np.pad(mel, [[0, 0], [0, num_paddings]], mode="constant")
mag = np.pad(mag, [[0, 0], [0, num_paddings]], mode="constant")
# Reduction
mel = mel[:, ::REDUCTION_FACTOR]
return mel, mag