-
Notifications
You must be signed in to change notification settings - Fork 0
/
spectrogram.py
118 lines (76 loc) · 3.69 KB
/
spectrogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import librosa, librosa.display
import resampy
import soundfile as sf
import pickle
from vad import vad
# Follows the Google Research VGGish (AudioSet) approach.
def generate_log_spectrogram(filepath,outpath,sr=16000,duration=5.0,segment=1.0,pre_emphasis=None,
                             n_mels=64,fmin=0,fmax=None,n_fft=256, hop_length=128, debug=False,VAD=False,
                             save_as_filetype=0):
    """Cut an audio file into segments and build a scaled log-mel spectrogram for each.

    Segment ``i`` is loaded from offset ``i * ceil(segment)`` seconds and is
    ``segment`` seconds long, so the stride between segments is always a whole
    number of seconds. ``int(duration / ceil(segment))`` segments are processed.
    Each spectrogram is converted to dB relative to its own maximum and then
    min-max scaled to the range 0..255.

    Args:
        filepath: Path of the audio file to read.
        outpath: Directory that receives the per-segment files when
            ``save_as_filetype`` is 1 or 2.
        sr: Target sampling rate handed to ``librosa.load``.
        duration: Number of seconds of audio to cover. ``None`` loads the whole
            file once to measure its length.
        segment: Length in seconds of each loaded segment.
        pre_emphasis: Optional coefficient ``a`` for the filter
            ``y[n] = x[n] - a * x[n-1]``; ``None`` disables the filter.
        n_mels, fmin, fmax, n_fft, hop_length: Forwarded unchanged to
            ``librosa.feature.melspectrogram`` (called with ``center=False``).
        debug: Print progress information when true.
        VAD: When true, ``vad(...)[0]`` is computed for every segment and
            returned as the segment's flag. It does not filter segments: a
            spectrogram is produced regardless of the flag.
        save_as_filetype: 0 - do not save, 1 - save a PNG rendering,
            2 - save the array as ``.npy``. Any other value behaves like 0.
            Files are named ``<audio file stem>_<segment number><suffix>``
            with 1-based segment numbers.

    Returns:
        A list of ``(melgram, index, vocal)`` tuples, one per segment that was
        processed successfully. ``index`` is the 0-based segment number and
        ``vocal`` is ``True`` unless ``VAD`` is enabled. Segments are left out
        when their output file already exists, when the spectrogram is
        constant (it cannot be scaled), or when processing raised an error.
    """
    fn = Path(filepath).name
    stem = Path(fn).stem
    # Single source of truth for the file type, so the "already exists" check
    # and the writer can never disagree about the extension.
    suffix = {1: '.png', 2: '.npy'}.get(save_as_filetype)

    step = np.ceil(segment)
    if duration is None:
        data, sampling_rate = librosa.load(filepath, sr=sr)
        # Use the rate that was actually returned so that sr=None also works.
        duration = librosa.get_duration(y=data, sr=sampling_rate)
    num_imgs = int(duration / step)
    melgrams = []
    if debug:
        print(filepath, fn)
        print(duration, segment, step, num_imgs)

    for i in range(num_imgs):
        img_filepath = None
        if suffix is not None:
            img_filepath = str(Path(outpath) / '{}_{}{}'.format(stem, i + 1, suffix))
            # A segment that was already written is skipped entirely.
            if Path(img_filepath).exists():
                continue
        try:
            data, sampling_rate = librosa.load(filepath, sr=sr, offset=i * step, duration=segment)
            if pre_emphasis is not None:
                data = np.append(data[0], data[1:] - pre_emphasis * data[:-1])
            vocal = True
            if VAD:
                # NOTE(review): element 0 of vad()'s result is presumably a
                # voiced/unvoiced flag -- confirm against the vad module.
                vocal = vad(data, sr=sampling_rate, segment=segment)[0]
            melspectrogram = librosa.feature.melspectrogram(y=data, sr=sampling_rate, n_mels=n_mels,
                                                            fmin=fmin, fmax=fmax,
                                                            center=False,
                                                            n_fft=n_fft, hop_length=hop_length)
            melgram = librosa.power_to_db(melspectrogram, ref=np.max)
            if debug:
                print(i, i * step, vocal, melgram.shape)
            mmin = np.min(melgram)
            mmax = np.max(melgram)
            if mmax == mmin:
                # A constant spectrogram would divide by zero when scaled.
                print(filepath, fn)
                print(melgram)
                continue
            melgram = (melgram - mmin) / (mmax - mmin) * 255
            melgrams.append((melgram, i, vocal))
            if save_as_filetype == 1:
                plt.figure()
                try:
                    plt.axis('off')
                    librosa.display.specshow(melgram)
                    plt.savefig(img_filepath)
                finally:
                    # Always release the figure, even if rendering fails.
                    plt.close()
            elif save_as_filetype == 2:
                np.save(img_filepath, melgram)
        except Exception as err:
            # Best effort: report the failing segment and carry on with the
            # next one, without swallowing KeyboardInterrupt/SystemExit.
            print("Reading audio file {} has error: {}".format(filepath, err))
    return melgrams
if __name__ == '__main__':
    # Manual smoke run: process one sample clip with debug output and VAD
    # enabled, then dump the resulting list to stdout.
    demo_options = dict(
        sr=8000,
        segment=4.1,
        pre_emphasis=None,
        n_mels=64,
        fmin=0,
        fmax=None,
        n_fft=512,
        hop_length=512,
        debug=True,
        VAD=True,
    )
    print(generate_log_spectrogram('./1-22694-A.ogg', './', **demo_options))