# superman.py
from essentia.standard import *
from scikits.samplerate import resample
from math import sqrt, ceil
from random import random
import numpy
import scikits.audiolab as audiolab
import sys
import essentia
import os
import json
import argparse
import time
import datetime
parser = argparse.ArgumentParser(description='Make an alternative version of a "target" audio file by concatenating chunks from a number of other "source" audio files.')
parser.add_argument('target_file', type=str, help='Filepath of the target file.')
parser.add_argument('--tempo', type=float, default=90.0, help='Splits target (and source) files in units with length equal to a beat of this tempo (default is 90.0).')
parser.add_argument('--random', type=float, default=0.0, help='Amount of randomization in unit selection (default is 0.0).')
parser.add_argument('--overlap_percentage', dest='overlap_percentage', type=float, default=0.5, help='Overlapping percentage of target units. Should be in range [0.0, 1.0], default is 0.5.')
parser.add_argument('--overlap_percentage_source', dest='overlap_percentage_source', type=float, default=0.5, help='Overlapping percentage of source units. Should be in range [0.0, 1.0], default is 0.5.')
parser.add_argument('--sources_dir', dest='sources_dir', type=str, default='sources/', help='Directory where to look for source files.')
parser.add_argument('--out_filepath', dest='out_filepath', type=str, default=None, help='Output path of the generated audio file.')
parser.add_argument('--length', dest='length', type=float, default=None, help='Length of the output in seconds.')
parser.add_argument('--force_analyze', dest='force_analyze', action="store_true", help='Force re-analyze all sources/targets.')
parser.add_argument('--write_score', dest='write_score', action="store_true", help='Write "score" of chosen units.')
parser.add_argument('--sample_rate', type=int, default=44100, help='Sample rate of the files (all should be the same, default is 44100).')
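# Example invocation (file names are illustrative, not files shipped with the repository):
#   python superman.py my_target.wav --tempo 120 --sources_dir sources/ --length 60 --write_score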
def time_stats(done, total, starttime):
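    """Given `done` items processed out of `total` since `starttime` (a time.time()
    value), return (elapsed, remaining) as human-readable strings."""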
nowtime = time.time()
position = done*1.0 / total
duration = round(nowtime - starttime)
durdelta = datetime.timedelta(seconds=duration)
remaining = round((duration / position) - duration)
remdelta = datetime.timedelta(seconds=remaining)
return str(durdelta), str(remdelta)
def analyze(filepath, frame_size=1024, hop_size=512, sample_rate=44100):
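    """Compute per-frame descriptors (energy and a 36-bin HPCP) for an audio file.

    Returns a dict with 'energy', 'hpcp', 'n_frames', 'indexes' (start/end sample of
    every frame) and 'filepath', all JSON-serializable.
    """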
loader = MonoLoader(filename=filepath, sampleRate=sample_rate)
audio = loader()
w = Windowing(type = 'blackmanharris62')
peaks = SpectralPeaks(maxPeaks=10000, maxFrequency=5000, minFrequency=40)
hpcp = HPCP(harmonics=8, maxFrequency=5000, minFrequency=40, size=36, windowSize=0.5)
energy = Energy()
hpcps = []
energys = []
indexes = []
idx = 0
for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size, startFromZero=True, lastFrameToEndOfFile=False):
e = energy(w(frame))
energys.append(e)
h = hpcp(*peaks(w(frame)))
        h = [float(e) for e in h]  # convert to a plain Python list so it can be serialized to JSON
        hpcps.append(h)
indexes.append((idx, idx + frame_size))
idx += hop_size
return {
'energy': energys,
'hpcp': hpcps,
'n_frames': len(energys),
'indexes': indexes,
'filepath': filepath,
}
def analyze_file_or_load_analysis(filepath, frame_size, hop_size, sample_rate):
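    """Return the cached analysis for `filepath` if one exists for these settings
    (and --force_analyze was not given), otherwise analyze the file and cache the
    result next to it as JSON."""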
analysis_filepath = filepath + '.analysis_%i_%i_%i.json' % (frame_size, hop_size, sample_rate)
if not os.path.exists(analysis_filepath) or force_analyze:
# If analysis does not exist for given (frame_size, hop_size, sample_rate), run it
print 'Analyzing:', filepath
file_analysis = analyze(filepath, frame_size=frame_size, hop_size=hop_size, sample_rate=sample_rate)
        with open(analysis_filepath, 'w') as f:
            json.dump(file_analysis, f)
else:
        with open(analysis_filepath) as f:
            file_analysis = json.load(f)
return file_analysis
def calcHPCPDist(tgtHPCP,srcHPCP):
    """Compute the Euclidean distance between target and source HPCP vectors at every
    circular shift in [-18, +18] bins, and return (bestShiftBin, bestDist)."""
nBins = len(tgtHPCP)
    if nBins != len(srcHPCP):
        raise ValueError("cannot compare arrays of different length")
    if nBins != 36:
        raise ValueError("expected 36 HPCP bins per frame")
maxShiftBin = 18
bestShiftBin = 0.0
bestDist = 100000.0
startShiftBin = -maxShiftBin
endShiftBin = maxShiftBin
for shiftBin in range(startShiftBin,endShiftBin+1):
# calculate the distance between ref and cmp,
# with a shift of shiftBin bins
dist = 0.0
corr = 0.0
for bin in range(0,nBins):
            # for each bin, at the current shift, add its squared difference
            # to the distance
tgtBin = bin
srcBin = (bin + shiftBin + nBins) % nBins
tgtVal = tgtHPCP[tgtBin]
srcVal = srcHPCP[srcBin]
d = tgtVal - srcVal
dist += d*d
#corr += (tgtVal-.5) * (srcVal-.5)
dist = sqrt(dist)
#dist = (nBins*.5*.5)-corr
# was this the best so far?
if dist < bestDist or (dist == bestDist and abs(shiftBin) < abs(bestShiftBin)):
# yes: we remember it
bestDist = dist
bestShiftBin = shiftBin
# return the best shift and distance found
return (bestShiftBin,bestDist)
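# Illustrative sanity check (not part of the original script): a source HPCP that is a
# circular rotation of the target should match best at that rotation, with zero distance:
#   tgt = [1.0] + [0.0] * 35
#   src = tgt[-3:] + tgt[:-3]   # rotate right by 3 bins
#   calcHPCPDist(tgt, src)      # expected result: (3, 0.0)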
def morph(source_analysis, target_analysis, outFilepath, windowsize=8192, hopsize=2048, max_output_length=None, rand_amount=0.3):
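    """Rebuild the target audio out of source units.

    For every target analysis frame, find the source frame whose HPCP (with energy as
    a tie-break) is closest, overlap-add the corresponding source audio into an output
    buffer, impose the target's energy envelope on the result and write it to
    `outFilepath`. Returns the score as a list of
    (target_time, source_filename, source_time) tuples.
    """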
starttime = time.time()
output_score = []
# load target audio file and analysis
targetsf = audiolab.Sndfile(target_analysis['filepath'],'r')
samplerate = targetsf.samplerate
t_energy = numpy.array(target_analysis['energy'])
t_hpcps = numpy.array([numpy.array(e) for e in target_analysis['hpcp']])
# load source audio files and analysis
src_audio_files = dict()
s_energy = numpy.array([])
s_hpcps = numpy.zeros((0,t_hpcps.shape[1]))
s_analysis_index = []
for element in source_analysis:
# load audio file
sourcesf = audiolab.Sndfile(element['filepath'],'r')
file_frames = sourcesf.read_frames(sourcesf.nframes)
src_audio_files[element['filepath']] = file_frames
# concatenate analysis in single arrays
s_energy = numpy.append(s_energy, numpy.array(element['energy']))
hpcps = numpy.array([numpy.array(e) for e in element['hpcp']])
s_hpcps = numpy.append(s_hpcps, hpcps, axis=0)
for frame_bounds in element['indexes']:
s_analysis_index.append((element['filepath'], frame_bounds[0], frame_bounds[1]))
#init variables, generate the window(s) for the overlap-add
nchannels = targetsf.channels
window = 0.5-numpy.cos(2.*numpy.pi*numpy.arange(0,windowsize)/float(windowsize))/2
stereowindow = numpy.zeros((windowsize,nchannels))
for i in range(0,nchannels):
stereowindow[:,i] = window
#print 'Source analysis array shapes:', s_energy.shape, s_hpcps.shape
#print 'Target analysis array shapes:', t_energy.shape, t_hpcps.shape
#######################################################################
# load energy and hpcps: target
tgtEs = [[x] for x in t_energy]
tgtHPCPs = t_hpcps
ttimes = []
t = 0
for x in tgtEs:
ttimes.append(t)
t += hopsize
# load energy and hpcps: source
srcEs = [[x] for x in s_energy]
srcHPCPs = s_hpcps
stimes = []
t = 0
for x in srcEs:
stimes.append(t)
t += hopsize
# create empty output arrays
outsize = targetsf.nframes + windowsize
out = numpy.zeros((outsize,nchannels))
lastBestIndex = -1
# go through all target units and find best match
if max_output_length:
max_analysis_frames = int(ceil((max_output_length * samplerate) / hopsize))
else:
max_analysis_frames = len(tgtHPCPs)
for tgtHPCPIndex in range(0,max_analysis_frames):
_, remaining = time_stats(tgtHPCPIndex + 1, max_analysis_frames, starttime)
sys.stdout.write('\rFinding units for frames [%i/%i] - %s remaining' % (tgtHPCPIndex + 1, max_analysis_frames, remaining))
sys.stdout.flush()
tgtHPCP = tgtHPCPs[tgtHPCPIndex]
# and for each, look for the source HPCP that matches best
bestIndex = 0
bestShiftBin = 0
bestDist = 100000.0
        bestEDist = 100000.0  # FOR ENERGY
        #bestEindex = 0  # FOR ENERGY
for srcHPCPIndex in range(0,len(srcHPCPs)):
# COMPUTE HPCP DIST
srcHPCP = srcHPCPs[srcHPCPIndex]
(shiftBin,hpcpDist) = calcHPCPDist(tgtHPCP,srcHPCP)
            hpcpDist += random() * rand_amount  # add some randomness to introduce more variation
dist = hpcpDist
# COMPUTE ENERGY DIST
            tgtE = tgtEs[tgtHPCPIndex]
            srcE = srcEs[srcHPCPIndex]
            eDist = abs(tgtE[0] - srcE[0])
#eIndex=srcHPCPIndex
#if eDist < bestEDist :
# bestEindex = srcHPCPIndex
# bestEDist = eDist
            # Was this the best so far? A candidate wins if its HPCP distance is smaller;
            # ties are broken first by smaller absolute shift, then by smaller energy difference.
if dist < bestDist or (dist == bestDist and abs(shiftBin) < abs(bestShiftBin)) or (dist == bestDist and abs(shiftBin) == abs(bestShiftBin) and eDist < bestEDist):
#if eDist < bestEDist:
###################################
# Avoid repeating frames
                if srcHPCPIndex == lastBestIndex:
                    # If this candidate is the same unit chosen for the previous target frame,
                    # skip it to avoid immediate repetition.
                    pass  #print "repetition avoided"
else:
bestIndex = srcHPCPIndex
bestDist = dist
                    bestShiftBin = 0  # shiftBin intentionally ignored: NEVER SHIFT!
bestEDist = eDist
###################################
# Combine ENERGY and HPCP information: we take the closer index to the target (just as an example)
#hpcpDiff=abs(tgtHPCPIndex-bestIndex)
#eDiff=abs(tgtHPCPIndex-bestEindex)
#usedEnergy=0;
#if hpcpDiff > eDiff :
# bestIndex = bestEindex
# usedEnergy = 1
#else:
# bestIndex = bestIndex
# usedEnergy = 0
lastBestIndex = bestIndex
# add info to the score
s_filename, srcPos_0, srcPos_1 = s_analysis_index[bestIndex]
time_target_unit = (tgtHPCPIndex * hopsize * 1.0)/samplerate
time_source_unit = (srcPos_0 * 1.0)/samplerate
output_score.append((time_target_unit, s_filename, time_source_unit))
# get corresponding audio unit from source
src_data_array = src_audio_files[s_filename]
#print s_filename, srcPos_0, srcPos_1, src_data_array.shape
sbuf = numpy.zeros((windowsize,nchannels))
for channel in range(nchannels):
if len(src_data_array.shape) == 2:
                data = src_data_array[srcPos_0:srcPos_1, min(channel, src_data_array.shape[1] - 1)]
sbuf[:,channel] = numpy.pad(data, (0, len(sbuf) - len(data)), 'constant', constant_values=0.0)
else:
data = src_data_array[srcPos_0:srcPos_1]
sbuf[:,channel] = numpy.pad(data, (0, len(sbuf) - len(data)), 'constant', constant_values=0.0)
sbuf *= stereowindow
# change the pitch of the selected audio according to the shift
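        # (one HPCP bin out of 36 is a third of a semitone; a 36-bin shift would be a full
        #  octave. Since bestShiftBin is forced to 0 above, the ratio is 1.0 and no pitch
        #  change actually occurs.)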
resampling = 2.**(float(bestShiftBin)/float(len(srcHPCPs[bestIndex])))
resamplingtype = 'linear'
rsbufs = []
for channel in range(nchannels):
rsbufs.append(resample(sbuf[:,channel], resampling, resamplingtype))
rsbuf = numpy.zeros((len(rsbufs[0]),nchannels))
for channel in range(nchannels):
rsbuf[:,channel] = rsbufs[channel]
rl = len(rsbuf)
tgtPos = int(ttimes[tgtHPCPIndex])
#tgtPos -= rl/2;
#tgtPos = max(0,min(outsize-rl,tgtPos))
# add the audio to the output buffer
out[tgtPos:tgtPos+rl] += rsbuf*0.5
# compute energy of target and output audios
# and modify the output audio by applying an envelope so it follows the target audio
print '\nApplying energy envelope...'
    winsize = 2048 + 1
    hwinsize = (winsize - 1) / 2
    hopsize = 128  # hop size for the envelope pass (reuses the name from here on)
tmp = numpy.hanning(winsize)
synthwin = numpy.zeros((winsize,nchannels))
for channel in range(nchannels):
synthwin[:,channel] = tmp
out2 = numpy.zeros((len(out),nchannels))
out2env = numpy.zeros((len(out),nchannels))
if max_output_length is not None:
outsize = int(max_output_length*samplerate)
for pos in range(0,outsize,hopsize):
targetsf.seek(max(0,min(targetsf.nframes-winsize,pos-hwinsize)))
tmp = targetsf.read_frames(winsize)
tbuf = numpy.zeros((winsize,nchannels))
for channel in range(nchannels):
if len(tmp.shape) == 2:
                tbuf[:,channel] = tmp[:, min(channel, tmp.shape[1] - 1)]
else:
tbuf[:,channel] = tmp
tmp = max(0,min(outsize-winsize,pos-hwinsize))
obuf = out[tmp:tmp+winsize].copy()
obuf *= synthwin
tbuf *= synthwin
tenergy = numpy.sum( numpy.power(tbuf,2) )
oenergy = numpy.sum( numpy.power(obuf,2) )
obuf *= sqrt(tenergy/(oenergy+.000000001))
out2[tmp:tmp+winsize] += obuf
out2env[tmp:tmp+winsize] += synthwin
out2env += 0.0000000001 # this is to avoid divide warning
out2 /= out2env
out2 = numpy.clip(out2,-1,1)
if max_output_length is not None:
out2 = out2[:int(max_output_length*samplerate)]
    out_format = audiolab.Format('wav', 'pcm16')
    sfout = audiolab.Sndfile(outFilepath, 'w', out_format, nchannels, targetsf.samplerate)
sfout.write_frames(out2)
sfout.close()
print 'Done! File written in:', outFilepath
return output_score
# PARSE ARGS AND DO THE MORPHING
args = parser.parse_args()
tempo = args.tempo
target_filepath = args.target_file
sample_rate = args.sample_rate
force_analyze = args.force_analyze
frame_size = int(round(sample_rate * 60.0 / tempo))  # one beat at `tempo` BPM, in samples
if args.overlap_percentage < 0.0 or args.overlap_percentage > 1.0:
    raise Exception('--overlap_percentage should be in range [0.0, 1.0]')
hop_size = int(round(frame_size * (1.0 - args.overlap_percentage)))
if hop_size < 1:
hop_size = 1
if args.overlap_percentage_source < 0.0 or args.overlap_percentage_source > 1.0:
    raise Exception('--overlap_percentage_source should be in range [0.0, 1.0]')
hop_size_source = int(round(frame_size * (1.0 - args.overlap_percentage_source)))
if hop_size_source < 1:
hop_size_source = 1
max_output_length = None
if args.length is not None:
max_output_length = abs(args.length)
write_score = args.write_score
SOURCE_DIR = args.sources_dir
# Analyze target
target_analysis = analyze_file_or_load_analysis(target_filepath, frame_size, hop_size, sample_rate)
print 'Loaded analysis for target file (%i target units)' % target_analysis['n_frames']
# Analyze source files
source_analysis = []
for filename in os.listdir(SOURCE_DIR):
filepath = os.path.join(SOURCE_DIR, filename)
if filepath.endswith('.json') or filepath.endswith('.DS_Store'):
continue
file_analysis = analyze_file_or_load_analysis(filepath, frame_size, hop_size_source, sample_rate)
source_analysis.append(file_analysis)
print 'Loaded analysis for %i source audio files (%i source units)' % (len(source_analysis), sum([value['n_frames'] for value in source_analysis]))
# Do the morphing
print 'Will do morphing for file "%s" at tempo %.2f' % (target_filepath, tempo)
date_label = datetime.datetime.today().strftime("%Y-%m-%d_%H.%M")
if args.out_filepath is None:
out_filepath = target_filepath + '.morphed_%.2f.%s.wav' % (tempo, date_label)
else:
out_filepath = args.out_filepath
if not out_filepath.endswith('.wav'):
out_filepath += '.wav'
score = morph(source_analysis, target_analysis, out_filepath, windowsize=frame_size, hopsize=hop_size, max_output_length=max_output_length, rand_amount=args.random)
if write_score:
    score_filename = out_filepath + '.score.csv'
    with open(score_filename, 'w') as fid:
        fid.write('Target time;Source time;Source file\n')
        for (target_time, source_filename, source_time) in score:
            fid.write('%.2fs;\t%.2fs;\t%s\n' % (target_time, source_time, source_filename))