# micplacementconvnet_full.py

# -*- coding: utf-8 -*-
"""MicPlacementConvNet.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1HZdAsF77IqUOvP6MaweNqCxOr8JBugD7
"""

# Mount Google Drive inside the Colab VM; the dataset JSON and the saved model
# used further down are addressed through paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

!pip install comet_ml

from comet_ml import Experiment
import keras
from keras.models import Model, Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Dropout, Activation, Flatten,  RepeatVector, TimeDistributed, MaxPooling1D, UpSampling1D
from keras.layers import LSTM, Dense, Dropout, Input, concatenate, Conv1D, LeakyReLU
from keras.callbacks import ModelCheckpoint
import numpy as np
import librosa
import matplotlib.pyplot as plt
from librosa import display
from sklearn import preprocessing
import IPython.display as ipd
import datetime

class callback(keras.callbacks.Callback):
    """Keras callback that records batch losses and previews predictions each epoch.

    After every epoch, ``num_tests`` random validation examples are pushed
    through the model. For each one the input, the ground truth and the
    prediction are plotted and, optionally, shown as playable audio widgets.

    Args:
        x_val: Validation inputs, indexed as ``x_val[i, :, :]``.
        y_val: Validation targets, indexed the same way as ``x_val``.
        model: Model whose ``predict`` produces the previews.
        num_tests: Number of random validation examples previewed per epoch.
        audio_preview: When True, audio players are displayed for each preview.
        sr: Sample rate handed to the audio players.
    """

    def __init__(self, x_val, y_val, model, num_tests=1, audio_preview=True, sr=44100):
        super().__init__()
        self.losses = []  # training loss of every finished batch, in order
        # Attach the model through the base-class hook rather than assigning
        # the attribute directly, so the base class decides how it is stored.
        self.set_model(model)
        self.x_val = x_val
        self.y_val = y_val
        self.num_examples = x_val.shape[0]
        self.num_tests = num_tests
        self.sr = sr
        self.audio_preview = audio_preview

        # Enable when the targets were generated with gen_dataset(difference_mask=True).
        self.difference_mask = False

    def random_sample(self):
        """Return one randomly chosen ``(x, y)`` validation pair."""
        rand_idx = np.random.randint(0, high=self.num_examples)
        return self.x_val[rand_idx, :, :], self.y_val[rand_idx, :, :]

    def on_epoch_end(self, epoch, logs=None):
        """Plot (and optionally play) predictions for random validation examples."""
        for _ in range(self.num_tests):
            x, y = self.random_sample()
            # predict() wants a batch axis and a channel axis: (1, samples, 1).
            y_p = self.model.predict(x.reshape((1, x.shape[0], 1)))
            x = np.squeeze(x)
            y = np.squeeze(y)
            y_p = np.squeeze(y_p)

            if self.difference_mask:
                # gen_dataset stores difference targets as (x - y), so the
                # actual signal is recovered by subtracting from the input.
                y = x - y
                y_p = x - y_p

            print('x/y_p diff:')
            print(abs(np.sum(x) - np.sum(y_p)))

            print('x vs predicted y')
            plt.plot(x, color='red')
            plt.plot(y_p)
            plt.show()

            print('ground truth y vs predicted y')
            plt.plot(y, color='red')
            plt.plot(y_p)
            plt.show()

            if self.audio_preview:
                print('input sample:')
                ipd.display(ipd.Audio(x, rate=self.sr, autoplay=False))
                print('ground truth:')
                ipd.display(ipd.Audio(y, rate=self.sr, autoplay=False))
                print('prediction')
                ipd.display(ipd.Audio(y_p, rate=self.sr, autoplay=False))

    def on_batch_end(self, batch, logs=None):
        """Append the batch's training loss (``None`` if not reported) to ``losses``."""
        logs = logs or {}
        self.losses.append(logs.get('loss'))

import librosa
import numpy as np

def extract_transients(audio, sr, ws, start_pad, hop=512, backtrack=True):
  """Slice ``audio`` into overlapping windows and detect onset frame indices.

  Args:
    audio: 1-D audio signal.
    sr: Sample rate of ``audio``.
    ws: Length in samples of every returned window.
    start_pad: Unused; kept so existing callers keep working.
    hop: Hop length in samples. The same value drives onset detection and
      framing, which is what lets callers index the windows with the onsets.
    backtrack: Forwarded to ``librosa.onset.onset_detect``; when True each
      onset is moved back to the preceding energy minimum.

  Returns:
    Tuple ``(frames, beats)``. ``frames`` holds one window of ``ws`` samples
    per row; ``beats`` holds the detected onset frame indices that are valid
    row indices into ``frames``.
  """
  # grab onset times; backtrack detects minimum before transient
  beats = librosa.onset.onset_detect(y=audio, sr=sr, units='frames', hop_length=hop, backtrack=backtrack)
  frames = librosa.util.frame(audio, frame_length=ws, hop_length=hop).T

  # Onsets are reported over the whole signal, but only windows that fit
  # completely inside it exist. Drop onsets that fall past the last window so
  # indexing `frames` with `beats` cannot raise an IndexError.
  beats = beats[beats < frames.shape[0]]

  return frames, beats

# verify transients are at same position
def correlate_transients(x, y):
  """Return the sorted, unique onset indices that occur in both ``x`` and ``y``.

  The sizes of both inputs and of the shared set are printed as a quick
  sanity check on how well the two onset lists line up.
  """
  common = np.intersect1d(x, y)
  n_x, n_y, n_common = len(x), len(y), len(common)
  print(f'len x {n_x} len y {n_y} len shared {n_common}')
  return common

def analyze_contrast(block, sr=44100):
  """Return the mean spectral contrast of an audio block as a single scalar.

  The magnitude STFT of ``block`` is handed to
  ``librosa.feature.spectral_contrast`` and the result is averaged over all
  of its elements.
  """
  magnitude = np.abs(librosa.stft(block))
  return np.mean(librosa.feature.spectral_contrast(S=magnitude, sr=sr))

def analyze_envelope(rms_blocks, plot_curves=False):
  """Tell whether the energy in ``rms_blocks`` leans toward the start.

  The values are weighted once with a ramp falling from 1 to 0 and once with
  a ramp rising from 0 to 1 along the second axis; the means of the two
  weighted copies are then compared.

  Args:
    rms_blocks: 2-D array of RMS values; the ramps run along axis 1.
    plot_curves: When True, plot both weighted curves (start-weighted in red).

  Returns:
    True if the start-weighted mean is strictly greater than the end-weighted
    mean, otherwise False.
  """
  n_cols = rms_blocks.shape[1]
  fade_out = np.linspace(1, 0, num=n_cols)  # emphasises the beginning
  fade_in = np.linspace(0, 1, num=n_cols)   # emphasises the end

  head_weighted = rms_blocks * fade_out
  tail_weighted = rms_blocks * fade_in

  if plot_curves:
    head_len = head_weighted.shape[0] * head_weighted.shape[1]
    tail_len = tail_weighted.shape[0] * tail_weighted.shape[1]
    plt.plot(head_weighted.reshape((head_len)), color='red')
    plt.plot(tail_weighted.reshape((tail_len)))
    plt.show()

  return bool(np.mean(head_weighted) > np.mean(tail_weighted))

def validate_transients(x, y, sr=44100, visualize_rejects=False):
  """Keep only the window pairs whose target looks like a clean transient.

  A pair is accepted when its target window (a) has more RMS energy weighted
  toward its start than toward its end, as decided by ``analyze_envelope``,
  and (b) has a mean RMS above the median RMS of all target windows laid end
  to end.

  Args:
    x: 2-D array of input windows, one per row.
    y: 2-D array of target windows, row-aligned with ``x``.
    sr: Sample rate, used only when previewing rejected pairs.
    visualize_rejects: When True, plot and play the rejected pairs.

  Returns:
    Tuple ``(x_valid, y_valid)`` of arrays holding the accepted windows.
  """
  # Reference level: median RMS across the whole set of target windows.
  rms_total = librosa.feature.rms(y=y.reshape((y.shape[0] * y.shape[1])))
  avg_rms = np.median(rms_total)

  x_valid = []
  y_valid = []

  rejects_x = []
  rejects_y = []

  for X, Y in zip(x, y):
    rms_blocks = librosa.feature.rms(y=Y)
    this_avg = np.mean(rms_blocks)

    # determine if transient happens in first half
    envelope_skew = analyze_envelope(rms_blocks)

    if envelope_skew and this_avg > avg_rms:
      x_valid.append(X)
      y_valid.append(Y)
    else:
      rejects_x.append(X)
      rejects_y.append(Y)

  x_valid = np.asarray(x_valid)
  y_valid = np.asarray(y_valid)
  print(f'valid samples {x_valid.shape[0]}')

  if visualize_rejects:
    # Preview with this function's own `sr` argument instead of depending on
    # a module-level variable that may not exist when this is called.
    visualize_audio_data(np.asarray(rejects_x), np.asarray(rejects_y), sr=sr) #see what is rejected

  return x_valid, y_valid
def visualize_audio_data(data_x, data_y, sr=44100):
  """Plot and play paired windows from ``data_x`` and ``data_y`` one pair at a time.

  For each pair both waveforms are drawn on one figure (``y`` in red) and an
  audio player is displayed for each of them at sample rate ``sr``.
  """
  for x_win, y_win in zip(data_x, data_y):
    # Waveforms first: x in the default colour, y overlaid in red.
    for label, signal, style in (('x data:', x_win, {}),
                                 ('y data:', y_win, {'color': 'red'})):
      print(label)
      plt.plot(signal, **style)
    plt.show()

    # Then the audio players, in the same x-then-y order.
    for label, signal in (('x data:', x_win), ('y data:', y_win)):
      print(label)
      ipd.display(ipd.Audio(signal, rate=sr, autoplay=False))
    plt.show()
    print('\n')

def gen_dataset(data:"JSON database",
                ws:"window size",
                x_key,
                y_key,
                normalize_stems=False,
                normalize_transients=False,
                max_examples=100,
                sample_rate=44100,
                difference_mask=False): #"difference_mask = output y as (x - y)"
  """Build aligned (x, y) transient windows from a database of stem files.

  For every entry of ``data`` the first file listed under ``x_key`` and the
  first file listed under ``y_key`` are loaded, cut into windows of ``ws``
  samples, and reduced to the windows that start on an onset detected in both
  files and that pass ``validate_transients``.

  Args:
    data: Mapping whose values map ``x_key`` / ``y_key`` to lists of file paths.
    ws: Window size in samples.
    x_key: Key of the input stem list inside each entry.
    y_key: Key of the target stem list inside each entry.
    normalize_stems: Normalize each whole stem before windowing.
    normalize_transients: Normalize every window individually.
    max_examples: Maximum number of database entries to examine.
    sample_rate: Rate the audio is loaded at.
    difference_mask: When True the target is stored as ``x - y``.

  Returns:
    Tuple ``(x_train, y_train)`` of arrays with one window per row.
  """
  x_train = []
  y_train = []

  for i, entry in enumerate(data.values()):
    # Check the cap before touching the entry so exactly `max_examples`
    # entries are examined, including when an earlier entry failed to load.
    if i >= max_examples:
      break

    x = entry[x_key]
    y = entry[y_key]
    try:
      if len(x) > 0 and len(y) > 0:

        print(f'loading {x[0]}')
        print(f'loaded {i} of {max_examples}')

        audio_x, sr = librosa.load(x[0], sr=sample_rate, res_type='kaiser_fast')
        audio_y, _ = librosa.load(y[0], sr=sample_rate, res_type='kaiser_fast')

        if normalize_stems: # NORMALIZES ENTIRE STEM, NOT INDIVIDUAL SAMPLES
          audio_x = librosa.util.normalize(audio_x)
          audio_y = librosa.util.normalize(audio_y)

        print('loaded files, analyzing transients')
        frames_x, bx = extract_transients(audio_x, sr, ws, 0)
        frames_y, by = extract_transients(audio_y, sr, ws, 0)

        # Keep only the windows that start on an onset found in both stems.
        idx_shared = correlate_transients(bx, by)

        tx = frames_x[idx_shared]
        ty = frames_y[idx_shared]

        tx, ty = validate_transients(tx, ty, sr=sr) # verify transients are clean

        for x_trans, y_trans in zip(tx, ty):
          if normalize_transients:
            x_trans = librosa.util.normalize(x_trans)
            y_trans = librosa.util.normalize(y_trans)

          if difference_mask: # calculate difference
            y_trans = x_trans - y_trans

          x_train.append(x_trans)
          y_train.append(y_trans)
    except Exception as err:
      # Best effort: one unreadable stem should not abort the whole run, but
      # say why it was skipped. KeyboardInterrupt is deliberately not caught.
      print(f'error with loading file, skipping: {err}')

  x_train = np.asarray(x_train)
  y_train = np.asarray(y_train)

  print(f'x shape {x_train.shape} y shape {y_train.shape}')

  return x_train, y_train

# ======= LOAD IN DATASET ========
import json

cambridge_dataset = '/content/drive/My Drive/Datasets/MultitrackStems/train/cambridge_dataset.json'
win_size = 16384
shuffle_dataset = False

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(cambridge_dataset, encoding='utf-8') as json_file:
    data = json.load(json_file)

print(f'total num stems: {len(data)}')

# Learn the mapping from the overhead microphone stem to the snare stem.
x_key = 'overhead'
y_key = 'snare'

x_train, y_train = gen_dataset(data, win_size, x_key, y_key,
                               normalize_stems=False, normalize_transients=False,
                               max_examples=180)

# preview up to 10 consecutive samples starting at a random position
print(x_train.shape)

preview_count = 10
# Cap the start index so a full run of `preview_count` samples is available
# whenever the dataset is large enough. max(1, ...) keeps randint valid for
# tiny or empty datasets, in which case the preview is simply shorter.
rand_idx = np.random.randint(0, high=max(1, x_train.shape[0] - preview_count + 1))
visualize_audio_data(x_train[rand_idx:rand_idx+preview_count],
                     y_train[rand_idx:rand_idx+preview_count], sr=44100)

# ==== PREPARE DATASET DIMENSIONS ========

val_ratio = 0.15
val_examps = int(len(x_train) * val_ratio) # nominal validation size; the split below holds out val_examps + 1 samples

# Hold out the tail of the still-unshuffled dataset for validation.
split_at = len(x_train) - 1 - val_examps
x_train, x_val = x_train[:split_at], x_train[split_at:]
y_train, y_val = y_train[:split_at], y_train[split_at:]

# Append a trailing axis of size 1: (examples, samples) -> (examples, samples, 1).
x_train, y_train, x_val, y_val = (
    arr.reshape(arr.shape[0], arr.shape[1], 1)
    for arr in (x_train, y_train, x_val, y_val)
)

print(x_train.shape, y_train.shape)
print(x_val.shape, y_val.shape)

# ==== SHUFFLE AND PREPROCESS ============

from sklearn.utils import shuffle

# Each call shuffles its x and y together, so pairs stay aligned; the seeds
# are fixed to keep the order reproducible.
x_train, y_train = shuffle(x_train, y_train, random_state=0)
x_val, y_val = shuffle(x_val, y_val, random_state=42)

# ===== RUN EXPERIMENT ===================
import os

# Credentials must not live in source control: the Comet API key is read from
# the COMET_API_KEY environment variable. When it is unset, None is passed and
# comet_ml is left to resolve the key from its own configuration.
# NOTE(review): the key that used to be embedded here should be treated as
# exposed and rotated.
experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"),
                        project_name="micplacementwavenet", workspace="cm5409a")

# Hyperparameters --------
batch_size = 32
num_epochs = 5
Fc = 24 # num filters per layer (which is multiplied by depth)
sources_to_estimate = 1

# -----------------------

init_x = x_train[0]
init_y = y_train[0]

# Both shapes are (samples per window, 1).
input_shape = (x_train.shape[1], 1)
output_shape = (y_train.shape[1], 1)
main_dim = input_shape[1]

# MODEL =======================================================
# wave-U-net keras implementation, details: https://arxiv.org/pdf/1806.03185.pdf

# NOTES:
# change padding from 'same' to 'causal'

def _build_wave_u_net(input_shape, base_filters, n_sources, depth=12):
  """Build the encoder/decoder graph and return ``(input_layer, output_layer)``.

  Encoder level ``i`` (0-based) applies Conv1D with ``base_filters * (i + 1)``
  filters and kernel size 15, LeakyReLU(0.05), then MaxPooling1D(2). The
  decoder walks back from the deepest level: every level except the deepest
  first concatenates the encoder output of the same level, then applies
  Conv1D with ``base_filters * (i + 1)`` filters, LeakyReLU(0.05) and
  UpSampling1D(2). A final 1x1 convolution maps to ``n_sources`` channels.

  Every pooling step halves the length and every upsampling step doubles it,
  so the input length has to be a multiple of ``2 ** depth`` for the skip
  concatenations to line up. With 16384-sample windows and depth 12 the
  bottleneck is 4 samples long.

  Args:
    input_shape: Shape of one input example, ``(samples, channels)``.
    base_filters: Filter count of the first level; level ``i`` uses
      ``base_filters * (i + 1)``.
    n_sources: Number of output channels.
    depth: Number of encoder levels (and of decoder levels).
  """
  input_layer = Input(shape=input_shape)

  # ---- encoder ----
  skips = []
  net = input_layer
  for level in range(depth):
    net = Conv1D(filters=base_filters*(level+1), kernel_size=15, padding='same')(net)
    net = LeakyReLU(alpha=0.05)(net) # ACTIVATION
    net = MaxPooling1D(pool_size=2)(net) # DOWNSAMPLE
    skips.append(net)

  # ---- decoder ----
  for level in reversed(range(depth)):
    if level < depth - 1:
      net = concatenate([net, skips[level]]) # CONCATENATE SKIP
    # NOTE(review): the outermost decoder level uses kernel_size=1 while all
    # others use 5; kept exactly as found - confirm this is intentional.
    kernel_size = 1 if level == 0 else 5
    net = Conv1D(filters=base_filters*(level+1), kernel_size=kernel_size, padding='same')(net)
    net = LeakyReLU(alpha=0.05)(net) # ACTIVATION
    net = UpSampling1D(size=2)(net) # UPSAMPLE

  output_layer = Conv1D(filters=n_sources, kernel_size=1, padding='same')(net)
  return input_layer, output_layer


input_layer, output_layer = _build_wave_u_net(input_shape, Fc, sources_to_estimate)

model = Model(input_layer, output_layer)
# 'accuracy' carries no information for a regression trained with MSE; it is
# kept only so existing logs keep their keys, and 'mae' is reported as the
# meaningful companion metric.
model.compile(optimizer='adam', loss='mse', metrics=['accuracy', 'mae'])
model.summary()

# First training run: num_epochs epochs, previewing one random validation
# example after every epoch through the callback.
cb = callback(x_val, y_val, model, num_tests=1)

result = model.fit(x_train, 
                   y_train, 
                   batch_size=batch_size,
                   shuffle=True,
                   epochs=num_epochs,
                   validation_data=(x_val, y_val),
                   callbacks=[cb])

# Checkpoint of the model as it stands after the first run.
model.save('/content/drive/My Drive/Datasets/MultitrackStems/models/oh_to_snare_5_epoch.h5')

# ========= CONTINUE TRAINING ==================
# Trains the same model object further, so the weights carry on from the run
# above. `result` is overwritten with the history of this second run.
# NOTE(review): the model is not saved again after this run in the visible code.
epochs_continue = 30
cb = callback(x_val, y_val, model, num_tests=1)

result = model.fit(x_train, 
                   y_train, 
                   batch_size=batch_size,
                   shuffle=True,
                   epochs=epochs_continue,
                   validation_data=(x_val, y_val),
                   callbacks=[cb])

# ========= MODEL TESTING WITH VALIDATION SET ===========

# Preview a random run of up to `num_preview` consecutive validation examples.
# The start index is capped so a full run fits whenever the set is large
# enough; max(1, ...) keeps randint valid for very small validation sets.
num_preview = 10
rand_range = np.random.randint(0, high=max(1, len(x_val) - num_preview + 1))

x_test = x_val[rand_range:rand_range+num_preview, :, :]  # model input
y_test = model.predict(x_test)                           # model prediction
z_test = y_val[rand_range:rand_range+num_preview, :, :]  # ground truth
print(y_test.shape)

for i in range(y_test.shape[0]):

  print('x vs predicted y')
  plt.plot(x_test[i, :, 0], color='red')
  plt.plot(y_test[i, :, 0])
  plt.show()

  print('ground truth y vs predicted y')
  plt.plot(z_test[i, :, 0], color='red')
  plt.plot(y_test[i, :, 0])
  plt.show()

  print('input sample:')
  ipd.display(ipd.Audio(x_test[i, :, 0], rate=44100, autoplay=False))
  print('ground truth:')
  ipd.display(ipd.Audio(z_test[i, :, 0], rate=44100, autoplay=False))
  print('prediction')
  ipd.display(ipd.Audio(y_test[i, :, 0], rate=44100, autoplay=False))

def test_model(model: "model to test",
               audio_file: "path to audio sample",
               win_size: "window size of model",
               sample_rate=44100):
  """Run ``model`` over an audio file in consecutive, non-overlapping windows.

  The file is loaded at ``sample_rate`` and truncated to a whole number of
  windows; each window is predicted independently and the predictions are
  joined back into one signal.

  Args:
    model: Object with a ``predict`` method taking ``(windows, win_size, 1)``.
    audio_file: Path of the audio file to process.
    win_size: Number of samples per window.
    sample_rate: Rate the file is loaded at.

  Returns:
    Tuple ``(x_audio, y_audio)``: the truncated input signal and the
    prediction, both 1-D with ``num_win * win_size`` samples.

  Raises:
    ValueError: If the file holds fewer than ``win_size`` samples.
  """
  x_audio, _ = librosa.load(audio_file, sr=sample_rate)
  num_win = x_audio.shape[0] // win_size
  print(f'number of windows: {num_win}')

  if num_win == 0:
    # Fail with an explicit message instead of handing the model an empty batch.
    raise ValueError(
        f'audio has {x_audio.shape[0]} samples, fewer than one window of {win_size}')

  x_audio = x_audio[:num_win*win_size]  # drop the incomplete trailing window
  x_test = x_audio.reshape((num_win, win_size, 1))
  y_test = model.predict(x_test)
  y_audio = y_test.reshape((num_win * win_size,))

  return x_audio, y_audio

# ======= MODEL TESTING ==========
# Runs the trained model over a standalone audio file and compares the raw
# prediction with a version of the input gated by the prediction's magnitude.

audio_file = '/content/Shroom LANDR Break04_70bpm.wav'
sample_rate = 44100
win_size = 16384

# x_audio is the input file truncated to whole windows; y_audio is the
# model's prediction for it.
x_audio, y_audio = test_model(model, audio_file, win_size, sample_rate)

plt.plot(x_audio)
plt.plot(y_audio)
plt.show()

print('input audio')
ipd.display(ipd.Audio(x_audio, rate=sample_rate))
print('output prediction')
ipd.display(ipd.Audio(y_audio, rate=sample_rate))

# Use the magnitude of the prediction as a per-sample gain on the input.
mask = abs(y_audio)
plt.plot(mask)
plt.show()

masked = x_audio * mask

print('masked vs prediction')
plt.plot(y_audio)
plt.plot(masked)
plt.show()

# NOTE(review): the 'ground truth' labels below are attached to x_audio, which
# is the model input; no separate target recording is loaded in this section.
print('masked vs ground truth')
plt.plot(x_audio)
plt.plot(masked)
plt.show()

print('ground truth')
ipd.display(ipd.Audio(x_audio, rate=sample_rate))
print('network prediction')
ipd.display(ipd.Audio(y_audio, rate=sample_rate))
print('masked')
ipd.display(ipd.Audio(masked, rate=sample_rate))

class TrainGenerator:
    """Endless source of random, time-aligned ``(x, y)`` training batches.

    Windows are cut at the same random offset from both signals, so each
    ``x`` window and its ``y`` window cover the same stretch of time.

    Args:
        audio_x: 1-D input signal.
        audio_y: 1-D target signal.
        win_size: Number of samples per window.
        batch_size: Number of windows per yielded batch.

    Raises:
        ValueError: If the shorter signal holds fewer than ``win_size`` samples.
    """

    def __init__(self, audio_x, audio_y, win_size, batch_size):
        self.audio_x = audio_x
        self.audio_y = audio_y
        self.audio_len = min(len(audio_x), len(audio_y)) # just in case there are 2 different sizes
        self.win_size = win_size
        self.batch_size = batch_size

        if self.audio_len < win_size:
            raise ValueError(
                f'audio length {self.audio_len} is shorter than win_size {win_size}')

    def random_audio_sample(self):
        """Return one ``(x, y)`` pair of windows cut at the same random offset."""
        # randint's upper bound is exclusive; the +1 makes the last full window
        # reachable and keeps the call valid when the audio is exactly one
        # window long.
        start = np.random.randint(0, high=self.audio_len - self.win_size + 1)
        stop = start + self.win_size
        return self.audio_x[start:stop], self.audio_y[start:stop]

    def generator(self):
        """Yield ``(x_batch, y_batch)`` arrays of shape ``(batch_size, win_size)`` forever."""
        while True:
            x_batch = np.zeros((self.batch_size, self.win_size))
            y_batch = np.zeros((self.batch_size, self.win_size))

            for i in range(self.batch_size):
                x_batch[i, :], y_batch[i, :] = self.random_audio_sample()

            yield x_batch, y_batch

def load_and_process_stems(x, y, sr): # work on this to produce all combinations of stems
  """Load every file in ``x`` and ``y`` and pick the windows at shared onsets.

  NOTE(review): this looks unfinished. The visible code reads ``audio_x``,
  ``audio_y`` and ``ws``, none of which is defined in this function (the
  loaded audio is collected in ``x_audio`` / ``y_audio`` instead), and the
  visible portion returns nothing.
  """

  x_audio = []
  y_audio = []
  
  # Load each listed file at sample rate `sr`.
  for x_file in x:
    this_x, _ = librosa.load(x_file, sr=sr, res_type='kaiser_fast')
    x_audio.append(this_x)

  for y_file in y:
    this_y, _ = librosa.load(y_file, sr=sr, res_type='kaiser_fast')
    y_audio.append(this_y)


  # NOTE(review): audio_x, audio_y and ws are not defined in this scope.
  frames_x, bx = extract_transients(audio_x, sr, ws, 0)
  frames_y, by = extract_transients(audio_y, sr, ws, 0)

  # Onset frame indices detected in both signals.
  idx_shared = correlate_transients(bx, by)

  tx = frames_x[idx_shared]
  ty = frames_y[idx_shared]