pipeline_job_manual.py

"""
This script does everything from preprocessing to feature extraction to training to evaluation with randomly sampled settings,
and then stores results.
"""

from matplotlib.mlab import specgram
import numpy as np
import scipy.signal as sig
from numpy.lib import stride_tricks
import time
import cPickle as pickle

import kmeans

from sklearn import cross_validation, ensemble, metrics, svm

import random_search as rs

settings = {
    # prepro + specgram extraction
    'normalise_volume': True,
    'specgram_num_components': 64,
    'specgram_redundancy': 2, # 1 is no overlap between successive windows, 2 is half overlap, 4 is three quarter overlap
    'log_scale': 10**0,

    # patch extraction
    'patch_height': 30, # number of spectrogram components
    'patch_width': 16, # number of timesteps
    'num_patches_for_learning': 100000,

    # whitening
    'retain': 0.99,
    'pca_bias': 0.0001,

    # kmeans
    'num_means': 1000,

    # extraction
    'threshold': None,

    # classifier training /evaluation
    'n_folds': 4, # since we can do this in parallel, it's best to have a multiple of the number of cores
}

# TARGET_PATTERN = "/mnt/storage/usr/sedielem/whales/results/results-%s.pkl"
# DATA_PATH = "/mnt/storage/usr/sedielem/whales/X_train.npy"
# LABEL_PATH = "/mnt/storage/usr/sedielem/whales/Y_train.npy"

TARGET_PATTERN = "results/results-%s.pkl"
DATA_PATH = "X_train.npy"
LABEL_PATH = "Y_train.npy"


expid = rs.generate_expid()
print "EXPERIMENT: %s" % expid
print
print settings
print


start_time = time.time()
def tock():
    elapsed = time.time() - start_time
    print "  running for %.2f s" % elapsed


# load data
print "Load data"
X = np.load(DATA_PATH)
Y = np.load(LABEL_PATH)
tock()


# downsample
print "Downsample"
X_downsampled = sig.decimate(X, 2, axis=1)
tock()


# normalise
if settings['normalise_volume']:
    print "Normalise volume"
    X_downsampled -= X_downsampled.mean(1).reshape(-1, 1)
    X_downsampled /= X_downsampled.std(1).reshape(-1, 1)
    tock()


# compute spectrograms
print "Compute spectrograms"
nfft = settings['specgram_num_components']
noverlap = nfft * (1 - 1. / settings['specgram_redundancy'])
log_scale = settings['log_scale']

dummy = specgram(X_downsampled[0], NFFT=nfft, noverlap=noverlap)[0] # to get the dimensions
X_specgram = np.zeros((X.shape[0], dummy.shape[0], dummy.shape[1]), dtype=X.dtype)

for k in xrange(X.shape[0]):
    X_specgram[k] = specgram(X_downsampled[k], NFFT=nfft, noverlap=noverlap)[0]

X_specgram = np.log(1 + log_scale * X_specgram)

tock()

# import pdb; pdb.set_trace()


# patch extraction
print "Select subset of patches"
w, h = settings['patch_width'], settings['patch_height']
shape = X_specgram.shape
strides = X_specgram.strides
new_shape = (shape[0], shape[1] - h + 1, h, shape[2] - w + 1, w)
new_strides = (strides[0], strides[1], strides[1], strides[2], strides[2])
patches = stride_tricks.as_strided(X_specgram, shape=new_shape, strides=new_strides)

# now generate indices to select random patches
num = settings['num_patches_for_learning']
idx0 = np.random.randint(0, patches.shape[0], num)
idx1 = np.random.randint(0, patches.shape[1], num)
idx3 = np.random.randint(0, patches.shape[3], num)

patches_subset = patches[idx0, idx1, :, idx3, :].reshape(num, -1)

tock()


# whitening
print "Learn whitening"
retain = settings['retain']
bias = settings['pca_bias']

print "  computing transform..."
means = patches_subset.mean(0)
patches_subset_centered = patches_subset - means.reshape(1, -1)
cov = np.dot(patches_subset_centered.T, patches_subset_centered) / patches_subset.shape[0]
eigs, eigv = np.linalg.eigh(cov) # docs say the eigenvalues are NOT NECESSARILY ORDERED, but this seems to be the case in practice...

print "  computing number of components to retain %.2f of the variance..." % retain
normed_eigs = eigs[::-1] / np.sum(eigs) # maximal value first
eigs_sum = np.cumsum(normed_eigs)
num_components = np.argmax(eigs_sum > retain) # argmax selects the first index where eigs_sum > retain is true
print "  number of components to retain: %d" % num_components

P = eigv.astype('float32') * np.sqrt(1.0/(eigs + bias)) # PCA whitening
P = P[:, -num_components:] # truncate transformation matrix

tock()


# kmeans
print "Learn kmeans"
k = settings['num_means']

print "  whiten patches"
patches_subset_whitened = np.dot(patches_subset_centered, P)
print "  run kmeans"
centroids = kmeans.spherical_kmeans(patches_subset_whitened, k, num_iterations=20, batch_size=10000)

tock()


# feature extraction and summarisation
print "Feature extraction and summarisation"
threshold = settings['threshold']
PC = np.dot(P, centroids.T)

# def summarise_features(features):
#     features = features.reshape(features.shape[0], -1, features.shape[3]) # merge time and frequency axes
#     return np.hstack([features.mean(1), features.std(1), features.min(1), features.max(1)]) # summarize over time axis
#     # return features.mean(1)
#     # return features.max(1)

def summarise_features(features):
    features_freq_pooled = features.max(1) # max pool over frequency
    n_timesteps = features_freq_pooled.shape[1]
    # quadrant pooling over time
    parts = [features_freq_pooled.max(1)] # max pooling over the whole timelength
    n_timeslices = 4
    slice_size = n_timesteps // n_timeslices # floor
    for k in xrange(n_timeslices):
        features_slice = features_freq_pooled[:, k*slice_size:(k+1)*slice_size].max(1)
        parts.append(features_slice)

    return np.hstack(parts)

    # return features_freq_pooled.max(1) # time pooling


def extract_features(batch, means, PC, threshold=None):
    batch_shape = batch.shape
    batch = batch.transpose(0,1,3,2,4).reshape(-1, h * w)

    features = np.dot(batch - means.reshape(1, -1), PC)

    if threshold is not None: # if no threshold specified, use linear features
        features = np.maximum(features - threshold, 0) # thresholding

    # features = features.reshape(batch_shape[0], -1, PC.shape[1]) # split examples and other axes

    features = features.reshape(batch_shape[0], batch_shape[1], batch_shape[3], PC.shape[1]) # (examples, frequency bins, time, features)

    # import pdb; pdb.set_trace()

    features = summarise_features(features)
    return features

batch_size = 100
num_examples = patches.shape[0]
num_batches = int(np.ceil(num_examples / float(batch_size)))
features = np.zeros((num_examples, 5 * PC.shape[1]), dtype=X.dtype)
for b in xrange(num_batches):
    print "  batch %d of %d" % (b+1, num_batches)
    batch = patches[b*batch_size:(b+1)*batch_size]
    current_batch_size = batch.shape[0]
    batch_features = extract_features(batch, means, PC, threshold)
    features[b*batch_size:b*batch_size + current_batch_size] = batch_features

tock()


# interval normalisation
print "Normalisation of features"
fmaxs = features.max(0)
fmins = features.min(0)

features -= fmins.reshape(1, -1)
features /= (fmaxs - fmins).reshape(1, -1)

tock()


print "Do some memory cleanup"
del X
del X_downsampled
del X_specgram
del patches
del patches_subset
del patches_subset_whitened


# classifier training
print "Classifier training and evaluation through cross-validation"
n_folds = settings['n_folds']

# clf = ensemble.GradientBoostingClassifier()
# clf = ensemble.RandomForestClassifier(n_estimators=1024, min_samples_leaf=5, n_jobs=-1, verbose=1)
clf = svm.LinearSVC(C=10e-3)
# clf = svm.SVC(kernel="linear", probability=True, C=10e-6)
# scores = cross_validation.cross_val_score(clf, features, Y, cv=n_folds, score_func=metrics.auc_score, n_jobs=1) # n_jobs=-1) # use all cores
# print "  auc: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std())

cv = cross_validation.StratifiedKFold(Y, n_folds)

def auc(t, y):
    fpr, tpr, thresholds = metrics.roc_curve(t, y)
    return metrics.auc(fpr, tpr)

predictions = np.zeros(Y.shape)
fold_aucs = []

for i, (train, test) in enumerate(cv):
    scores = clf.fit(features[train], Y[train]).decision_function(features[test])
    predictions[test] = scores

    fold_aucs.append(auc(Y[test], scores))

global_auc = auc(Y, predictions)


print "  auc over folds: %0.4f (+/- %0.4f)" % (np.mean(fold_aucs), np.std(fold_aucs))
print "  global auc: %0.4f" % global_auc

tock()


print "Storing experiment data"

evaluation = { 'auc': global_auc, 'fold_aucs': fold_aucs }
params = { 'means': means, 'P': P, 'centroids': centroids, 'fmaxs': fmaxs, 'fmins': fmins }

data = { 'params': params, 'settings': settings, 'evaluation': evaluation }

target_path = TARGET_PATTERN % expid

with open(target_path, 'w') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

print "  stored in %s" % target_path