-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from breuderink/kpa-regression
Add kernel passive-aggressive (KPA) regression.
- Loading branch information
Showing
11 changed files
with
576 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
#include "epsilon.h" | ||
#include <assert.h> | ||
#include <math.h> | ||
#include <stddef.h> | ||
#include <stdlib.h> | ||
|
||
float squared_Euclidean(kernel_t kernel, size_t a, size_t b) { | ||
// (a - b)^T (a - b) = a^T a - 2 a^T b + b^T b. | ||
return kernel(a, a) - 2 * kernel(a, b) + kernel(b, b); | ||
} | ||
|
||
float squared_exponential_kernel(float length, float squared_dist) { | ||
return expf(-squared_dist / (2 * length * length)); | ||
} | ||
|
||
float KP_apply(KP_t *kp, size_t xi) { | ||
float y_hat = 0; | ||
for (size_t i = 0; i < kp->num_alpha; ++i) { | ||
float a = kp->alpha[i]; | ||
if (a != 0) { | ||
y_hat += a * kp->kernel(i, xi); | ||
} | ||
} | ||
return y_hat; | ||
} | ||
|
||
/* | ||
Update for passive aggressive regression solving (19) with PA-1 in [1]: | ||
w -> w + sign(e) tau x, e = y - y_hat, tau = min(C, loss / x^T x). | ||
The update weight sign(e) tau is returned, so that it can be used with | ||
kernalized implementations. | ||
*/ | ||
float PA_regress_PA1(const PA_t pa, float xx, float y_hat, float y) { | ||
float error = y_hat - y; | ||
float loss = fmaxf(0, fabs(error) - pa.eps); | ||
float tau = fminf(pa.C, loss / xx); | ||
return copysignf(tau, -error); | ||
} | ||
|
||
float KPA_regress(KP_t *kp, const PA_t pa, size_t x_i, float y) { | ||
float y_hat = KP_apply(kp, x_i); | ||
assert(isfinite(y_hat)); | ||
|
||
if (isfinite(y)) { | ||
assert(kp->alpha[x_i] == 0); | ||
kp->alpha[x_i] = PA_regress_PA1(pa, kp->kernel(x_i, x_i), y_hat, y); | ||
} | ||
|
||
return y_hat; | ||
} | ||
|
||
// Functions to maintain kernel machine budget. | ||
size_t KP_num_idle(const KP_t *kp) { | ||
size_t c = 0; | ||
for (size_t i = 0; i < kp->num_alpha; ++i) { | ||
c += kp->alpha[i] == 0; | ||
} | ||
return c; | ||
} | ||
|
||
size_t KP_find_idle(const KP_t *kp, size_t idle_index) { | ||
size_t idle_count = 0; | ||
for (size_t i = 0; i < kp->num_alpha; ++i) { | ||
if (kp->alpha[i] == 0) { | ||
if (idle_count++ == idle_index) { | ||
return i; | ||
} | ||
} | ||
} | ||
abort(); | ||
} | ||
|
||
/* | ||
BPA simple update presented in [2]. It maintains the kernel budget by | ||
absorbing a support vector into the target support vector with minimal | ||
distortion. The BPA simple update in equation (12) of [2] can be split in a | ||
regular learning update, and a budget maintenance update. Here we only | ||
implement the part that maintains the budget, so that it can be combined with | ||
different kernel methods. | ||
*/ | ||
typedef struct { | ||
float ii, ij, jj; | ||
} kernel_pair_t; | ||
|
||
void BPA_S_absorb(const kernel_pair_t k, float a_r, float *proj, float *L) { | ||
/* | ||
We want to to absorb a support vector r (remove) into a support vector t | ||
(target). That can be encode with a new weight vector a', where the | ||
coefficient of r is set to zero, and the coefficient of t is changed by | ||
adding p (projection). The goal is to find a a' that is minimizes the | ||
distortion: | ||
L = [a' - a]^T K [a' - a]. | ||
We can simplify L by expressing it with the submatrix of K with the elements | ||
that differ between a' and a: | ||
L = [-a_r, p]^T [k_rr, k_rt; k_rt, k_tt] [-a_r, p]. | ||
Note that we want to find the p that minimizes L. Expand L to: | ||
L = [-a_r, p]^T [k_rr, k_rt; k_rt, k_tt] [-a_r, p] | ||
= [-a_r k_rr + p k_rt, -a_r k_rt + p k_tt] [-a_r, p] | ||
= -a_r (-a_r k_rr + p k_rt) + p (-a_r k_rt + p k_tt) | ||
= a_r^2 k_rr - a_r p k_rt - a_r p k_rt + p^2 k_tt | ||
= a_r^2 k_rr - 2 a_r p k_rt + p^2 k_tt | ||
To minimize L, can set it's derivative with respect to p to zero: | ||
∂/∂p L = ∂/∂p (-2 a_r p k_rt + p^2 k_tt) = -2 a_r k_rt + 2 p k_tt = 0. | ||
2 p ktt = 2 a_r k_rt => p = a_r k_rt / k_tt. | ||
This corresponds to the first term of equation (12) for BPA-S in [2]. | ||
*/ | ||
|
||
// Compute p = a_r k_rt / k_tt. | ||
float p = *proj = a_r * k.ij / k.jj; | ||
|
||
// Compute L = a_r^2 k_rr - 2 a_r p k_rt + p^2 k_tt. | ||
*L = (a_r * a_r * k.ii) - (2 * a_r * p * k.ij) + (p * p * k.jj); | ||
|
||
//assert(*L >= 0); | ||
} | ||
|
||
float BPA_simple(KP_t *kp, size_t target) { | ||
struct { | ||
size_t r; | ||
float proj, loss; | ||
} curr = {0}, best = {.r = target, .proj = NAN, .loss = INFINITY}; | ||
|
||
float k_tt = kp->kernel(target, target); | ||
// Search for support vector r to absorb. | ||
for (curr.r = 0; curr.r < kp->num_alpha; ++curr.r) { | ||
if (curr.r == target || kp->alpha[curr.r] == 0) { | ||
continue; | ||
} | ||
kernel_pair_t k = { | ||
.ii = kp->kernel(curr.r, curr.r), | ||
.ij = kp->kernel(curr.r, target), | ||
.jj = k_tt, | ||
}; | ||
BPA_S_absorb(k, kp->alpha[curr.r], &curr.proj, &curr.loss); | ||
assert(isfinite(curr.loss)); | ||
if (curr.loss < best.loss) { | ||
best = curr; | ||
} | ||
} | ||
|
||
// Perform projection of r on target t, and neutralize r. | ||
assert(best.r != target); | ||
kp->alpha[target] += best.proj; | ||
kp->alpha[best.r] = 0; | ||
|
||
return best.loss; | ||
} | ||
|
||
float BKPA_regress(KP_t *kp, const PA_t pa, size_t x_i, float y) { | ||
float y_hat = KPA_regress(kp, pa, x_i, y); | ||
if (isfinite(y)) { | ||
// Maintain budget. | ||
if (KP_num_idle(kp) < 1) { | ||
BPA_simple(kp, x_i); | ||
} | ||
} | ||
return y_hat; | ||
} | ||
|
||
/* References | ||
[1] Crammer, K. et al. “Online Passive-Aggressive Algorithms.” J. Mach. Learn. | ||
Res. (2003). | ||
[2] Wang, Zhuang, and Slobodan Vucetic. "Online passive-aggressive | ||
algorithms on a budget." Proceedings of the Thirteenth International | ||
Conference on Artificial Intelligence and Statistics. JMLR Workshop and | ||
Conference Proceedings, 2010. | ||
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#include "epsilon.h" | ||
#include <assert.h> | ||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
|
||
/* | ||
Define problem-specific input structure. Our input is a single position, but it | ||
could be a vector, a struct or something else. In addition, we a create buffer | ||
that contains a fixed budget with inputs. | ||
*/ | ||
typedef struct { | ||
float position; | ||
} input_t; | ||
|
||
#define BUDGET 32 | ||
static input_t support_vectors[BUDGET] = {0}; | ||
|
||
/* | ||
Define a problem-specific kernel. A kernel defines a dot-product between inputs, | ||
and should be adapted to the problem. Here we use the common squared exponential | ||
kernel [1], that is also known as the Gaussian or RBF kernel. | ||
[1] Gaussian Processes for Machine Learning, Carl Edward Rasmussen and | ||
Christopher K. I. Williams The MIT Press, 2006. ISBN 0-262-18253-X. | ||
*/ | ||
static float inner_product(size_t a, size_t b) { | ||
float k_ab = 1; // Use 1 to implicitly add a bias term. | ||
k_ab += support_vectors[a].position * support_vectors[b].position; | ||
return k_ab; | ||
} | ||
|
||
// Define a RBF kernel by wrapping the inner product defined above. | ||
static float kernel(size_t a, size_t b) { | ||
float d2 = squared_Euclidean(inner_product, a, b); | ||
return squared_exponential_kernel(1.0, d2); | ||
} | ||
|
||
int main() { | ||
PA_t PA = {.C = 10, .eps = 0.01}; | ||
|
||
// Initialize a kernel regression model. | ||
float alpha[BUDGET] = {0}; | ||
KP_t regressor = { | ||
.alpha = alpha, | ||
.num_alpha = BUDGET, | ||
.kernel = &kernel, | ||
}; | ||
|
||
// Initialize loss statistics to track performance. | ||
online_stats_t loss = {0}; | ||
|
||
for (size_t t = 0; t < 100 * BUDGET; ++t) { | ||
// Generate a new input. | ||
float input = 10 * (rand() / (float)RAND_MAX) - 5; | ||
|
||
// Define target, see https://www.desmos.com/calculator/zxllrku62c. | ||
float target = sinf(input); | ||
|
||
// Store input in a free support vector to make it available to the | ||
// kernel. | ||
size_t i = KP_find_idle(®ressor, 0); | ||
support_vectors[i].position = input; | ||
|
||
// Predict on new input and track loss. | ||
float prediction = KPA_regress(®ressor, PA, i, NAN); | ||
float error = prediction - target; | ||
observe(&loss, error * error); | ||
|
||
// Update model. | ||
BKPA_regress(®ressor, PA, i, target); | ||
} | ||
|
||
// Display root mean squared error of predictions. It should be slightly | ||
// above zero, due to initial mistakes and the epsilon-insensitive hinge | ||
// loss used for training. | ||
printf("RMSE: %.3f.\n", sqrtf(mean(&loss))); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.