Merge pull request #6 from breuderink/kpa-regression
Add kernel passive-aggressive (KPA) regression.
breuderink authored Jun 9, 2021
2 parents d0712b3 + 27fa5f2 commit bf998a8
Showing 11 changed files with 576 additions and 7 deletions.
21 changes: 16 additions & 5 deletions README.md
@@ -17,7 +17,7 @@ particular memory-efficient algorithms. Further, the optimization process
should be reliable. Epsilon provides methods to train and apply machine
learning methods on microcontrollers.

These algorithms should function on microcontrollers, such as the
These algorithms should run on microcontrollers, such as the
[Raspberry Pi Pico](https://www.raspberrypi.org/products/raspberry-pi-pico/),
the [BBC micro:bit](https://www.microbit.org), the
[Arduino Nano 33 BLE Sense](https://store.arduino.cc/arduino-nano-33-ble-sense)
@@ -56,7 +56,7 @@ hash function that maps variable length input to a fixed output
hashing](https://en.wikipedia.org/wiki/Feature_hashing).

## Online statistics
- Welford's method for computing mean and variance in one pass. See the
- Welford's method computes mean and variance in a single pass; a minimal
sketch follows below. See the
[example of Welford's method](examples/example_Welfords_method.c).
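A minimal sketch of the one-pass API (the `online_stats_t` type and the
`observe` and `mean` calls are taken from the BKPA example added in this
commit; the output formatting is illustrative):

```c
#include "epsilon.h"
#include <stdio.h>

int main(void) {
  online_stats_t stats = {0};
  // Feed samples one at a time; Welford's method updates the running
  // statistics without storing the samples themselves.
  for (int i = 1; i <= 4; ++i) {
    observe(&stats, (float)i);
  }
  printf("mean: %.2f\n", mean(&stats)); // 2.50 for the samples 1..4
  return 0;
}
```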

## Feature extraction
@@ -71,9 +71,20 @@ compilation of SORF is disabled by default. Instead, one can use budgeted
kernel classification or regression.

## Regression
- Kernelized passive-aggressive regression
([TODO](https://github.com/breuderink/epsilon/pull/6)).
- Kernelized passive-aggressive classification (TODO).
- [Online passive-aggressive (PA)](docs/crammer2006opa.pdf) regression solves a
regression problem by updating the model only on prediction mistakes. When the
target depends non-linearly on the inputs, one can use a kernel that projects
the input onto a set of support vectors. [Kernel
methods](https://en.wikipedia.org/wiki/Kernel_method) such as the support
vector machine (SVM) work efficiently in high-dimensional feature spaces, but
don't easily scale to large datasets. To scale to large datasets, one can
[maintain a budget](docs/wang2010opa.pdf) of support vectors. The [example of
budgeted kernel passive-aggressive (BKPA)
regression](examples/example_BKPA_regression.c) demonstrates how online,
non-linear regression can be performed with a limited memory budget; a minimal
sketch of the underlying update rule follows below.
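As a rough illustration, a minimal sketch of the PA-1 update weight that
drives this regression (it mirrors `PA_regress_PA1` in `epsilon/kpa.c` from
this commit; the standalone function name is illustrative):

```c
#include <math.h>

// PA-1 update weight for regression: tau = min(C, loss / x^T x), signed so
// the update reduces the error. Inside the eps-insensitive band the loss is
// zero, so tau is zero and the model stays unchanged (the "passive" case).
float pa1_update_weight(float C, float eps, float xx, float y_hat, float y) {
  float error = y_hat - y;
  float loss = fmaxf(0.0f, fabsf(error) - eps);
  float tau = fminf(C, loss / xx);
  return copysignf(tau, -error);
}
```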

## Classification
- Kernel passive-aggressive classification (TODO).


# Other solutions for Tiny ML or Edge AI
Binary file added docs/crammer2006opa.pdf
Binary file added docs/wang2010opa.pdf
2 changes: 1 addition & 1 deletion epsilon/CMakeLists.txt
@@ -1,4 +1,4 @@
add_library(epsilon rng.c welford.c sorf.c hash.c)
add_library(epsilon rng.c welford.c sorf.c hash.c kpa.c)
set_target_properties(epsilon PROPERTIES
VERSION ${PROJECT_VERSION}
C_STANDARD 99)
39 changes: 39 additions & 0 deletions epsilon/epsilon.h
@@ -43,8 +43,39 @@ void SORF(float *x, uint8_t nbits);
void SORF_repeat(float *x1, size_t n1, float *x2, size_t n2);


// Kernels.
typedef float (*kernel_t)(size_t i, size_t j);
float squared_Euclidean(kernel_t kernel, size_t a, size_t b);
float squared_exponential_kernel(float length, float squared_dist);

// Kernel projection used to implement kernel passive-aggressive algorithms.
typedef struct {
float *alpha;
size_t num_alpha;
kernel_t kernel;
} KP_t;

// Apply kernel projection to input x1.
float KP_apply(KP_t *km, size_t x1);

// Get n-th idle (i.e. alpha=0) support vector of kernel projection.
size_t KP_find_idle(const KP_t *km, size_t n);

// Passive-aggressive parameters, see [6].
typedef struct {
float C; // Perform aggressive updates with high C.
float eps; // Size of insensitive band for regression.
} PA_t;

// Perform kernel PA regression [6]. Target y can be NAN for inference.
float KPA_regress(KP_t *km, const PA_t pa, size_t xi, float y);

// Perform budgeted kernel PA regression [7]. Target y can be NAN for inference.
float BKPA_regress(KP_t *km, const PA_t pa, size_t xi, float y);

/*
# References
[1] Marsaglia, George. "Xorshift RNGs." Journal of Statistical Software 8.14
(2003): 1-6.
@@ -60,4 +91,12 @@ void SORF_repeat(float *x1, size_t n1, float *x2, size_t n2);
[5] Choromanski, Krzysztof, and Vikas Sindhwani. "Recycling randomness
with structure for sublinear time kernel expansions." International
Conference on Machine Learning. 2016.
[6] Crammer, Koby, et al. "Online Passive-Aggressive Algorithms." Journal of
Machine Learning Research 7 (2006): 551-585.
[7] Wang, Zhuang, and Slobodan Vucetic. "Online passive-aggressive
algorithms on a budget." Proceedings of the Thirteenth International
Conference on Artificial Intelligence and Statistics. JMLR Workshop and
Conference Proceedings, 2010.
*/
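A minimal sketch of how the index-based kernel interface above is meant to be
wired up (an assumption drawn from the example later in this commit: the
application owns the sample storage, and kernel_t maps indices into it; all
names below are illustrative):

#include "epsilon.h"
#include <stdio.h>

#define BUDGET 8
static float samples[BUDGET]; // application-owned input storage
static float alpha[BUDGET];   // one coefficient per support vector

// kernel_t receives indices rather than pointers, so the library never needs
// to know how the application stores its inputs.
static float my_kernel(size_t a, size_t b) {
  float d = samples[a] - samples[b];
  return squared_exponential_kernel(1.0f, d * d); // d * d: squared distance in 1-D
}

int main(void) {
  KP_t kp = {.alpha = alpha, .num_alpha = BUDGET, .kernel = &my_kernel};
  samples[0] = 0.5f;                // support vector stored in slot 0
  alpha[0] = 1.0f;                  // with a nonzero coefficient
  samples[1] = 0.4f;                // query input staged in an idle slot
  printf("%f\n", KP_apply(&kp, 1)); // y_hat = alpha[0] * k(0, 1)
  return 0;
}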
178 changes: 178 additions & 0 deletions epsilon/kpa.c
@@ -0,0 +1,178 @@
#include "epsilon.h"
#include <assert.h>
#include <math.h>
#include <stddef.h>
#include <stdlib.h>

float squared_Euclidean(kernel_t kernel, size_t a, size_t b) {
// (a - b)^T (a - b) = a^T a - 2 a^T b + b^T b.
return kernel(a, a) - 2 * kernel(a, b) + kernel(b, b);
}

float squared_exponential_kernel(float length, float squared_dist) {
return expf(-squared_dist / (2 * length * length));
}

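// KP_apply evaluates the kernel expansion y_hat = sum_i alpha[i] * k(i, xi),
// skipping idle support vectors (alpha[i] == 0).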
float KP_apply(KP_t *kp, size_t xi) {
float y_hat = 0;
for (size_t i = 0; i < kp->num_alpha; ++i) {
float a = kp->alpha[i];
if (a != 0) {
y_hat += a * kp->kernel(i, xi);
}
}
return y_hat;
}

/*
Update for passive-aggressive regression, solving (19) with PA-1 in [1]:
  w -> w + sign(e) tau x, e = y - y_hat, tau = min(C, loss / x^T x).
The update weight sign(e) tau is returned so that it can be used with
kernelized implementations.
*/
float PA_regress_PA1(const PA_t pa, float xx, float y_hat, float y) {
float error = y_hat - y;
float loss = fmaxf(0, fabsf(error) - pa.eps);
float tau = fminf(pa.C, loss / xx);
return copysignf(tau, -error);
}

float KPA_regress(KP_t *kp, const PA_t pa, size_t x_i, float y) {
float y_hat = KP_apply(kp, x_i);
assert(isfinite(y_hat));

if (isfinite(y)) {
assert(kp->alpha[x_i] == 0);
kp->alpha[x_i] = PA_regress_PA1(pa, kp->kernel(x_i, x_i), y_hat, y);
}

return y_hat;
}

// Functions to maintain kernel machine budget.
size_t KP_num_idle(const KP_t *kp) {
size_t c = 0;
for (size_t i = 0; i < kp->num_alpha; ++i) {
c += kp->alpha[i] == 0;
}
return c;
}

size_t KP_find_idle(const KP_t *kp, size_t idle_index) {
size_t idle_count = 0;
for (size_t i = 0; i < kp->num_alpha; ++i) {
if (kp->alpha[i] == 0) {
if (idle_count++ == idle_index) {
return i;
}
}
}
abort();
}

/*
BPA simple update presented in [2]. It maintains the kernel budget by
absorbing a support vector into the target support vector with minimal
distortion. The BPA simple update in equation (12) of [2] can be split in a
regular learning update, and a budget maintenance update. Here we only
implement the part that maintains the budget, so that it can be combined with
different kernel methods.
*/
typedef struct {
float ii, ij, jj;
} kernel_pair_t;

void BPA_S_absorb(const kernel_pair_t k, float a_r, float *proj, float *L) {
/*
We want to absorb a support vector r (remove) into a support vector t
(target). That can be encoded with a new weight vector a', where the
coefficient of r is set to zero and the coefficient of t is changed by
adding p (projection). The goal is to find the a' that minimizes the
distortion:
  L = [a' - a]^T K [a' - a].
We can simplify L by expressing it with the submatrix of K containing the
elements that differ between a' and a:
  L = [-a_r, p]^T [k_rr, k_rt; k_rt, k_tt] [-a_r, p].
We want to find the p that minimizes L. Expand L to:
  L = [-a_r, p]^T [k_rr, k_rt; k_rt, k_tt] [-a_r, p]
    = [-a_r k_rr + p k_rt, -a_r k_rt + p k_tt] [-a_r, p]
    = -a_r (-a_r k_rr + p k_rt) + p (-a_r k_rt + p k_tt)
    = a_r^2 k_rr - a_r p k_rt - a_r p k_rt + p^2 k_tt
    = a_r^2 k_rr - 2 a_r p k_rt + p^2 k_tt.
To minimize L, we set its derivative with respect to p to zero:
  ∂/∂p L = ∂/∂p (-2 a_r p k_rt + p^2 k_tt) = -2 a_r k_rt + 2 p k_tt = 0
  => 2 p k_tt = 2 a_r k_rt => p = a_r k_rt / k_tt.
This corresponds to the first term of equation (12) for BPA-S in [2].
*/

// Compute p = a_r k_rt / k_tt.
float p = *proj = a_r * k.ij / k.jj;

// Compute L = a_r^2 k_rr - 2 a_r p k_rt + p^2 k_tt.
*L = (a_r * a_r * k.ii) - (2 * a_r * p * k.ij) + (p * p * k.jj);

//assert(*L >= 0);
}

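// BPA_simple maintains the budget: it searches for the active support vector
// whose absorption into the target distorts the model least, applies that
// projection, and returns the incurred distortion.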
float BPA_simple(KP_t *kp, size_t target) {
struct {
size_t r;
float proj, loss;
} curr = {0}, best = {.r = target, .proj = NAN, .loss = INFINITY};

float k_tt = kp->kernel(target, target);
// Search for support vector r to absorb.
for (curr.r = 0; curr.r < kp->num_alpha; ++curr.r) {
if (curr.r == target || kp->alpha[curr.r] == 0) {
continue;
}
kernel_pair_t k = {
.ii = kp->kernel(curr.r, curr.r),
.ij = kp->kernel(curr.r, target),
.jj = k_tt,
};
BPA_S_absorb(k, kp->alpha[curr.r], &curr.proj, &curr.loss);
assert(isfinite(curr.loss));
if (curr.loss < best.loss) {
best = curr;
}
}

// Perform projection of r on target t, and neutralize r.
assert(best.r != target);
kp->alpha[target] += best.proj;
kp->alpha[best.r] = 0;

return best.loss;
}

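// BKPA_regress performs a regular KPA update and, if no idle support vector
// remains afterwards, frees one slot by absorbing a support vector into the
// one just used.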
float BKPA_regress(KP_t *kp, const PA_t pa, size_t x_i, float y) {
float y_hat = KPA_regress(kp, pa, x_i, y);
if (isfinite(y)) {
// Maintain budget.
if (KP_num_idle(kp) < 1) {
BPA_simple(kp, x_i);
}
}
return y_hat;
}

/* References
[1] Crammer, Koby, et al. "Online Passive-Aggressive Algorithms." Journal of
Machine Learning Research 7 (2006): 551-585.
[2] Wang, Zhuang, and Slobodan Vucetic. "Online passive-aggressive
algorithms on a budget." Proceedings of the Thirteenth International
Conference on Artificial Intelligence and Statistics. JMLR Workshop and
Conference Proceedings, 2010.
*/
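As a quick numeric sanity check of the BPA-S projection derived above (a
standalone sketch; the distortion formula is restated here because
BPA_S_absorb is internal to kpa.c):

#include <stdio.h>

// Distortion L(p) = a_r^2 k_rr - 2 a_r p k_rt + p^2 k_tt from the derivation
// in BPA_S_absorb.
static float distortion(float a_r, float k_rr, float k_rt, float k_tt, float p) {
  return a_r * a_r * k_rr - 2 * a_r * p * k_rt + p * p * k_tt;
}

int main(void) {
  float a_r = 0.7f, k_rr = 1.0f, k_rt = 0.8f, k_tt = 1.0f;
  float p = a_r * k_rt / k_tt; // closed-form minimizer from the derivation
  // The distortion at the minimizer is smaller than at nearby values of p.
  printf("%f %f %f\n", distortion(a_r, k_rr, k_rt, k_tt, p - 0.1f),
         distortion(a_r, k_rr, k_rt, k_tt, p),
         distortion(a_r, k_rr, k_rt, k_tt, p + 0.1f));
  return 0;
}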
5 changes: 5 additions & 0 deletions examples/CMakeLists.txt
@@ -17,3 +17,8 @@ add_test(NAME example_Welfords_method COMMAND example_Welfords_method)
add_executable(example_FWHT example_FWHT.c)
target_link_libraries(example_FWHT epsilon m)
add_test(NAME example_FWHT COMMAND example_FWHT)

# BKPA regression example.
add_executable(example_BKPA_regression example_BKPA_regression.c)
target_link_libraries(example_BKPA_regression epsilon m)
add_test(NAME example_BKPA_regression COMMAND example_BKPA_regression)
78 changes: 78 additions & 0 deletions examples/example_BKPA_regression.c
@@ -0,0 +1,78 @@
#include "epsilon.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/*
Define a problem-specific input structure. Our input is a single position, but
it could be a vector, a struct, or something else. In addition, we create a
buffer that holds a fixed budget of inputs.
*/
typedef struct {
float position;
} input_t;

#define BUDGET 32
static input_t support_vectors[BUDGET] = {0};

/*
Define a problem-specific kernel. A kernel defines a dot product between
inputs, and should be adapted to the problem. Here we use the common squared
exponential kernel [1], which is also known as the Gaussian or RBF kernel.
[1] Rasmussen, Carl Edward, and Christopher K. I. Williams. Gaussian Processes
for Machine Learning. The MIT Press, 2006. ISBN 0-262-18253-X.
*/
static float inner_product(size_t a, size_t b) {
float k_ab = 1; // Use 1 to implicitly add a bias term.
k_ab += support_vectors[a].position * support_vectors[b].position;
return k_ab;
}

// Define an RBF kernel by wrapping the inner product defined above.
static float kernel(size_t a, size_t b) {
float d2 = squared_Euclidean(inner_product, a, b);
return squared_exponential_kernel(1.0f, d2);
}

int main() {
PA_t PA = {.C = 10, .eps = 0.01};

// Initialize a kernel regression model.
float alpha[BUDGET] = {0};
KP_t regressor = {
.alpha = alpha,
.num_alpha = BUDGET,
.kernel = &kernel,
};

// Initialize loss statistics to track performance.
online_stats_t loss = {0};

for (size_t t = 0; t < 100 * BUDGET; ++t) {
// Generate a new input.
float input = 10 * (rand() / (float)RAND_MAX) - 5;

// Define target, see https://www.desmos.com/calculator/zxllrku62c.
float target = sinf(input);

// Store input in a free support vector to make it available to the
// kernel.
size_t i = KP_find_idle(&regressor, 0);
support_vectors[i].position = input;

// Predict on new input and track loss.
float prediction = KPA_regress(&regressor, PA, i, NAN);
float error = prediction - target;
observe(&loss, error * error);

// Update model.
BKPA_regress(&regressor, PA, i, target);
}

// Display root mean squared error of predictions. It should be slightly
// above zero, due to initial mistakes and the epsilon-insensitive hinge
// loss used for training.
printf("RMSE: %.3f.\n", sqrtf(mean(&loss)));
}
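A usage note on inference: because KPA_regress leaves all coefficients
untouched when the target is NAN, a slot used only for prediction stays idle.
A hedged fragment, reusing regressor, PA, and support_vectors from the example
above:

// Stage a query input in an idle slot and predict without learning. No alpha
// update takes place, so the slot remains idle and can be reused.
size_t i = KP_find_idle(&regressor, 0);
support_vectors[i].position = 0.5f;
float y_hat = KPA_regress(&regressor, PA, i, NAN);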
2 changes: 1 addition & 1 deletion tests/CMakeLists.txt
@@ -9,7 +9,7 @@ FetchContent_MakeAvailable(greatest)

# Define test executable.
add_executable(epsilon_test
main.c rng_test.c sorf_test.c welford_test.c hash_test.c)
main.c rng_test.c sorf_test.c welford_test.c hash_test.c kpa_test.c)
target_include_directories(epsilon_test PRIVATE ${greatest_SOURCE_DIR})
target_link_libraries(epsilon_test epsilon m)
