Change quantization, add loop, expand model

Classification in yes, no, left, right, up and down works okay with the waveforms sent from the computer. Signed-off-by: Stefan Gloor <code@stefan-gloor.ch>
stgloorious · Jun 10, 2024 · 36b6aa8 · 36b6aa8
1 parent 1724f6e
commit 36b6aa8
Show file tree

Hide file tree

Showing 7 changed files with 189 additions and 115 deletions.
diff --git a/ld/stm32l475vgtx.ld b/ld/stm32l475vgtx.ld
@@ -40,8 +40,8 @@ ENTRY(Reset_Handler)
 /* Highest address of the user mode stack */
 _estack = ORIGIN(RAM) + LENGTH(RAM); /* end of "RAM" Ram type memory */
 
-_Min_Heap_Size = 0x1000; /* required amount of heap */
-_Min_Stack_Size = 0x4000; /* required amount of stack */
+_Min_Heap_Size = 0x100; /* required amount of heap */
+_Min_Stack_Size = 0x2000; /* required amount of stack */
 
 /* Memories definition */
 MEMORY

diff --git a/ml/train.py b/ml/train.py
@@ -36,6 +36,8 @@
 
 DATASET_PATH = 'data/mini_speech_commands'
 
+keywords = ['yes', 'no', 'left', 'right', 'up', 'down']
+
 data_dir = pathlib.Path(DATASET_PATH)
 if not data_dir.exists():
     tf.keras.utils.get_file(
@@ -44,14 +46,14 @@
         extract=True,
         cache_dir='.', cache_subdir='data')
     # Delete the unwanted parts of the dataset
-    shutil.rmtree(os.path.join(data_dir, 'left'))
-    shutil.rmtree(os.path.join(data_dir, 'right'))
-    shutil.rmtree(os.path.join(data_dir, 'up'))
-    shutil.rmtree(os.path.join(data_dir, 'down'))
+    #shutil.rmtree(os.path.join(data_dir, 'left'))
+    #shutil.rmtree(os.path.join(data_dir, 'right'))
+    #shutil.rmtree(os.path.join(data_dir, 'up'))
+    #shutil.rmtree(os.path.join(data_dir, 'down'))
     shutil.rmtree(os.path.join(data_dir, 'go'))
     shutil.rmtree(os.path.join(data_dir, 'stop'))
 
-    for data in ['yes', 'no']:
+    for data in keywords:
         curr_dir = os.path.join(data_dir, data)
         all_files = [os.path.join(curr_dir, fn) for fn in os.listdir(curr_dir) if fn.endswith('.wav')]
 
@@ -168,11 +170,12 @@ def make_spec_ds(ds):
         layers.Resizing(32, 32),
         # Normalize.
         norm_layer,
+        layers.Conv2D(32, 3, activation='relu'),
         layers.Conv2D(16, 3, activation='relu'),
         layers.MaxPooling2D(),
         layers.Dropout(0.25),
         layers.Flatten(),
-        layers.Dense(8, activation='relu'),
+        layers.Dense(24, activation='relu'),
         layers.Dropout(0.5),
         layers.Dense(num_labels),
     ])
@@ -258,8 +261,8 @@ def representative_dataset():
 
 # ## Run inference on an audio file
 
-testfile = os.listdir(data_dir/'test/yes/')[0]
-x = tf.io.read_file(str(os.path.join(data_dir/'test/yes/', testfile)))
+testfile = os.listdir(data_dir/'test/no/')[0]
+x = tf.io.read_file(str(os.path.join(data_dir/'test/no/', testfile)))
 x, sample_rate = tf.audio.decode_wav(x, desired_channels=1, desired_samples=16000,)
 x = tf.squeeze(x, axis=-1)
 waveform = x
@@ -269,7 +272,7 @@ def representative_dataset():
 prediction = model(x)
 values = tf.nn.softmax(prediction[0])
 print('Prediction of float model:')
-label_names = ['no', 'yes']
+label_names = keywords
 for i in range(len(label_names)):
     print(f'{label_names[i]}: {values[i]:.2%}')
 
@@ -283,8 +286,12 @@ def representative_dataset():
 interpreter.allocate_tensors()
 
 # Input data
-input_data = np.array(x)
-preprocessed_input_data = (input_data * 256).astype('uint8')
+spec = np.array(x)
+# Multiplying by 256 is not a proper quantization, we should normalize
+# first. However, due to memory and performance constraints this is difficult
+# to reproduce on the microcontroller, and just multiplying by 256 seems to
+# work well enough
+preprocessed_input_data = (spec * 256).astype('uint8')
 
 with open('sample_input.bin', 'wb') as f:
     f.write(preprocessed_input_data)

diff --git a/src/main.cc b/src/main.cc
@@ -40,10 +40,12 @@ limitations under the License.
 #include "mic.h"
 int dfsdm_conversion_done;
 
-const int kTensorArenaSize = 39 * 1024;
-alignas(16) static uint8_t tensor_arena[kTensorArenaSize] = { 0x55 };
 
-static char waveform[16500];
+const int kTensorArenaSize = 66800;
+alignas(16) static uint8_t tensor_arena[kTensorArenaSize];
+
+static uint8_t waveform[16128];
+static uint8_t last_ffts[125];
 
 #define DEBUG_PRINTF(...)            \
 	{                            \
@@ -131,11 +133,6 @@ int main(int argc, char *argv[])
 	microphone.dump_recording();
 */
 
-	uint32_t waveform_len = serial_recv(waveform, sizeof(waveform));
-	if (waveform_len == 0) {
-		assert(!"Transfer failed.");
-	}
-
 	const static uint32_t window_size = 256;
 	const static uint32_t frame_step = 128;
 
@@ -144,84 +141,9 @@ int main(int argc, char *argv[])
 		assert(!"Failed to init RFFT");
 	}
 
-	float min = 999999.0f;
-	float max = 0;
-	for (uint32_t i = 0; i < waveform_len; i++) {
-		float val = (float)((uint8_t)waveform[i]);
-		if (val < min) {
-			min = val;
-		}
-		if (val > max) {
-			max = val;
-		}
-	}
-
-	float hanning[window_size];
+	static float hanning[window_size];
 	arm_hanning_f32(hanning, window_size);
 
-	for (uint32_t idx = 0; idx < 124; idx++) {
-		static float dst[window_size];
-		static float mag[window_size + 1];
-		double sum = 0;
-
-		static float signal_chunk[window_size];
-
-		for (uint32_t i = 0; i < window_size; i++) {
-			signal_chunk[i] =
-				(float)((uint8_t)waveform[idx * frame_step + i]);
-
-			// Normalize from -1 to 1
-			signal_chunk[i] = (2.0f * (signal_chunk[i] - min) / (max - min)) - 1;
-			sum += signal_chunk[i];
-		}
-
-		// Remove DC component
-		float mean = (float)(sum / (double)window_size);
-		for (uint32_t i = 0; i < window_size; i++) {
-			signal_chunk[i] = signal_chunk[i] - mean;
-
-			// Apply window function
-			signal_chunk[i] *= hanning[i];
-		}
-
-		arm_rfft_fast_f32(&fft, signal_chunk, dst, 0);
-
-		// From to the CMSIS documentation:
-		// https://arm-software.github.io/CMSIS-DSP/latest/group__RealFFT.html
-		//
-		// The FFT of a real N-point sequence has even symmetry in the
-		// frequency domain. The second half of the data equals the conjugate
-		// of the first half flipped in frequency. This conjugate part is not
-		// computed by the float RFFT. As consequence, the output of a N point
-		// real FFT should be a N//2 + 1 complex numbers so N + 2 floats.
-
-		// It happens that the first complex of number of the RFFT output is
-		// actually all real. Its real part represents the DC offset. The value
-		// at Nyquist frequency is also real.
-
-		// Those two complex numbers can be encoded with 2 floats rather than
-		// using two numbers with an imaginary part set to zero.
-
-		// The implementation is using a trick so that the output buffer can be
-		// N float : the last real is packaged in the imaginary part of the
-		// first complex (since this imaginary part is not used and is zero).
-
-		// The first "complex" is actually to reals, X[0] and X[N/2]
-		float first_real = (dst[0] < 0.0f) ? (-1.0f * dst[0]) : dst[0];
-		float second_real = (dst[1] < 0.0f) ? (-1.0f * dst[1]) : dst[1];
-
-		// Take the magnitude for all the complex values in between
-		arm_cmplx_mag_f32(dst + 2, mag + 1, window_size / 2);
-
-		// Fill in the two real numbers at 0 and N/2
-		mag[0] = first_real;
-		mag[128] = second_real;
-
-		// Send N+1 FFT output
-		for (uint32_t i = 0; i < 129; i++) {
-			printf("%.06f\n", mag[i]);
-		}
-	}
 
 	const tflite::Model *model = tflite::GetModel(model_tflite);
 	DEBUG_PRINTF("Model architecture:\n");
@@ -260,7 +182,7 @@ int main(int argc, char *argv[])
 
 	// Interpreter
 	tflite::MicroInterpreter interpreter(model, op_resolver, tensor_arena,
-					     kTensorArenaSize);
+						 kTensorArenaSize);
 	DEBUG_PRINTF("MicroInterpreter initialized.\n");
 
 	if (interpreter.AllocateTensors() != kTfLiteOk) {
@@ -269,9 +191,117 @@ int main(int argc, char *argv[])
 	DEBUG_PRINTF("MicroInterpreter tensors allocated.\n");
 
 	while (1) {
-		/*
-		size_t input_tensor_len = serial_recv(input_tensor, sizeof(input_tensor));
-		DEBUG_PRINTF("Received %u bytes.\n", input_tensor_len);
+		uint32_t waveform_len = serial_recv((char*)waveform, sizeof(waveform));
+		if (waveform_len == 0) {
+			assert(!"Transfer failed.");
+		}
+
+		float min = 999999.0f;
+		float max = 0;
+		for (uint32_t i = 0; i < waveform_len; i++) {
+			float val = (float)(waveform[i]);
+			if (val < min) {
+				min = val;
+			}
+			if (val > max) {
+				max = val;
+			}
+		}
+
+		for (uint32_t idx = 0; idx < 124; idx++) {
+			float dst[window_size];
+			static float mag[window_size + 1];
+			double sum = 0;
+
+			static float* signal_chunk = mag;
+
+			for (uint32_t i = 0; i < window_size; i++) {
+				signal_chunk[i] =
+					(float)((uint8_t)waveform[idx * frame_step + i]);
+
+				// Normalize from -1 to 1
+				signal_chunk[i] = (2.0f * (signal_chunk[i] - min) / (max - min)) - 1;
+				sum += signal_chunk[i];
+			}
+
+			// Remove DC component
+			float mean = (float)(sum / (double)window_size);
+			for (uint32_t i = 0; i < window_size; i++) {
+				signal_chunk[i] = signal_chunk[i] - mean;
+
+				// Apply window function
+				signal_chunk[i] *= hanning[i];
+			}
+
+			arm_rfft_fast_f32(&fft, signal_chunk, dst, 0);
+
+			// From the CMSIS documentation:
+			// https://arm-software.github.io/CMSIS-DSP/latest/group__RealFFT.html
+			//
+			// The FFT of a real N-point sequence has even symmetry in the
+			// frequency domain. The second half of the data equals the conjugate
+			// of the first half flipped in frequency. This conjugate part is not
+			// computed by the float RFFT. As consequence, the output of a N point
+			// real FFT should be a N//2 + 1 complex numbers so N + 2 floats.
+
+			// It happens that the first complex of number of the RFFT output is
+			// actually all real. Its real part represents the DC offset. The value
+			// at Nyquist frequency is also real.
+
+			// Those two complex numbers can be encoded with 2 floats rather than
+			// using two numbers with an imaginary part set to zero.
+
+			// The implementation is using a trick so that the output buffer can be
+			// N float : the last real is packaged in the imaginary part of the
+			// first complex (since this imaginary part is not used and is zero).
+
+			// The first "complex" is actually two reals, X[0] and X[N/2]
+			float first_real = (dst[0] < 0.0f) ? (-1.0f * dst[0]) : dst[0];
+			float second_real = (dst[1] < 0.0f) ? (-1.0f * dst[1]) : dst[1];
+
+			// Take the magnitude for all the complex values in between
+			arm_cmplx_mag_f32(dst + 2, mag + 1, window_size / 2);
+
+			// Fill in the two real numbers at 0 and N/2
+			mag[0] = first_real;
+			mag[128] = second_real;
+
+			// N+1 FFT output, reuse waveform array
+			for (uint32_t i = 0; i < 129; i++) {
+#ifdef PRINT_SPECTROGRAM
+				printf("%08f\n", mag[i]);
+#endif
+
+				// We can't overwrite waveform[129 * idx + 128] yet
+				// because we need it for the next iteration, so we need to store
+				// it separately
+				if (i < 128) {
+					((uint8_t*)waveform)[128 * idx + i] = (uint8_t)(mag[i] * 8.0f);
+				}
+				else {
+					last_ffts[idx] = (uint8_t)(mag[i] * 8.0f);
+				}
+			}
+		}
+
+		uint8_t* input_tensor = waveform;
+		uint32_t input_tensor_len = 124 * 129;
+
+		// We need to append an additional 124 bytes at the end of the spectrogram
+		// because we need to transform it from 128 to 129 points
+		// and insert the N+1 points
+		for (uint32_t idx = 123; idx > 0; idx--){
+			uint8_t tmp[128];
+			memcpy(tmp, waveform + (idx * 128), 128);
+			memcpy(waveform + (idx * 128 + idx), tmp, 128);
+			waveform[idx * 128 + (idx - 1)] = last_ffts[idx - 1];
+		}
+
+#ifdef PRINT_SPECTROGRAM
+		for (uint32_t i = 0; i < input_tensor_len; i++){
+			printf("%u\n", input_tensor[i]);
+		}
+#endif
 
 		size_t start_time = HAL_GetTick();
 		// Prepare input tensor
@@ -302,13 +332,17 @@ int main(int argc, char *argv[])
 		DEBUG_PRINTF("Time: #%08u\n", end_time - start_time);
 
 		print_shape(output);
-		const char* labels[] = {"NO", "YES"};
-		if (tflite::GetTensorData<uint8_t>(output)[0] > tflite::GetTensorData<uint8_t>(output)[1]){
-			SUCCESS_PRINTF("Prediction: @%s\n", labels[0])
-		}
-		else {
-			SUCCESS_PRINTF("Prediction: @%s\n", labels[1])
+		const char* labels[] = {"DOWN", "LEFT", "NO", "RIGHT", "UP", "YES"};
+
+		uint32_t pred = sizeof(labels);
+		uint32_t max_val = 0;
+		for (int32_t i = 0; i < output->dims->data[1]; i++){
+			SUCCESS_PRINTF("Prediction %s: %u\n", labels[i], output->data.uint8[i]);
+			if (output->data.uint8[i] > max_val){
+				max_val = output->data.uint8[i];
+				pred = i;
+			}
 		}
-		*/
+		SUCCESS_PRINTF("@%s\n", labels[pred]);
 	}
 }
diff --git a/tools/convert-wav.py b/tools/convert-wav.py
@@ -13,6 +13,7 @@ def wav_to_uint8(input_wav_path, output_bin_path):
         data = data.mean(axis=1)
 
     # Normalize the data to be between 0 and 1
+    data = data.astype(np.int64)
     data = (data - np.min(data)) / (np.max(data) - np.min(data))
 
     # Scale to uint8

diff --git a/tools/plot-wav.py b/tools/plot-wav.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+with open('output.bin', 'rb') as f:
+    data = np.frombuffer(f.read(), dtype=np.uint8)
+
+plt.plot(data, color='black', linewidth=0.5)
+plt.show()
diff --git a/tools/plots/spectrograms3.png b/tools/plots/spectrograms3.png