Skip to content

Commit

Permalink
Change quantization, add loop, expand model
Browse files Browse the repository at this point in the history
Classification in yes, no, left, right, up and down works okay
with the waveforms sent from the computer.

Signed-off-by: Stefan Gloor <code@stefan-gloor.ch>
  • Loading branch information
stgloorious committed Jun 10, 2024
1 parent 1724f6e commit 36b6aa8
Show file tree
Hide file tree
Showing 7 changed files with 189 additions and 115 deletions.
4 changes: 2 additions & 2 deletions ld/stm32l475vgtx.ld
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ ENTRY(Reset_Handler)
/* Highest address of the user mode stack */
_estack = ORIGIN(RAM) + LENGTH(RAM); /* end of "RAM" Ram type memory */

_Min_Heap_Size = 0x1000; /* required amount of heap */
_Min_Stack_Size = 0x4000; /* required amount of stack */
_Min_Heap_Size = 0x100; /* required amount of heap */
_Min_Stack_Size = 0x2000; /* required amount of stack */

/* Memories definition */
MEMORY
Expand Down
29 changes: 18 additions & 11 deletions ml/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@

DATASET_PATH = 'data/mini_speech_commands'

keywords = ['yes', 'no', 'left', 'right', 'up', 'down']

data_dir = pathlib.Path(DATASET_PATH)
if not data_dir.exists():
tf.keras.utils.get_file(
Expand All @@ -44,14 +46,14 @@
extract=True,
cache_dir='.', cache_subdir='data')
# Delete the unwanted parts of the dataset
shutil.rmtree(os.path.join(data_dir, 'left'))
shutil.rmtree(os.path.join(data_dir, 'right'))
shutil.rmtree(os.path.join(data_dir, 'up'))
shutil.rmtree(os.path.join(data_dir, 'down'))
#shutil.rmtree(os.path.join(data_dir, 'left'))
#shutil.rmtree(os.path.join(data_dir, 'right'))
#shutil.rmtree(os.path.join(data_dir, 'up'))
#shutil.rmtree(os.path.join(data_dir, 'down'))
shutil.rmtree(os.path.join(data_dir, 'go'))
shutil.rmtree(os.path.join(data_dir, 'stop'))

for data in ['yes', 'no']:
for data in keywords:
curr_dir = os.path.join(data_dir, data)
all_files = [os.path.join(curr_dir, fn) for fn in os.listdir(curr_dir) if fn.endswith('.wav')]

Expand Down Expand Up @@ -168,11 +170,12 @@ def make_spec_ds(ds):
layers.Resizing(32, 32),
# Normalize.
norm_layer,
layers.Conv2D(32, 3, activation='relu'),
layers.Conv2D(16, 3, activation='relu'),
layers.MaxPooling2D(),
layers.Dropout(0.25),
layers.Flatten(),
layers.Dense(8, activation='relu'),
layers.Dense(24, activation='relu'),
layers.Dropout(0.5),
layers.Dense(num_labels),
])
Expand Down Expand Up @@ -258,8 +261,8 @@ def representative_dataset():

# ## Run inference on an audio file

testfile = os.listdir(data_dir/'test/yes/')[0]
x = tf.io.read_file(str(os.path.join(data_dir/'test/yes/', testfile)))
testfile = os.listdir(data_dir/'test/no/')[0]
x = tf.io.read_file(str(os.path.join(data_dir/'test/no/', testfile)))
x, sample_rate = tf.audio.decode_wav(x, desired_channels=1, desired_samples=16000,)
x = tf.squeeze(x, axis=-1)
waveform = x
Expand All @@ -269,7 +272,7 @@ def representative_dataset():
prediction = model(x)
values = tf.nn.softmax(prediction[0])
print('Prediction of float model:')
label_names = ['no', 'yes']
label_names = keywords
for i in range(len(label_names)):
print(f'{label_names[i]}: {values[i]:.2%}')

Expand All @@ -283,8 +286,12 @@ def representative_dataset():
interpreter.allocate_tensors()

# Input data
input_data = np.array(x)
preprocessed_input_data = (input_data * 256).astype('uint8')
spec = np.array(x)
# Multiplying by 256 is not a proper quantization; we should normalize
# first. However, due to memory and performance constraints this is difficult
# to reproduce on the microcontroller, and just multiplying by 256 seems to
# work well enough.
preprocessed_input_data = (spec * 256).astype('uint8')

with open('sample_input.bin', 'wb') as f:
f.write(preprocessed_input_data)
Expand Down
224 changes: 129 additions & 95 deletions src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,12 @@ limitations under the License.
#include "mic.h"
int dfsdm_conversion_done;

const int kTensorArenaSize = 39 * 1024;
alignas(16) static uint8_t tensor_arena[kTensorArenaSize] = { 0x55 };

static char waveform[16500];
const int kTensorArenaSize = 66800;
alignas(16) static uint8_t tensor_arena[kTensorArenaSize];

static uint8_t waveform[16128];
static uint8_t last_ffts[125];

#define DEBUG_PRINTF(...) \
{ \
Expand Down Expand Up @@ -131,11 +133,6 @@ int main(int argc, char *argv[])
microphone.dump_recording();
*/

uint32_t waveform_len = serial_recv(waveform, sizeof(waveform));
if (waveform_len == 0) {
assert(!"Transfer failed.");
}

const static uint32_t window_size = 256;
const static uint32_t frame_step = 128;

Expand All @@ -144,84 +141,9 @@ int main(int argc, char *argv[])
assert(!"Failed to init RFFT");
}

float min = 999999.0f;
float max = 0;
for (uint32_t i = 0; i < waveform_len; i++) {
float val = (float)((uint8_t)waveform[i]);
if (val < min) {
min = val;
}
if (val > max) {
max = val;
}
}

float hanning[window_size];
static float hanning[window_size];
arm_hanning_f32(hanning, window_size);

for (uint32_t idx = 0; idx < 124; idx++) {
static float dst[window_size];
static float mag[window_size + 1];
double sum = 0;

static float signal_chunk[window_size];

for (uint32_t i = 0; i < window_size; i++) {
signal_chunk[i] =
(float)((uint8_t)waveform[idx * frame_step + i]);

// Normalize from -1 to 1
signal_chunk[i] = (2.0f * (signal_chunk[i] - min) / (max - min)) - 1;
sum += signal_chunk[i];
}

// Remove DC component
float mean = (float)(sum / (double)window_size);
for (uint32_t i = 0; i < window_size; i++) {
signal_chunk[i] = signal_chunk[i] - mean;

// Apply window function
signal_chunk[i] *= hanning[i];
}

arm_rfft_fast_f32(&fft, signal_chunk, dst, 0);

// From the CMSIS documentation:
// https://arm-software.github.io/CMSIS-DSP/latest/group__RealFFT.html
//
// The FFT of a real N-point sequence has even symmetry in the
// frequency domain. The second half of the data equals the conjugate
// of the first half flipped in frequency. This conjugate part is not
// computed by the float RFFT. As consequence, the output of a N point
// real FFT should be a N//2 + 1 complex numbers so N + 2 floats.

// It happens that the first complex number of the RFFT output is
// actually all real. Its real part represents the DC offset. The value
// at Nyquist frequency is also real.

// Those two complex numbers can be encoded with 2 floats rather than
// using two numbers with an imaginary part set to zero.

// The implementation is using a trick so that the output buffer can be
// N float : the last real is packaged in the imaginary part of the
// first complex (since this imaginary part is not used and is zero).

// The first "complex" is actually two reals, X[0] and X[N/2]
float first_real = (dst[0] < 0.0f) ? (-1.0f * dst[0]) : dst[0];
float second_real = (dst[1] < 0.0f) ? (-1.0f * dst[1]) : dst[1];

// Take the magnitude for all the complex values in between
arm_cmplx_mag_f32(dst + 2, mag + 1, window_size / 2);

// Fill in the two real numbers at 0 and N/2
mag[0] = first_real;
mag[128] = second_real;

// Send N+1 FFT output
for (uint32_t i = 0; i < 129; i++) {
printf("%.06f\n", mag[i]);
}
}

const tflite::Model *model = tflite::GetModel(model_tflite);
DEBUG_PRINTF("Model architecture:\n");
Expand Down Expand Up @@ -260,7 +182,7 @@ int main(int argc, char *argv[])

// Interpreter
tflite::MicroInterpreter interpreter(model, op_resolver, tensor_arena,
kTensorArenaSize);
kTensorArenaSize);
DEBUG_PRINTF("MicroInterpreter initialized.\n");

if (interpreter.AllocateTensors() != kTfLiteOk) {
Expand All @@ -269,9 +191,117 @@ int main(int argc, char *argv[])
DEBUG_PRINTF("MicroInterpreter tensors allocated.\n");

while (1) {
/*
size_t input_tensor_len = serial_recv(input_tensor, sizeof(input_tensor));
DEBUG_PRINTF("Received %u bytes.\n", input_tensor_len);
uint32_t waveform_len = serial_recv((char*)waveform, sizeof(waveform));
if (waveform_len == 0) {
assert(!"Transfer failed.");
}

float min = 999999.0f;
float max = 0;
for (uint32_t i = 0; i < waveform_len; i++) {
float val = (float)(waveform[i]);
if (val < min) {
min = val;
}
if (val > max) {
max = val;
}
}

for (uint32_t idx = 0; idx < 124; idx++) {
float dst[window_size];
static float mag[window_size + 1];
double sum = 0;

static float* signal_chunk = mag;

for (uint32_t i = 0; i < window_size; i++) {
signal_chunk[i] =
(float)((uint8_t)waveform[idx * frame_step + i]);

// Normalize from -1 to 1
signal_chunk[i] = (2.0f * (signal_chunk[i] - min) / (max - min)) - 1;
sum += signal_chunk[i];
}

// Remove DC component
float mean = (float)(sum / (double)window_size);
for (uint32_t i = 0; i < window_size; i++) {
signal_chunk[i] = signal_chunk[i] - mean;

// Apply window function
signal_chunk[i] *= hanning[i];
}

arm_rfft_fast_f32(&fft, signal_chunk, dst, 0);

// From the CMSIS documentation:
// https://arm-software.github.io/CMSIS-DSP/latest/group__RealFFT.html
//
// The FFT of a real N-point sequence has even symmetry in the
// frequency domain. The second half of the data equals the conjugate
// of the first half flipped in frequency. This conjugate part is not
// computed by the float RFFT. As consequence, the output of a N point
// real FFT should be a N//2 + 1 complex numbers so N + 2 floats.

// It happens that the first complex number of the RFFT output is
// actually all real. Its real part represents the DC offset. The value
// at Nyquist frequency is also real.

// Those two complex numbers can be encoded with 2 floats rather than
// using two numbers with an imaginary part set to zero.

// The implementation is using a trick so that the output buffer can be
// N float : the last real is packaged in the imaginary part of the
// first complex (since this imaginary part is not used and is zero).

// The first "complex" is actually two reals, X[0] and X[N/2]
float first_real = (dst[0] < 0.0f) ? (-1.0f * dst[0]) : dst[0];
float second_real = (dst[1] < 0.0f) ? (-1.0f * dst[1]) : dst[1];

// Take the magnitude for all the complex values in between
arm_cmplx_mag_f32(dst + 2, mag + 1, window_size / 2);

// Fill in the two real numbers at 0 and N/2
mag[0] = first_real;
mag[128] = second_real;

// N+1 FFT output, reuse waveform array
for (uint32_t i = 0; i < 129; i++) {
#ifdef PRINT_SPECTROGRAM
printf("%08f\n", mag[i]);
#endif

// We can't overwrite waveform[129 * idx + 128] yet
// because we need it for the next iteration, so we need to store
// it separately
if (i < 128) {
((uint8_t*)waveform)[128 * idx + i] = (uint8_t)(mag[i] * 8.0f);
}
else {
last_ffts[idx] = (uint8_t)(mag[i] * 8.0f);
}
}
}

uint8_t* input_tensor = waveform;
uint32_t input_tensor_len = 124 * 129;

// We need to append an additional 124 bytes at the end of the spectrogram
// because we need to transform it from 128 to 129 points
// and insert the N+1 points
for (uint32_t idx = 123; idx > 0; idx--){
uint8_t tmp[128];
memcpy(tmp, waveform + (idx * 128), 128);
memcpy(waveform + (idx * 128 + idx), tmp, 128);
waveform[idx * 128 + (idx - 1)] = last_ffts[idx - 1];
}

#ifdef PRINT_SPECTROGRAM
for (uint32_t i = 0; i < input_tensor_len; i++){
printf("%u\n", input_tensor[i]);
}
#endif

size_t start_time = HAL_GetTick();
// Prepare input tensor
Expand Down Expand Up @@ -302,13 +332,17 @@ int main(int argc, char *argv[])
DEBUG_PRINTF("Time: #%08u\n", end_time - start_time);

print_shape(output);
const char* labels[] = {"NO", "YES"};
if (tflite::GetTensorData<uint8_t>(output)[0] > tflite::GetTensorData<uint8_t>(output)[1]){
SUCCESS_PRINTF("Prediction: @%s\n", labels[0])
}
else {
SUCCESS_PRINTF("Prediction: @%s\n", labels[1])
const char* labels[] = {"DOWN", "LEFT", "NO", "RIGHT", "UP", "YES"};

uint32_t pred = sizeof(labels);
uint32_t max_val = 0;
for (int32_t i = 0; i < output->dims->data[1]; i++){
SUCCESS_PRINTF("Prediction %s: %u\n", labels[i], output->data.uint8[i]);
if (output->data.uint8[i] > max_val){
max_val = output->data.uint8[i];
pred = i;
}
}
*/
SUCCESS_PRINTF("@%s\n", labels[pred]);
}
}
1 change: 1 addition & 0 deletions tools/convert-wav.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def wav_to_uint8(input_wav_path, output_bin_path):
data = data.mean(axis=1)

# Normalize the data to be between 0 and 1
data = data.astype(np.int64)
data = (data - np.min(data)) / (np.max(data) - np.min(data))

# Scale to uint8
Expand Down
10 changes: 10 additions & 0 deletions tools/plot-wav.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env python3

import matplotlib.pyplot as plt
import numpy as np

# Read output.bin in binary mode and view its bytes as a flat uint8 array.
with open('output.bin', 'rb') as capture:
    raw_bytes = capture.read()
samples = np.frombuffer(raw_bytes, dtype=np.uint8)

# Draw the values as a thin black line and open the plot window.
line_style = {'color': 'black', 'linewidth': 0.5}
plt.plot(samples, **line_style)
plt.show()
Binary file added tools/plots/spectrograms3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 36b6aa8

Please sign in to comment.