Merge pull request #36 from nhatdongdang/feat/more-optimize
Optimize memory use and fine-tune parameters based on the virtual machine spec
rozukke authored Jul 7, 2024
2 parents eed7019 + 6c69bfe commit bc05253
Showing 3 changed files with 15 additions and 19 deletions.
17 changes: 6 additions & 11 deletions src/main.c
@@ -16,7 +16,7 @@ typedef unsigned char u8;
#define NUM_LAYERS 7

#define TENSOR_SIZE 225
-#define TSIZE_ALGN_BYTES (((TENSOR_SIZE + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN) * sizeof(f32))
+#define TSIZE_ALIGN_BYTES (((TENSOR_SIZE + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32) * sizeof(f32))

matrix* weights[NUM_LAYERS];
vector* biases[NUM_LAYERS];
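
The rename also makes the arithmetic easier to audit: the macro rounds the 225-float tensor up to a whole number of SIMD_ALIGN_F32-float blocks, then converts to bytes. A minimal standalone check of that arithmetic, assuming the new values from src/matrix.h below (SIMD_ALIGN 32, so SIMD_ALIGN_F32 works out to 8 floats):

#include <assert.h>
#include <stdio.h>

typedef float f32;

#define TENSOR_SIZE 225
#define SIMD_ALIGN 32
#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32)) // 32 / 4 = 8 floats
#define TSIZE_ALIGN_BYTES (((TENSOR_SIZE + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32) * sizeof(f32))

int main(void) {
    // (225 + 7) / 8 = 29 blocks -> 232 floats -> 928 bytes per tensor slot
    printf("%zu bytes per tensor\n", TSIZE_ALIGN_BYTES);
    assert(TSIZE_ALIGN_BYTES % SIMD_ALIGN == 0); // a multiple of the 32-byte alignment
    return 0;
}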
@@ -28,7 +28,7 @@ char letters[52] = {'A', 'a', 'B', 'b', 'C', 'c', 'D', 'd', 'E', 'e', 'F', 'f',
void propagate_fwd(const matrix* weights, const vector* inputs, vector* results, const vector* biases) {
sgemv_t_tuned(weights->data, inputs->data, results->data, weights->cols, weights->rows);
// Add biases onto results
-vector_add_inplace(results->len, biases->data, results->data);
+vector_add_inplace(biases->len, biases->data, results->data);
}

// Basic version, too many aligned_alloc
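
The one-word fix above matters because the scratch vectors are reused across layers of different widths (see the next hunk), so results->len can still hold a stale length from a previous layer, while biases->len always equals the current layer's output size. For reference, a minimal sketch of what vector_add_inplace is assumed to do based on this call site; the real implementation lives elsewhere in the repo:

// Assumed shape of the helper, shown for illustration only.
static void vector_add_inplace(int len, const f32* src, f32* dst) {
    for (int i = 0; i < len; i++)
        dst[i] += src[i]; // add each bias onto the corresponding sgemv result
}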
@@ -89,31 +89,26 @@ u8 infer_reuse_layers_thread(vector* input, matrix** weights, vector** biases) {
propagate_fwd(weights[1], outputs[0], outputs[1], biases[1]);
relu_inplace(outputs[1]->data, 65);

-outputs[0]->len = 50;
memset(outputs[0]->data, 0, 50 * sizeof(f32));

propagate_fwd(weights[2], outputs[1], outputs[0], biases[2]);
relu_inplace(outputs[0]->data, 50);

-outputs[1]->len = 30;
memset(outputs[1]->data, 0, 30 * sizeof(f32));

propagate_fwd(weights[3], outputs[0], outputs[1], biases[3]);
relu_inplace(outputs[1]->data, 30);

-outputs[0]->len = 25;
memset(outputs[0]->data, 0, 25 * sizeof(f32));

propagate_fwd(weights[4], outputs[1], outputs[0], biases[4]);
relu_inplace(outputs[0]->data, 25);

-outputs[1]->len = 40;
memset(outputs[1]->data, 0, 40 * sizeof(f32));

propagate_fwd(weights[5], outputs[0], outputs[1], biases[5]);
relu_inplace(outputs[1]->data, 40);

-outputs[0]->len = 52;
memset(outputs[0]->data, 0, 52 * sizeof(f32));

propagate_fwd(weights[6], outputs[1], outputs[0], biases[6]);
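
With propagate_fwd now bounded by biases->len, the per-layer outputs[i]->len = N bookkeeping above is dead weight and is removed; the memset calls stay because the destination buffer is reused and (assuming sgemv_t_tuned accumulates into it) must be zeroed before each layer. The two scratch vectors simply ping-pong as source and destination, a pattern along these lines (a sketch with a hypothetical layer_out[] size table, not code from the repo):

// outputs[0] and outputs[1] alternate roles through the seven layers.
for (int l = 1; l < NUM_LAYERS; l++) {
    vector* src = outputs[(l + 1) % 2];
    vector* dst = outputs[l % 2];
    memset(dst->data, 0, layer_out[l] * sizeof(f32)); // clear reused scratch
    propagate_fwd(weights[l], src, dst, biases[l]);
    if (l < NUM_LAYERS - 1)
        relu_inplace(dst->data, layer_out[l]); // no ReLU after the final layer
}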
@@ -169,7 +164,7 @@ int main(int argc, char* argv[]) {
printf("Number of input tensors: %d\n", input_count);
printf("Iterations per input: %d\n", iter_per_in);

-f32* tensors = (f32*)aligned_alloc(SIMD_ALGN, TSIZE_ALGN_BYTES * input_count);
+f32* tensors = (f32*)aligned_alloc(SIMD_ALIGN, TSIZE_ALIGN_BYTES * input_count);

// Read and process inputs
char* file_path = (char*)malloc((256) * sizeof(char));
@@ -185,7 +180,7 @@ int main(int argc, char* argv[]) {
strcpy(file_path, directory_path);
strcat(file_path, "/");
strcat(file_path, entry->d_name);
-read_tensor((f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * (file_num - 1)], file_path);
+read_tensor((f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * (file_num - 1)], file_path);
}
}
closedir(dir);
@@ -209,7 +204,7 @@ int main(int argc, char* argv[]) {
// printf("Thread %d: Processing input %d\n", omp_get_thread_num(), i);

vector* input = new_vec_aligned(TENSOR_SIZE);
-memcpy(input->data, (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));
+memcpy(input->data, (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i], TENSOR_SIZE * sizeof(f32));

#pragma omp for
for (int j = 0; j < iter_per_in - 1; j++) {
@@ -230,7 +225,7 @@ int main(int argc, char* argv[]) {
vector* input = new_vec_aligned(TENSOR_SIZE);
u8* results = (u8*)malloc(input_count * sizeof(u8));
for (int i = 0; i < input_count; i++) {
-input->data = (f32*)&tensors[TSIZE_ALGN_BYTES / sizeof(f32) * i];
+input->data = (f32*)&tensors[TSIZE_ALIGN_BYTES / sizeof(f32) * i];
results[i] = infer_reuse_layers_thread(input, weights, biases);
}

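Note the index arithmetic that recurs throughout main.c: TSIZE_ALIGN_BYTES / sizeof(f32) converts the per-tensor byte stride back into a float stride, so with the numbers above each input occupies 928 / 4 = 232 float slots and tensor i starts 232 * i floats into the pool, keeping every tensor's first element 32-byte aligned. Equivalently (shown for clarity, not part of the diff):

f32* tensor_i = tensors + (TSIZE_ALIGN_BYTES / sizeof(f32)) * i; // = tensors + 232 * i
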
12 changes: 6 additions & 6 deletions src/matrix.c
@@ -15,8 +15,8 @@ matrix* new_matrix_aligned(int rows, int cols) {
new_mat->cols = cols;

// Align entire array for simd access and better cache line utilisation
-new_mat->data =
-(f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN));
+new_mat->data = (f32*)aligned_alloc(
+SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));

return new_mat;
}
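
The same round-up-then-allocate expression now appears three times across matrix.c; a hypothetical helper shows the idiom in one spot (a sketch, not code from this repo). One caveat worth noting: C11 aligned_alloc formally requires the requested size to be a multiple of the alignment, so when the quantity being rounded is already in bytes, the natural rounding unit is SIMD_ALIGN rather than SIMD_ALIGN_F32.

#include <stdlib.h>

// Hypothetical helper: round a float count up to a SIMD_ALIGN-multiple
// byte total, then allocate with matching alignment (C11 aligned_alloc).
static f32* alloc_aligned_f32(size_t count) {
    size_t bytes = (count * sizeof(f32) + SIMD_ALIGN - 1) / SIMD_ALIGN * SIMD_ALIGN;
    return (f32*)aligned_alloc(SIMD_ALIGN, bytes);
}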
@@ -30,8 +30,8 @@ vector* new_vec_aligned(int len) {
new_vec->len = len;

// Align entire array for simd access and better cache line utilisation
-new_vec->data =
-(f32*)aligned_alloc(SIMD_ALGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN));
+new_vec->data = (f32*)aligned_alloc(
+SIMD_ALIGN, (((kern_align_f32 * sizeof(f32)) + SIMD_ALIGN_F32 - 1) / SIMD_ALIGN_F32 * SIMD_ALIGN_F32));

memset(new_vec->data, 0, kern_align_f32 * sizeof(f32));

@@ -114,8 +114,8 @@ void transpose_mat_inplace(matrix* in) {
// Swapped for transpose
int pad_w_rows = (cols_before + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS;
int pad_w_width = (rows_before + KERN_COLS - 1) / KERN_COLS * KERN_COLS;
-f32* transposed = (f32*)aligned_alloc(
-SIMD_ALGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALGN - 1) / SIMD_ALGN * SIMD_ALGN));
+f32* transposed = (f32*)aligned_alloc(SIMD_ALIGN, (((pad_w_rows * pad_w_width * sizeof(f32)) + SIMD_ALIGN_F32 - 1) /
+SIMD_ALIGN_F32 * SIMD_ALIGN_F32));
memset(transposed, 0, pad_w_rows * pad_w_width * sizeof(f32));

for (int row = 0; row < rows_before; row++) {
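Padding both dimensions up to kernel multiples lets the tuned sgemv kernel process full KERN_ROWS x KERN_COLS tiles with no edge handling. Worked numbers for a hypothetical layer with 50 inputs and 30 outputs (sizes chosen only to match the relu_inplace calls in main.c; the actual weight shapes are not shown in this diff):

// Before the transpose swap: rows_before = 30, cols_before = 50.
int pad_w_rows  = (50 + KERN_ROWS - 1) / KERN_ROWS * KERN_ROWS; // (50 + 3) / 4 * 4 = 52
int pad_w_width = (30 + KERN_COLS - 1) / KERN_COLS * KERN_COLS; // (30 + 7) / 8 * 8 = 32
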
5 changes: 3 additions & 2 deletions src/matrix.h
@@ -5,8 +5,9 @@ typedef unsigned char u8;
typedef signed long i64;

#define KERN_COLS 8
-#define KERN_ROWS 2
-#define SIMD_ALGN 64
+#define KERN_ROWS 4
+#define SIMD_ALIGN 32
+#define SIMD_ALIGN_F32 (SIMD_ALIGN / sizeof(f32))

typedef struct vector {
int len;
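These header constants are where the commit title's "virtual machine spec" tuning lands: the kernel tile grows from 2 to 4 rows, and SIMD_ALIGN drops from 64 to 32 bytes, the width of one 256-bit AVX2 register (an inference from the alignment choice; the target ISA is not stated in the diff). A 32-byte-aligned pointer is exactly what an aligned 8-lane f32 load wants:

#include <immintrin.h>

// Illustrative only: _mm256_load_ps requires its pointer to be
// 32-byte aligned, which SIMD_ALIGN-aligned allocations guarantee.
static __m256 load_8_f32(const f32* p) {
    return _mm256_load_ps(p);
}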
