GPU optimize and fix precision loss #30

Merged
5 commits merged on Jul 6, 2024
6 changes: 3 additions & 3 deletions CMakeLists.txt
@@ -5,12 +5,12 @@ project(ichida-algo LANGUAGES C CXX CUDA)

# Set compiler flags
set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED True)
set(CMAKE_VERBOSE_MAKEFILE ON)

# Ensure CUDA NVCC flags are set properly
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -arch=sm_75")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")

set(INC_DIR include)
set(SRC_DIR src)
@@ -23,4 +23,4 @@ file(GLOB_RECURSE CUDA_SOURCE_FILES ${SRC_DIR}/*.cu)
# Create GPU executable
add_executable(speed_gpu ${CUDA_SOURCE_FILES})
set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(speed_gpu m)
target_link_libraries(speed_gpu m)
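
Aside on the NVCC flag change above: the build now targets -arch=sm_80 (compute capability 8.0, Ampere-class parts such as the A100, whose 108 SMs match the BLOCKS constant added in main.cu) instead of sm_75, so kernels from this binary will not load on pre-Ampere GPUs. A minimal, hypothetical compatibility check, not part of this PR:

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative only: verify that device 0 can run a build compiled with -arch=sm_80.
int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        printf("No CUDA device visible\n");
        return 1;
    }
    // -arch=sm_80 ships SASS for compute 8.0 plus PTX that newer majors can JIT,
    // so any device below major version 8 cannot load these kernels.
    int ok = prop.major >= 8;
    printf("%s is compute %d.%d: %s\n", prop.name, prop.major, prop.minor,
           ok ? "can run an sm_80 build" : "cannot run an sm_80 build");
    return ok ? 0 : 1;
}
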
2 changes: 1 addition & 1 deletion Makefile
@@ -17,6 +17,6 @@ run: build
./speed_gpu ./weights_and_biases.txt ./tensors 100000

test: build
./speed_gpu ./weights_and_biases.txt ./tensors 100000
./speed_gpu ./weights_and_biases.txt ./tensors 1000000
mv ./results.csv ./test
python3 ./test/verify_csv.py
143 changes: 93 additions & 50 deletions src/main.cu
@@ -23,8 +23,9 @@ matrix* biases[NUM_LAYERS];
// device weights and biases;
matrix** d_weights;
matrix** d_biases;
matrix** d_inputs;

float* inputs;
float* d_inputs;
int* results;
int* d_results;

@@ -83,7 +84,7 @@ void read_model(const char* file_name) {
fclose(file);
}

void read_tensor(matrix* a, const char* fileName) {
void read_tensor(float* a, const char* fileName) {
FILE* file = fopen(fileName, "r");
char* line = NULL;
size_t len = 0;
@@ -96,64 +97,64 @@ void read_tensor(matrix* a, const char* fileName) {

for (int i = 0; i < 225; i++) {
value = strtof(token, NULL);
(a->data)[i] = value;
a[i] = value;
token = strtok(NULL, delimiter);
}
free(line);
fclose(file);
}

__device__ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, matrix* biases) {
matrix_mul(weights->data, input_layer->data, output_layer->data, weights->rows, weights->cols);
matrix_add(output_layer->data, biases->data, biases->rows);
__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) {
matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols);
matrix_add(output_layer, biases->data, biases->rows);
}

__global__ void infer(matrix** d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
#define BLOCKS 108
#define THREADS_PER_BLOCK 1024

__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
int in_num) {

__shared__ float sharedInput[225];
float out1[98];
float out2[65];

int num_threads = blockDim.x * gridDim.x;
int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);

if (thread_idx > it_per_input) return;
float* input = (float*)&d_inputs[in_num * 225];

matrix* input = d_inputs[in_num];

matrix* outputs[2];
outputs[0] = new_matrix(98, 1);
outputs[1] = new_matrix(65, 1);
if (threadIdx.x < 225) {
sharedInput[threadIdx.x] = input[threadIdx.x];
}
__syncthreads();

for (int i = thread_idx; i < it_per_input; i += num_threads) {
propagate_fwd(d_weights[0], input, outputs[0], d_biases[0]);
relu(outputs[0]->data, 98);
propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]);
relu(out1, 98);

propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]);
relu(outputs[1]->data, 65);
propagate_fwd(d_weights[1], out1, out2, d_biases[1]);
relu(out2, 65);

propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]);
relu(outputs[0]->data, 50);
propagate_fwd(d_weights[2], out2, out1, d_biases[2]);
relu(out1, 50);

propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]);
relu(outputs[1]->data, 30);
propagate_fwd(d_weights[3], out1, out2, d_biases[3]);
relu(out2, 30);

propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]);
relu(outputs[0]->data, 25);
propagate_fwd(d_weights[4], out2, out1, d_biases[4]);
relu(out1, 25);

propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]);
relu(outputs[1]->data, 40);
propagate_fwd(d_weights[5], out1, out2, d_biases[5]);
relu(out2, 40);

propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]);
softmax(outputs[0]->data, 52);
propagate_fwd(d_weights[6], out2, out1, d_biases[6]);
softmax(out1, 52);

int res = argmax(outputs[0]->data, 52);
d_results[in_num] = res;
d_results[in_num] = argmax(out1, 52);
}
free(outputs[0]->data);
free(outputs[0]);
free(outputs[1]->data);
free(outputs[1]);
}

#define IT_PER_IN 1000000

int main(int argc, char* argv[]) {
if (argc < 4) {
printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
@@ -186,8 +187,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < NUM_LAYERS; i++) {
matrix* a = copy_to_device(weights[i]);
matrix* b = copy_to_device(biases[i]);
matrix** z = &(d_weights[i]);
CUDA_CHECK(cudaMemcpy(z, &a, sizeof(matrix*), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice));
}

@@ -208,39 +208,82 @@ int main(int argc, char* argv[]) {
}

results = (int*)malloc((input_count) * sizeof(int));
memset(results, 0, sizeof(int) * (input_count));
inputs = (float*)malloc((input_count) * sizeof(float) * 225);

cudaMalloc(&d_results, (input_count) * sizeof(int));
cudaMalloc(&d_inputs, (input_count) * sizeof(matrix*));
cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225);

dir = opendir(directory_path);
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_REG) {
matrix* input = new_matrix(225, 1);
strcpy(file_num_str, entry->d_name);
file_num_str[strlen(entry->d_name) - 7] = '\0';
file_num = atoi(entry->d_name);
strcpy(file_name, directory_path);
strcat(file_name, "/");
strcat(file_name, entry->d_name);
read_tensor(input, file_name);
matrix* temp = copy_to_device(input);
cudaMemcpy(&d_inputs[file_num - 1], &temp, sizeof(matrix*), cudaMemcpyHostToDevice);
free(input);
read_tensor((float*)&inputs[(file_num - 1) * 225], file_name);
}
}

free(file_name);
free(file_num_str);
closedir(dir);

cudaMemset(d_results, 0, sizeof(int) * input_count);
cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);

int deviceCount;
cudaError_t err = cudaGetDeviceCount(&deviceCount);
if (err != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(err));
return -1;
}

for (int i = 0; i < deviceCount; ++i) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("Device %d:\n", i);
printf(" Device Name: %s\n", prop.name);
printf(" Compute Capability: %d.%d\n", prop.major, prop.minor);
printf(" Total Global Memory: %lu bytes\n", prop.totalGlobalMem);
printf(" Shared Memory per Block: %lu bytes\n", prop.sharedMemPerBlock);
printf(" Registers per Block: %d\n", prop.regsPerBlock);
printf(" Warp Size: %d\n", prop.warpSize);
printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
printf(" Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor);
printf(" Max Threads Dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2]);
printf(" Max Grid Size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf(" Clock Rate: %d kHz\n", prop.clockRate);
printf(" Total Constant Memory: %lu bytes\n", prop.totalConstMem);
printf(" Multiprocessor Count: %d\n", prop.multiProcessorCount);
printf(" Memory Clock Rate: %d kHz\n", prop.memoryClockRate);
printf(" Memory Bus Width: %d bits\n", prop.memoryBusWidth);
printf(" L2 Cache Size: %d bytes\n", prop.l2CacheSize);
printf("\n");
}

int minGridSize, blockSize;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, infer, 0, 0);
printf("Recommended block size: %d Grid size: %d\n", blockSize, minGridSize);

int it_num = atoi(argv[3]);
struct timeval stop1, start1;
gettimeofday(&start1, NULL);

int iter_per_in = atoi(argv[3]);
cudaDeviceSynchronize();
for (int i = 0; i < input_count; i++) {
infer<<<108, 69>>>(d_inputs, d_results, d_weights, d_biases, iter_per_in, i);
infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, it_num, i);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA error: %s\n", cudaGetErrorString(err));
}
}

cudaDeviceSynchronize();

cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
gettimeofday(&stop1, NULL);
printf("- Inference: %lu us\n", (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);

FILE* csv_file = fopen("results.csv", "w+");
fprintf(csv_file, "image_number, guess\n");
@@ -251,7 +294,7 @@ int main(int argc, char* argv[]) {

// Time taken
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
printf("- Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);

return EXIT_SUCCESS;
}
}
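
For readers skimming the main.cu diff: the rewritten infer kernel stages the 225-float input into shared memory once per block, swaps the device-side new_matrix allocations for fixed-size per-thread scratch arrays (out1/out2), and grid-strides over the requested iteration count. A condensed, self-contained sketch of that pattern follows; it uses a single dummy dense layer and made-up names (infer_sketch, OUT_SIZE) in place of the real seven-layer network, so it is illustrative rather than a drop-in replacement.

#include <cstdio>
#include <cuda_runtime.h>

#define INPUT_SIZE 225
#define OUT_SIZE 98
#define BLOCKS 108
#define THREADS_PER_BLOCK 1024

// One dummy dense layer + ReLU + argmax, repeated it_per_input times across the grid.
__global__ void infer_sketch(const float* d_input, const float* d_w, const float* d_b,
                             int* d_result, int it_per_input) {
    __shared__ float sharedInput[INPUT_SIZE];   // staged once per block
    float out[OUT_SIZE];                        // per-thread scratch, no device malloc

    int num_threads = blockDim.x * gridDim.x;
    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIdx.x < INPUT_SIZE)
        sharedInput[threadIdx.x] = d_input[threadIdx.x];
    __syncthreads();

    for (int i = thread_idx; i < it_per_input; i += num_threads) {
        for (int r = 0; r < OUT_SIZE; r++) {    // out = relu(W * x + b)
            float acc = d_b[r];
            for (int c = 0; c < INPUT_SIZE; c++)
                acc += d_w[r * INPUT_SIZE + c] * sharedInput[c];
            out[r] = acc > 0.0f ? acc : 0.0f;
        }
        int best = 0;                           // argmax, as in the real kernel
        for (int r = 1; r < OUT_SIZE; r++)
            if (out[r] > out[best]) best = r;
        *d_result = best;                       // every iteration writes the same answer
    }
}

int main() {
    float *d_in, *d_w, *d_b;
    int* d_res;
    cudaMalloc(&d_in, INPUT_SIZE * sizeof(float));
    cudaMalloc(&d_w, OUT_SIZE * INPUT_SIZE * sizeof(float));
    cudaMalloc(&d_b, OUT_SIZE * sizeof(float));
    cudaMalloc(&d_res, sizeof(int));
    cudaMemset(d_in, 0, INPUT_SIZE * sizeof(float));
    cudaMemset(d_w, 0, OUT_SIZE * INPUT_SIZE * sizeof(float));
    cudaMemset(d_b, 0, OUT_SIZE * sizeof(float));

    infer_sketch<<<BLOCKS, THREADS_PER_BLOCK>>>(d_in, d_w, d_b, d_res, 100000);
    cudaDeviceSynchronize();

    int res = -1;
    cudaMemcpy(&res, d_res, sizeof(int), cudaMemcpyDeviceToHost);
    printf("argmax = %d\n", res);
    return 0;
}
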
15 changes: 7 additions & 8 deletions src/matrix.cu
@@ -26,7 +26,6 @@ matrix* new_matrix_d(int rows, int cols) {
float* data;
cudaMalloc(&data, rows * cols * sizeof(float));
alloc<<<1, 1>>>(res, data, rows, cols);
cudaDeviceSynchronize();
return res;
}

@@ -37,7 +36,6 @@ matrix* copy_to_device(matrix* h_mat) {
cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols);
cudaDeviceSynchronize();
return res;
}

@@ -89,17 +87,18 @@ static __device__ inline float fastexp(float x) {
}

__device__ void softmax(float* a, int rows) {
float res = (float)0;
for (int i = 0; i < rows; i++) {
res += exp(a[i]);
float sum = 0.0;
for (size_t i = 0; i < rows; i++) {
sum += __expf(a[i]);
}
for (int i = 0; i < rows; i++) {
a[i] /= res;
float t = __logf(sum);
for (size_t i = 0; i < rows; i++) {
a[i] = __expf(a[i] - t);
}
}

__device__ int argmax(float* a, int rows) {
int res = a[0];
float res = a[0];
int idx = 0;
for (int i = 0; i < rows; i++) {
if (res < a[i]) {
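
On the precision fix in softmax above: instead of accumulating sum += __expf(a[i]) and then dividing every element by that sum, the new code computes a[i] = __expf(a[i] - __logf(sum)), keeping everything in the fast-intrinsic path. A common, more conservative variant, which this PR does not use, also subtracts the maximum logit before exponentiating so the exponential never overflows for large inputs. A minimal sketch, written as a __host__ __device__ helper so it can be sanity-checked on the CPU (names are illustrative):

#include <cstdio>
#include <math.h>

// Max-subtracted softmax: mathematically identical, but exp() never sees a huge
// argument, so it cannot overflow to inf and the sum stays well-scaled.
__host__ __device__ void softmax_stable(float* a, int n) {
    float m = a[0];
    for (int i = 1; i < n; i++)
        if (a[i] > m) m = a[i];
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        a[i] = expf(a[i] - m);   // on the device this could use __expf as in the PR
        sum += a[i];
    }
    for (int i = 0; i < n; i++)
        a[i] /= sum;             // safe: sum >= 1 because the max logit maps to exp(0) = 1
}

int main() {
    float logits[5] = {1000.0f, 1001.0f, 999.0f, 998.0f, 1002.0f}; // naive expf overflows here
    softmax_stable(logits, 5);
    for (int i = 0; i < 5; i++)
        printf("%f ", logits[i]);
    printf("\n");
    return 0;
}
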