GPU optimize and fix precision loss #30

Merged
5 commits merged on Jul 6, 2024
6 changes: 3 additions & 3 deletions CMakeLists.txt
@@ -5,12 +5,12 @@ project(ichida-algo LANGUAGES C CXX CUDA)

# Set compiler flags
set(CMAKE_C_FLAGS "-O3 -march=native -ffast-math -funroll-loops -Wall -Wextra")
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED True)
set(CMAKE_VERBOSE_MAKEFILE ON)

# Ensure CUDA NVCC flags are set properly
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -arch=sm_75")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas -O3 --use_fast_math -Xcompiler -march=native -unroll-aggressive -arch=sm_80")

set(INC_DIR include)
set(SRC_DIR src)
@@ -23,4 +23,4 @@ file(GLOB_RECURSE CUDA_SOURCE_FILES ${SRC_DIR}/*.cu)
# Create GPU executable
add_executable(speed_gpu ${CUDA_SOURCE_FILES})
set_target_properties(speed_gpu PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(speed_gpu m)
target_link_libraries(speed_gpu m)
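
Aside on the NVCC flag change above: the build now targets -arch=sm_80 (compute capability 8.0, Ampere-class parts such as the A100, whose 108 SMs match the BLOCKS constant added in main.cu) instead of sm_75, so kernels from this binary will not load on pre-Ampere GPUs. A minimal, hypothetical compatibility check, not part of this PR:

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative only: verify that device 0 can run a build compiled with -arch=sm_80.
int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        printf("No CUDA device visible\n");
        return 1;
    }
    // -arch=sm_80 ships SASS for compute 8.0 plus PTX that newer majors can JIT,
    // so any device below major version 8 cannot load these kernels.
    int ok = prop.major >= 8;
    printf("%s is compute %d.%d: %s\n", prop.name, prop.major, prop.minor,
           ok ? "can run an sm_80 build" : "cannot run an sm_80 build");
    return ok ? 0 : 1;
}
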
2 changes: 1 addition & 1 deletion Makefile
@@ -17,6 +17,6 @@ run: build
./speed_gpu ./weights_and_biases.txt ./tensors 100000

test: build
./speed_gpu ./weights_and_biases.txt ./tensors 100000
./speed_gpu ./weights_and_biases.txt ./tensors 1000000
mv ./results.csv ./test
python3 ./test/verify_csv.py
143 changes: 93 additions & 50 deletions src/main.cu
@@ -23,8 +23,9 @@ matrix* biases[NUM_LAYERS];
// device weights and biases;
matrix** d_weights;
matrix** d_biases;
matrix** d_inputs;

float* inputs;
float* d_inputs;
int* results;
int* d_results;

@@ -83,7 +84,7 @@ void read_model(const char* file_name) {
fclose(file);
}

void read_tensor(matrix* a, const char* fileName) {
void read_tensor(float* a, const char* fileName) {
FILE* file = fopen(fileName, "r");
char* line = NULL;
size_t len = 0;
@@ -96,64 +97,64 @@ void read_tensor(matrix* a, const char* fileName) {

for (int i = 0; i < 225; i++) {
value = strtof(token, NULL);
(a->data)[i] = value;
a[i] = value;
token = strtok(NULL, delimiter);
}
free(line);
fclose(file);
}

__device__ void propagate_fwd(matrix* weights, matrix* input_layer, matrix* output_layer, matrix* biases) {
matrix_mul(weights->data, input_layer->data, output_layer->data, weights->rows, weights->cols);
matrix_add(output_layer->data, biases->data, biases->rows);
__device__ void propagate_fwd(matrix* weights, float* input_layer, float* output_layer, matrix* biases) {
matrix_mul(weights->data, input_layer, output_layer, weights->rows, weights->cols);
matrix_add(output_layer, biases->data, biases->rows);
}

__global__ void infer(matrix** d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
#define BLOCKS 108
#define THREADS_PER_BLOCK 1024

__global__ void infer(float* d_inputs, int* d_results, matrix** d_weights, matrix** d_biases, int it_per_input,
int in_num) {

__shared__ float sharedInput[225];
float out1[98];
float out2[65];

int num_threads = blockDim.x * gridDim.x;
int thread_idx = (blockIdx.x * blockDim.x + threadIdx.x);

if (thread_idx > it_per_input) return;
float* input = (float*)&d_inputs[in_num * 225];

matrix* input = d_inputs[in_num];

matrix* outputs[2];
outputs[0] = new_matrix(98, 1);
outputs[1] = new_matrix(65, 1);
if (threadIdx.x < 225) {
sharedInput[threadIdx.x] = input[threadIdx.x];
}
__syncthreads();

for (int i = thread_idx; i < it_per_input; i += num_threads) {
propagate_fwd(d_weights[0], input, outputs[0], d_biases[0]);
relu(outputs[0]->data, 98);
propagate_fwd(d_weights[0], sharedInput, out1, d_biases[0]);
relu(out1, 98);

propagate_fwd(d_weights[1], outputs[0], outputs[1], d_biases[1]);
relu(outputs[1]->data, 65);
propagate_fwd(d_weights[1], out1, out2, d_biases[1]);
relu(out2, 65);

propagate_fwd(d_weights[2], outputs[1], outputs[0], d_biases[2]);
relu(outputs[0]->data, 50);
propagate_fwd(d_weights[2], out2, out1, d_biases[2]);
relu(out1, 50);

propagate_fwd(d_weights[3], outputs[0], outputs[1], d_biases[3]);
relu(outputs[1]->data, 30);
propagate_fwd(d_weights[3], out1, out2, d_biases[3]);
relu(out2, 30);

propagate_fwd(d_weights[4], outputs[1], outputs[0], d_biases[4]);
relu(outputs[0]->data, 25);
propagate_fwd(d_weights[4], out2, out1, d_biases[4]);
relu(out1, 25);

propagate_fwd(d_weights[5], outputs[0], outputs[1], d_biases[5]);
relu(outputs[1]->data, 40);
propagate_fwd(d_weights[5], out1, out2, d_biases[5]);
relu(out2, 40);

propagate_fwd(d_weights[6], outputs[1], outputs[0], d_biases[6]);
softmax(outputs[0]->data, 52);
propagate_fwd(d_weights[6], out2, out1, d_biases[6]);
softmax(out1, 52);

int res = argmax(outputs[0]->data, 52);
d_results[in_num] = res;
d_results[in_num] = argmax(out1, 52);
}
free(outputs[0]->data);
free(outputs[0]);
free(outputs[1]->data);
free(outputs[1]);
}

#define IT_PER_IN 1000000

int main(int argc, char* argv[]) {
if (argc < 4) {
printf("Not enough arguments. Usage: speed_cpu <path_to_model.txt> <tensors_dir/> <number_of_inferences>\n");
@@ -186,8 +187,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < NUM_LAYERS; i++) {
matrix* a = copy_to_device(weights[i]);
matrix* b = copy_to_device(biases[i]);
matrix** z = &(d_weights[i]);
CUDA_CHECK(cudaMemcpy(z, &a, sizeof(matrix*), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&(d_weights[i]), &a, sizeof(matrix*), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(&(d_biases[i]), &b, sizeof(matrix*), cudaMemcpyHostToDevice));
}

@@ -208,39 +208,82 @@ int main(int argc, char* argv[]) {
}

results = (int*)malloc((input_count) * sizeof(int));
memset(results, 0, sizeof(int) * (input_count));
inputs = (float*)malloc((input_count) * sizeof(float) * 225);

cudaMalloc(&d_results, (input_count) * sizeof(int));
cudaMalloc(&d_inputs, (input_count) * sizeof(matrix*));
cudaMalloc(&d_inputs, (input_count) * sizeof(float) * 225);

dir = opendir(directory_path);
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_REG) {
matrix* input = new_matrix(225, 1);
strcpy(file_num_str, entry->d_name);
file_num_str[strlen(entry->d_name) - 7] = '\0';
file_num = atoi(entry->d_name);
strcpy(file_name, directory_path);
strcat(file_name, "/");
strcat(file_name, entry->d_name);
read_tensor(input, file_name);
matrix* temp = copy_to_device(input);
cudaMemcpy(&d_inputs[file_num - 1], &temp, sizeof(matrix*), cudaMemcpyHostToDevice);
free(input);
read_tensor((float*)&inputs[(file_num - 1) * 225], file_name);
}
}

free(file_name);
free(file_num_str);
closedir(dir);

cudaMemset(d_results, 0, sizeof(int) * input_count);
cudaMemcpy(d_inputs, inputs, sizeof(float) * 225 * input_count, cudaMemcpyHostToDevice);

int deviceCount;
cudaError_t err = cudaGetDeviceCount(&deviceCount);
if (err != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(err));
return -1;
}

for (int i = 0; i < deviceCount; ++i) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("Device %d:\n", i);
printf(" Device Name: %s\n", prop.name);
printf(" Compute Capability: %d.%d\n", prop.major, prop.minor);
printf(" Total Global Memory: %lu bytes\n", prop.totalGlobalMem);
printf(" Shared Memory per Block: %lu bytes\n", prop.sharedMemPerBlock);
printf(" Registers per Block: %d\n", prop.regsPerBlock);
printf(" Warp Size: %d\n", prop.warpSize);
printf(" Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
printf(" Max threads per SM: %d\n", prop.maxThreadsPerMultiProcessor);
printf(" Max Threads Dim: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2]);
printf(" Max Grid Size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
printf(" Clock Rate: %d kHz\n", prop.clockRate);
printf(" Total Constant Memory: %lu bytes\n", prop.totalConstMem);
printf(" Multiprocessor Count: %d\n", prop.multiProcessorCount);
printf(" Memory Clock Rate: %d kHz\n", prop.memoryClockRate);
printf(" Memory Bus Width: %d bits\n", prop.memoryBusWidth);
printf(" L2 Cache Size: %d bytes\n", prop.l2CacheSize);
printf("\n");
}

int minGridSize, blockSize;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, infer, 0, 0);
printf("Recommended block size: %d Grid size: %d\n", blockSize, minGridSize);

int it_num = atoi(argv[3]);
struct timeval stop1, start1;
gettimeofday(&start1, NULL);

int iter_per_in = atoi(argv[3]);
cudaDeviceSynchronize();
for (int i = 0; i < input_count; i++) {
infer<<<108, 69>>>(d_inputs, d_results, d_weights, d_biases, iter_per_in, i);
infer<<<BLOCKS, THREADS_PER_BLOCK>>>(d_inputs, d_results, d_weights, d_biases, it_num, i);
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("CUDA error: %s\n", cudaGetErrorString(err));
}
}

cudaDeviceSynchronize();

cudaMemcpy(results, d_results, (input_count) * (sizeof(int)), cudaMemcpyDeviceToHost);
gettimeofday(&stop1, NULL);
printf("- Inference: %lu us\n", (stop1.tv_sec - start1.tv_sec) * 1000000 + stop1.tv_usec - start1.tv_usec);

FILE* csv_file = fopen("results.csv", "w+");
fprintf(csv_file, "image_number, guess\n");
@@ -251,7 +294,7 @@ int main(int argc, char* argv[]) {

// Time taken
gettimeofday(&stop, NULL);
printf("took %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);
printf("- Total: %lu us\n", (stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec);

return EXIT_SUCCESS;
}
}
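
For readers skimming the main.cu diff: the rewritten infer kernel stages the 225-float input into shared memory once per block, swaps the device-side new_matrix allocations for fixed-size per-thread scratch arrays (out1/out2), and grid-strides over the requested iteration count. A condensed, self-contained sketch of that pattern follows; it uses a single dummy dense layer and made-up names (infer_sketch, OUT_SIZE) in place of the real seven-layer network, so it is illustrative rather than a drop-in replacement.

#include <cstdio>
#include <cuda_runtime.h>

#define INPUT_SIZE 225
#define OUT_SIZE 98
#define BLOCKS 108
#define THREADS_PER_BLOCK 1024

// One dummy dense layer + ReLU + argmax, repeated it_per_input times across the grid.
__global__ void infer_sketch(const float* d_input, const float* d_w, const float* d_b,
                             int* d_result, int it_per_input) {
    __shared__ float sharedInput[INPUT_SIZE];   // staged once per block
    float out[OUT_SIZE];                        // per-thread scratch, no device malloc

    int num_threads = blockDim.x * gridDim.x;
    int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (threadIdx.x < INPUT_SIZE)
        sharedInput[threadIdx.x] = d_input[threadIdx.x];
    __syncthreads();

    for (int i = thread_idx; i < it_per_input; i += num_threads) {
        for (int r = 0; r < OUT_SIZE; r++) {    // out = relu(W * x + b)
            float acc = d_b[r];
            for (int c = 0; c < INPUT_SIZE; c++)
                acc += d_w[r * INPUT_SIZE + c] * sharedInput[c];
            out[r] = acc > 0.0f ? acc : 0.0f;
        }
        int best = 0;                           // argmax, as in the real kernel
        for (int r = 1; r < OUT_SIZE; r++)
            if (out[r] > out[best]) best = r;
        *d_result = best;                       // every iteration writes the same answer
    }
}

int main() {
    float *d_in, *d_w, *d_b;
    int* d_res;
    cudaMalloc(&d_in, INPUT_SIZE * sizeof(float));
    cudaMalloc(&d_w, OUT_SIZE * INPUT_SIZE * sizeof(float));
    cudaMalloc(&d_b, OUT_SIZE * sizeof(float));
    cudaMalloc(&d_res, sizeof(int));
    cudaMemset(d_in, 0, INPUT_SIZE * sizeof(float));
    cudaMemset(d_w, 0, OUT_SIZE * INPUT_SIZE * sizeof(float));
    cudaMemset(d_b, 0, OUT_SIZE * sizeof(float));

    infer_sketch<<<BLOCKS, THREADS_PER_BLOCK>>>(d_in, d_w, d_b, d_res, 100000);
    cudaDeviceSynchronize();

    int res = -1;
    cudaMemcpy(&res, d_res, sizeof(int), cudaMemcpyDeviceToHost);
    printf("argmax = %d\n", res);
    return 0;
}
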
15 changes: 7 additions & 8 deletions src/matrix.cu
@@ -26,7 +26,6 @@ matrix* new_matrix_d(int rows, int cols) {
float* data;
cudaMalloc(&data, rows * cols * sizeof(float));
alloc<<<1, 1>>>(res, data, rows, cols);
cudaDeviceSynchronize();
return res;
}

@@ -37,7 +36,6 @@ matrix* copy_to_device(matrix* h_mat) {
cudaMalloc(&data, h_mat->rows * h_mat->cols * sizeof(float));
cudaMemcpy(data, h_mat->data, h_mat->rows * h_mat->cols * sizeof(float), cudaMemcpyHostToDevice);
alloc<<<1, 1>>>(res, data, h_mat->rows, h_mat->cols);
cudaDeviceSynchronize();
return res;
}

@@ -89,17 +87,18 @@ static __device__ inline float fastexp(float x) {
}

__device__ void softmax(float* a, int rows) {
float res = (float)0;
for (int i = 0; i < rows; i++) {
res += exp(a[i]);
float sum = 0.0;
for (size_t i = 0; i < rows; i++) {
sum += __expf(a[i]);
}
for (int i = 0; i < rows; i++) {
a[i] /= res;
float t = __logf(sum);
for (size_t i = 0; i < rows; i++) {
a[i] = __expf(a[i] - t);
}
}

__device__ int argmax(float* a, int rows) {
int res = a[0];
float res = a[0];
int idx = 0;
for (int i = 0; i < rows; i++) {
if (res < a[i]) {
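
On the precision fix in softmax above: instead of accumulating sum += __expf(a[i]) and then dividing every element by that sum, the new code computes a[i] = __expf(a[i] - __logf(sum)), keeping everything in the fast-intrinsic path. A common, more conservative variant, which this PR does not use, also subtracts the maximum logit before exponentiating so the exponential never overflows for large inputs. A minimal sketch, written as a __host__ __device__ helper so it can be sanity-checked on the CPU (names are illustrative):

#include <cstdio>
#include <math.h>

// Max-subtracted softmax: mathematically identical, but exp() never sees a huge
// argument, so it cannot overflow to inf and the sum stays well-scaled.
__host__ __device__ void softmax_stable(float* a, int n) {
    float m = a[0];
    for (int i = 1; i < n; i++)
        if (a[i] > m) m = a[i];
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        a[i] = expf(a[i] - m);   // on the device this could use __expf as in the PR
        sum += a[i];
    }
    for (int i = 0; i < n; i++)
        a[i] /= sum;             // safe: sum >= 1 because the max logit maps to exp(0) = 1
}

int main() {
    float logits[5] = {1000.0f, 1001.0f, 999.0f, 998.0f, 1002.0f}; // naive expf overflows here
    softmax_stable(logits, 5);
    for (int i = 0; i < 5; i++)
        printf("%f ", logits[i]);
    printf("\n");
    return 0;
}
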