Merge pull request #5 from sachingodishela/vector
Vector Addition | Optimize GPU memory use; exclude host memory allocation from GPU time.
sachingodishela authored Jul 13, 2024
2 parents 681f226 + 0314b59 commit 4515a2a
Showing 2 changed files with 68 additions and 62 deletions.
2 changes: 1 addition & 1 deletion vector/README.md
@@ -2,7 +2,7 @@
Algorithms for manipulating data stored in arrays / vectors.

## 1. add
Add two vectors and store the result in a third vector. Run this for geometrically increasing vector sizes and compare the times taken when run sequentially on the CPU vs in parallel on the GPU.
Add two vectors and store the result in the second vector. Run this for geometrically increasing vector sizes and compare the times taken when run sequentially on the CPU vs in parallel on the GPU.

File: [add.cu][addCudaFile]
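As a hedged, self-contained illustration of the comparison described above (a sketch, not the repository's add.cu; the kernel and variable names here are made up), an in-place GPU vector add might look like this:

```cuda
// Minimal sketch of in-place vector addition: the result overwrites the 2nd vector.
// Illustration only; this is not the repository's add.cu.
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

__global__ void addInPlace(const float* a, float* b, size_t n) {
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) b[i] = a[i] + b[i];              // b now holds the sum
}

int main() {
    const size_t n = 1 << 20;
    std::vector<float> a(n, 1.0f), b(n, 2.2f);
    float *dA, *dB;
    cudaMalloc(&dA, n * sizeof(float));
    cudaMalloc(&dB, n * sizeof(float));
    cudaMemcpy(dA, a.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, b.data(), n * sizeof(float), cudaMemcpyHostToDevice);
    int threads = 1024;
    int blocks  = (int)((n + threads - 1) / threads);   // ceiling division
    addInPlace<<<blocks, threads>>>(dA, dB, n);
    cudaDeviceSynchronize();
    cudaMemcpy(b.data(), dB, n * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("b[0] = %f\n", b[0]);           // expect 3.2
    cudaFree(dA);
    cudaFree(dB);
    return 0;
}
```

Writing the sum back into the second input needs two device buffers instead of three, which is the memory optimization named in the commit title.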

128 changes: 67 additions & 61 deletions vector/add.cu
@@ -7,21 +7,44 @@
#include<vector>
#include<chrono>

__global__ void addVectors (float* A, float* B, float* C, unsigned long long int* vectorSize) {
__global__ void addVectors (float* A, float* B, unsigned long long int* vectorSize) {
int i = blockIdx.x * 1024 + threadIdx.x;
// if (threadIdx.x == 0) {
// printf("blockIdx.x: %d, threadIdx.x: %d, i: %d\n", blockIdx.x, threadIdx.x, i);
// }
if (i > *vectorSize) {
// Do nothing
return;
}
C[i] = A[i] + B[i];
B[i] = A[i] + B[i];
}
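A common variant of this bounds guard, shown here as a sketch rather than as the commit's kernel, compares with `>=` (valid indices run from 0 to vectorSize - 1) and derives the index from `blockDim.x` instead of the hard-coded 1024:

```cuda
// Sketch of a conventional bounds guard; not the commit's kernel.
__global__ void addVectorsGuarded(const float* A, float* B, unsigned long long int n) {
    unsigned long long int i = (unsigned long long int)blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;   // only indices 0 .. n-1 are valid
    B[i] = A[i] + B[i];   // result stored in the second vector, as in the commit
}
```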

std::vector<float*> createDevicePointers (std::vector<float> A, std::vector<float> B, std::vector<float> C) {
std::vector<float*> createHostPointers (unsigned long long int &vectorSize) {
float* A;
float* B;
float* C;
A = (float*)malloc(vectorSize * sizeof(float));
B = (float*)malloc(vectorSize * sizeof(float));
for (unsigned long long int i = 0; i < vectorSize; i++) {
A[i] = 1.0;
B[i] = 2.2;
}
return std::vector<float*>{A, B};
}

void freeHostPointers (std::vector<float*> hostPointers) {
for (int i = 0; i < hostPointers.size(); i++) {
free(hostPointers[i]);
}
}
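These helpers let the benchmark allocate and free the host buffers outside the timed region, which is how the commit excludes host memory allocation from the reported GPU time. A minimal sketch of that bracketing (only the clock placement matters here; the timed body is elided):

```cuda
// Sketch: host allocation sits outside the timed window.
#include <chrono>
#include <cstdio>
#include <cstdlib>

int main() {
    unsigned long long int n = 1ULL << 20;
    float* A = (float*)std::malloc(n * sizeof(float));   // host buffers: not timed
    float* B = (float*)std::malloc(n * sizeof(float));
    auto tic = std::chrono::steady_clock::now();         // GPU timing starts here
    // ... device allocation, host-to-device copies, and the kernel launch go here ...
    auto toc = std::chrono::steady_clock::now();         // and ends here
    std::printf("timed region: %lld ms\n",
                (long long)std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count());
    std::free(A);
    std::free(B);
    return 0;
}
```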

std::vector<float*> createDevicePointers (std::vector<float*> &hostPointers, unsigned long long int &vectorSize) {
cudaError_t err;
size_t memorySize = A.size() * sizeof(float);
size_t memorySize = vectorSize * sizeof(float);
float* A = hostPointers[0];
float* B = hostPointers[1];
float* d_A;
float* d_B;
float* d_C;
err = cudaMalloc(&d_A, memorySize);
if (err != cudaSuccess) {
std::cerr << "Error in allocating memory for A: " << cudaGetErrorString(err) << std::endl;
@@ -32,27 +55,17 @@ std::vector<float*> createDevicePointers (std::vector<float> A, std::vector<floa
std::cerr << "Error in allocating memory for B: " << cudaGetErrorString(err) << std::endl;
return std::vector<float*>{};
}
err = cudaMalloc(&d_C, memorySize);
if (err != cudaSuccess) {
std::cerr << "Error in allocating memory for C: " << cudaGetErrorString(err) << std::endl;
return std::vector<float*>{};
}
err = cudaMemcpy(d_A, A.data(), memorySize, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_A, A, memorySize, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Error in copying A: " << cudaGetErrorString(err) << std::endl;
return std::vector<float*>{};
}
err = cudaMemcpy(d_B, B.data(), memorySize, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, B, memorySize, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Error in copying B: " << cudaGetErrorString(err) << std::endl;
return std::vector<float*>{};
}
err = cudaMemcpy(d_C, C.data(), memorySize, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cerr << "Error in copying C: " << cudaGetErrorString(err) << std::endl;
return std::vector<float*>{};
}
return std::vector<float*>{d_A, d_B, d_C};
return std::vector<float*>{d_A, d_B};
}
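The repeated `if (err != cudaSuccess)` blocks above follow a pattern that is often wrapped in a small macro. A hedged sketch of such a helper (not part of this commit) is:

```cuda
// Sketch of an error-check helper; not part of this commit.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            std::fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,       \
                         cudaGetErrorString(err_));                       \
            std::exit(EXIT_FAILURE);                                      \
        }                                                                 \
    } while (0)

// Example use: CUDA_CHECK(cudaMalloc(&d_A, memorySize));
```

With a helper like this, each `cudaMalloc` and `cudaMemcpy` stays on one line while still reporting the failing call's location.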

void freeDevicePointers (std::vector<float*> devicePointers) {
@@ -62,60 +75,53 @@ void freeDevicePointers (std::vector<float*> devicePointers) {
}

int main() {
unsigned long long int vectorSize = 1;
unsigned long long int vectorSize = 5;
int iteration = 0;
int MAX_THREADS_PER_BLOCK = 1024;
int MAX_BLOCKS = 32;
while (0 < vectorSize && iteration < 29) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int MAX_THREADS_PER_BLOCK = prop.maxThreadsPerBlock;
while (0 < vectorSize && vectorSize < 1e9) {
std::cout << "vectorSize: " << vectorSize << ", ";
size_t memorySize = vectorSize * sizeof(float);
std::vector<float> A(vectorSize);
std::vector<float> B(vectorSize);
std::vector<float> C(vectorSize, 0);
for (int i = 0; i < vectorSize; i++) {
A[i] = i+1;
B[i] = i+1;
}
std::cout << memorySize << " bytes, ";
std::vector<float*> hostPointers = createHostPointers(vectorSize);
auto tic = std::chrono::steady_clock::now();
std::chrono::time_point<std::chrono::steady_clock> toc;
int numBlocks = (vectorSize / MAX_THREADS_PER_BLOCK) + (vectorSize % MAX_THREADS_PER_BLOCK ? 1 : 0);
std::cout << "iteration: " << ++iteration << ", ";
if (numBlocks < MAX_BLOCKS) {
// std::cout << "numBlocks: " << numBlocks << std::endl;
int numThreadsPerBlock = numBlocks > 1 ? MAX_THREADS_PER_BLOCK : vectorSize;
// std::cout << "numThreadsPerBlock: " << numThreadsPerBlock << std::endl;
dim3 threadsPerBlock(numThreadsPerBlock);
// create device pointers
std::vector<float*> devicePointers = createDevicePointers(A, B, C);
if (devicePointers.size() == 0) {
std::cerr << "Error: unable to create device pointers" << std::endl;
return EXIT_FAILURE;
}
unsigned long long int* d_vectorSize;
cudaMalloc(&d_vectorSize, sizeof(unsigned long long int));
cudaMemcpy(d_vectorSize, &vectorSize, sizeof(unsigned long long int), cudaMemcpyHostToDevice);
addVectors<<<numBlocks, threadsPerBlock>>>(devicePointers[0], devicePointers[1], devicePointers[2], d_vectorSize);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "Error after calling the kernel: " << cudaGetErrorString(err);
return EXIT_FAILURE;
}
cudaDeviceSynchronize();
cudaMemcpy(C.data(), devicePointers[2], memorySize, cudaMemcpyDeviceToHost);
auto toc = std::chrono::steady_clock::now();
freeDevicePointers(devicePointers);
std::cout << "num threads: " << numThreadsPerBlock << ", numBlocks: " << numBlocks << ", time taken using GPU: " << std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count() << " ms, ";
} else {
// skip GPU execution
// std::cout << "numBlocks: " << numBlocks << std::endl;
int numThreadsPerBlock = numBlocks > 1 ? MAX_THREADS_PER_BLOCK : vectorSize;
// std::cout << "numThreadsPerBlock: " << numThreadsPerBlock << std::endl;
dim3 threadsPerBlock(numThreadsPerBlock);
// create device pointers
std::vector<float*> devicePointers = createDevicePointers(hostPointers, vectorSize);
if (devicePointers.size() == 0) {
return EXIT_SUCCESS;
}

unsigned long long int* d_vectorSize;
cudaMalloc(&d_vectorSize, sizeof(unsigned long long int));
cudaMemcpy(d_vectorSize, &vectorSize, sizeof(unsigned long long int), cudaMemcpyHostToDevice);
addVectors<<<numBlocks, threadsPerBlock>>>(devicePointers[0], devicePointers[1], d_vectorSize);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "Error after calling the kernel: " << cudaGetErrorString(err);
return EXIT_FAILURE;
}
cudaDeviceSynchronize();
cudaMemcpy(hostPointers[1], devicePointers[1], memorySize, cudaMemcpyDeviceToHost);
toc = std::chrono::steady_clock::now();
freeDevicePointers(devicePointers);
auto gpuTime = std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count();
// CPU
tic = std::chrono::steady_clock::now();
auto tic2 = std::chrono::steady_clock::now();
for (unsigned long long int i = 0; i < vectorSize; i++) {
C[i] = A[i] + B[i];
hostPointers[1][i] = hostPointers[0][i] + hostPointers[1][i];
}
freeHostPointers(hostPointers);
auto toc2 = std::chrono::steady_clock::now();
std::cout << "time taken using CPU: " << std::chrono::duration_cast<std::chrono::milliseconds>(toc2 - tic).count() << " ms" << std::endl;
vectorSize = vectorSize * 2;
std::cout << "GPU/CPU time ratio: " << (float)gpuTime / std::chrono::duration_cast<std::chrono::milliseconds>(toc2 - tic2).count() << std::endl;

vectorSize = vectorSize * 1.2;
}
return 0;
}
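To isolate the launch-configuration logic that the updated `main()` uses, here is a small, hedged sketch (standalone; it only derives the grid dimensions and does not launch a kernel):

```cuda
// Sketch: deriving the launch configuration the way the updated main() does.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);            // query device 0
    unsigned long long int vectorSize = 5;
    int maxThreads = prop.maxThreadsPerBlock;     // 1024 on most current GPUs
    // Ceiling division: enough blocks to cover every element.
    int numBlocks = (int)(vectorSize / maxThreads) + (vectorSize % maxThreads ? 1 : 0);
    int threadsPerBlock = numBlocks > 1 ? maxThreads : (int)vectorSize;
    std::printf("blocks=%d, threads/block=%d\n", numBlocks, threadsPerBlock);
    return 0;
}
```

This guarantees numBlocks * threadsPerBlock >= vectorSize, so every element index is covered by some thread.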
