Merge pull request #4 from sachingodishela/vector
Vector
sachingodishela authored Jul 13, 2024
2 parents 38b20c6 + 2616794 commit 681f226
Showing 6 changed files with 138 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .devcontainer/devcontainer.json
@@ -12,7 +12,8 @@
"extensions": [
"ms-vscode.cpptools",
"GitHub.copilot",
"GitHub.copilot-chat"
"GitHub.copilot-chat",
"nvidia.nsight-vscode-edition"
]
}
}
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -13,4 +13,4 @@ jobs:
   steps:
     - uses: actions/checkout@v4
     - name: NVIDIA Config
-      run: nvidia-smi && nvcc hello-cuda/main.cu -o hello-cuda/main.elf && hello-cuda/main.elf
+      run: nvidia-smi && nvcc hello-cuda/main.cu -o hello-cuda/main.elf && hello-cuda/main.elf && nvcc vector/add.cu -arch compute_50 -o vector/add.elf && vector/add.elf
2 changes: 2 additions & 0 deletions .vscode/tasks.json
@@ -12,6 +12,8 @@
"${file}",
"-o",
"${fileDirname}/${fileBasenameNoExtension}.elf",
"-arch",
"compute_50",
"-g",
"-G"
],
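Both the CI job and the VS Code build task now pass -arch compute_50 to nvcc, which targets compute capability 5.0 (Maxwell) and newer. A minimal, hedged sketch (not part of this commit) of a runtime guard that mirrors that compile-time target:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    // Assumption: device 0 is the GPU the binary will run on.
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return 1;
    if (prop.major * 10 + prop.minor < 50) {
        std::printf("Device CC %d.%d is below the compiled target 5.0\n",
                    prop.major, prop.minor);
        return 1;
    }
    return 0;
}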
4 changes: 3 additions & 1 deletion hello-cuda/main.cu
@@ -14,14 +14,16 @@ int main()
     cudaGetDeviceProperties(&prop, i);
     std::cout << "Device Number: " << i << std::endl;
     std::cout << "Device Name: " << prop.name << std::endl;
+    std::cout << "Compute Capability: " << prop.major << "." << prop.minor << std::endl;
     std::cout << "Global memory: " << prop.totalGlobalMem / (1024 * 1024) << " MB" << std::endl; // Print global memory in MB
     std::cout << "Shared memory per MP: " << prop.sharedMemPerMultiprocessor / 1024 << " KB" << std::endl; // bytes / 1024 yields KB, not MB
     std::cout << "Regs per multi processor: " << prop.regsPerMultiprocessor << std::endl;
     std::cout << "Max threads per block: " << prop.maxThreadsPerBlock << std::endl;
     std::cout << "Max threads per multiprocessor: " << prop.maxThreadsPerMultiProcessor << std::endl;
     std::cout << "Max grid dimensions: (" << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << ")" << std::endl;
     std::cout << "Max shared memory per block: " << prop.sharedMemPerBlock / 1024 << " KB" << std::endl;
-    std::cout << "Max threads per block: " << prop.maxThreadsPerBlock << std::endl;
+    std::cout << "Max blocks per multiprocessor: " << prop.maxBlocksPerMultiProcessor << std::endl;
+    std::cout << "Multiprocessor count: " << prop.multiProcessorCount << std::endl;
   }
   return EXIT_SUCCESS;
 }
9 changes: 9 additions & 0 deletions vector/README.md
@@ -0,0 +1,9 @@
# vectors
Algorithms for manipulating data stored in arrays / vectors.

## 1. add
Add two vectors and store the result in a third vector. Run this for geometrically increasing vector sizes and compare the time taken running sequentially on the CPU against running in parallel on the GPU.

File: [add.cu][addCudaFile]

[addCudaFile]: ./add.cu
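add.cu times both paths with std::chrono::steady_clock. A minimal sketch of the CPU-side measurement pattern (the size here is illustrative; add.cu doubles the size each iteration):

#include <chrono>
#include <iostream>
#include <vector>

int main() {
    const size_t n = 1 << 20; // illustrative size
    std::vector<float> A(n, 1.0f), B(n, 2.0f), C(n, 0.0f);
    auto tic = std::chrono::steady_clock::now();
    for (size_t i = 0; i < n; i++) C[i] = A[i] + B[i];
    auto toc = std::chrono::steady_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count() << " ms" << std::endl;
    return 0;
}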
121 changes: 121 additions & 0 deletions vector/add.cu
@@ -0,0 +1,121 @@
#include <cuda_runtime.h>
#include <iostream>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <stdlib.h>
#include <vector>
#include <chrono>

__global__ void addVectors(float* A, float* B, float* C, unsigned long long int* vectorSize) {
    // Global index: use blockDim.x instead of a hard-coded 1024 so the kernel
    // is correct for any launch configuration.
    unsigned long long int i = blockIdx.x * (unsigned long long int)blockDim.x + threadIdx.x;
    if (i >= *vectorSize) {
        // Out of range (>=, not >: index vectorSize itself would be out of bounds).
        return;
    }
    C[i] = A[i] + B[i];
}
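
// A hedged alternative sketch (not in the original commit): CUDA kernel
// parameters are passed by value, so the size can be a plain argument,
// which would make the d_vectorSize cudaMalloc/cudaMemcpy in main unnecessary.
__global__ void addVectorsByValue(const float* A, const float* B, float* C, unsigned long long int n) {
    unsigned long long int i = blockIdx.x * (unsigned long long int)blockDim.x + threadIdx.x;
    if (i < n) {
        C[i] = A[i] + B[i];
    }
}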

// Allocates device buffers for A, B and C and copies the host data over.
// Vectors are taken by const reference to avoid copying the host data.
std::vector<float*> createDevicePointers(const std::vector<float>& A, const std::vector<float>& B, const std::vector<float>& C) {
    cudaError_t err;
    size_t memorySize = A.size() * sizeof(float);
    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    // Report the failure and free whatever was allocated so far; the
    // original early returns leaked any buffers already allocated.
    auto fail = [&](const char* what) {
        std::cerr << "Error in " << what << ": " << cudaGetErrorString(err) << std::endl;
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        return std::vector<float*>{};
    };
    if ((err = cudaMalloc(&d_A, memorySize)) != cudaSuccess) return fail("allocating memory for A");
    if ((err = cudaMalloc(&d_B, memorySize)) != cudaSuccess) return fail("allocating memory for B");
    if ((err = cudaMalloc(&d_C, memorySize)) != cudaSuccess) return fail("allocating memory for C");
    if ((err = cudaMemcpy(d_A, A.data(), memorySize, cudaMemcpyHostToDevice)) != cudaSuccess) return fail("copying A");
    if ((err = cudaMemcpy(d_B, B.data(), memorySize, cudaMemcpyHostToDevice)) != cudaSuccess) return fail("copying B");
    // C only receives results, but the original copies it too; kept as-is.
    if ((err = cudaMemcpy(d_C, C.data(), memorySize, cudaMemcpyHostToDevice)) != cudaSuccess) return fail("copying C");
    return std::vector<float*>{d_A, d_B, d_C};
}

void freeDevicePointers(const std::vector<float*>& devicePointers) {
    for (size_t i = 0; i < devicePointers.size(); i++) {
        cudaFree(devicePointers[i]);
    }
}
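
// A hedged design alternative (not in the original): a small RAII wrapper
// frees device memory automatically, so the early returns in
// createDevicePointers could never leak and freeDevicePointers would be unneeded.
struct DeviceBuffer {
    float* ptr = nullptr;
    explicit DeviceBuffer(size_t bytes) { cudaMalloc(&ptr, bytes); }
    ~DeviceBuffer() { cudaFree(ptr); } // cudaFree(nullptr) is a no-op
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
};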

int main() {
    unsigned long long int vectorSize = 1;
    int iteration = 0;
    const int MAX_THREADS_PER_BLOCK = 1024;
    const int MAX_BLOCKS = 32;
    while (0 < vectorSize && iteration < 29) {
        std::cout << "vectorSize: " << vectorSize << ", ";
        size_t memorySize = vectorSize * sizeof(float);
        std::vector<float> A(vectorSize);
        std::vector<float> B(vectorSize);
        std::vector<float> C(vectorSize, 0);
        for (unsigned long long int i = 0; i < vectorSize; i++) {
            A[i] = i + 1;
            B[i] = i + 1;
        }
        auto tic = std::chrono::steady_clock::now();
        // Ceiling division: one extra block when vectorSize is not a multiple of 1024.
        int numBlocks = (vectorSize / MAX_THREADS_PER_BLOCK) + (vectorSize % MAX_THREADS_PER_BLOCK ? 1 : 0);
        std::cout << "iteration: " << ++iteration << ", ";
        if (numBlocks < MAX_BLOCKS) {
            int numThreadsPerBlock = numBlocks > 1 ? MAX_THREADS_PER_BLOCK : vectorSize;
            dim3 threadsPerBlock(numThreadsPerBlock);
            // Create device pointers and copy the inputs over.
            std::vector<float*> devicePointers = createDevicePointers(A, B, C);
            if (devicePointers.size() == 0) {
                std::cerr << "Error: unable to create device pointers" << std::endl;
                return EXIT_FAILURE;
            }
            unsigned long long int* d_vectorSize;
            cudaMalloc(&d_vectorSize, sizeof(unsigned long long int));
            cudaMemcpy(d_vectorSize, &vectorSize, sizeof(unsigned long long int), cudaMemcpyHostToDevice);
            addVectors<<<numBlocks, threadsPerBlock>>>(devicePointers[0], devicePointers[1], devicePointers[2], d_vectorSize);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                std::cerr << "Error after calling the kernel: " << cudaGetErrorString(err) << std::endl;
                return EXIT_FAILURE;
            }
            cudaDeviceSynchronize();
            cudaMemcpy(C.data(), devicePointers[2], memorySize, cudaMemcpyDeviceToHost);
            auto toc = std::chrono::steady_clock::now();
            cudaFree(d_vectorSize); // free the size buffer; the original leaked it each iteration
            freeDevicePointers(devicePointers);
            std::cout << "num threads: " << numThreadsPerBlock << ", numBlocks: " << numBlocks << ", time taken using GPU: " << std::chrono::duration_cast<std::chrono::milliseconds>(toc - tic).count() << " ms, ";
        } else {
            // Needs MAX_BLOCKS or more blocks: skip GPU execution for this size.
        }

        // CPU
        tic = std::chrono::steady_clock::now();
        for (unsigned long long int i = 0; i < vectorSize; i++) {
            C[i] = A[i] + B[i];
        }
        auto toc2 = std::chrono::steady_clock::now();
        std::cout << "time taken using CPU: " << std::chrono::duration_cast<std::chrono::milliseconds>(toc2 - tic).count() << " ms" << std::endl;
        vectorSize = vectorSize * 2;
    }
    return 0;
}
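
Once numBlocks reaches MAX_BLOCKS the program skips GPU execution entirely, so no GPU timings are produced for large vectors. A hedged sketch of a grid-stride loop, a common CUDA pattern not used in this commit, that would let a fixed grid of MAX_BLOCKS blocks cover any vector size (using the by-value size from the earlier sketch):

__global__ void addVectorsStrided(const float* A, const float* B, float* C, unsigned long long int n) {
    // Each thread handles i, i + stride, i + 2*stride, ..., so a fixed-size
    // grid covers a vector of any length.
    unsigned long long int stride = (unsigned long long int)gridDim.x * blockDim.x;
    for (unsigned long long int i = blockIdx.x * (unsigned long long int)blockDim.x + threadIdx.x;
         i < n; i += stride) {
        C[i] = A[i] + B[i];
    }
}

It would be launched as addVectorsStrided<<<MAX_BLOCKS, MAX_THREADS_PER_BLOCK>>>(d_A, d_B, d_C, vectorSize).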
