
vectorAdd.cu Example With Streams, Pinned Host Array and Pipelining


Like the non-stream version, this example requires 3 GPUs, but here each GPU's grains run in parallel on their own CUDA streams, and grain launching is pipelined so that queued copies and kernels overlap (hiding latencies).
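The building block behind the pipelining is the standard CUDA streams pattern: page-locked (pinned) host buffers allocated with cudaMallocHost let cudaMemcpyAsync run truly asynchronously, and giving each chunk of work its own stream lets one chunk's kernel overlap another chunk's transfers. Below is a minimal single-GPU sketch of just that pattern (chunk count and sizes are illustrative, not part of LoadBalancerX); the full 3-GPU version follows.

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void vectorAddSketch(const float *A, const float *B, float *C, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) C[i] = A[i] + B[i];
}

int main(void)
{
    const int n = 1 << 20;
    const int chunks = 4;       // illustrative chunk count
    const int chunk = n / chunks;

    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    // pinned host memory is what makes cudaMemcpyAsync truly asynchronous
    cudaMallocHost((void**)&h_A, n * sizeof(float));
    cudaMallocHost((void**)&h_B, n * sizeof(float));
    cudaMallocHost((void**)&h_C, n * sizeof(float));
    cudaMalloc((void**)&d_A, n * sizeof(float));
    cudaMalloc((void**)&d_B, n * sizeof(float));
    cudaMalloc((void**)&d_C, n * sizeof(float));
    for (int i = 0; i < n; i++) { h_A[i] = 1.0f; h_B[i] = 2.0f; }

    cudaStream_t s[chunks];
    for (int c = 0; c < chunks; c++) cudaStreamCreate(&s[c]);

    // copy-in, kernel and copy-out of each chunk are queued on that chunk's
    // stream, so chunk k's kernel can run while chunk k+1 is still copying in
    for (int c = 0; c < chunks; c++)
    {
        const int off = c * chunk;
        cudaMemcpyAsync(d_A + off, h_A + off, chunk * sizeof(float), cudaMemcpyHostToDevice, s[c]);
        cudaMemcpyAsync(d_B + off, h_B + off, chunk * sizeof(float), cudaMemcpyHostToDevice, s[c]);
        vectorAddSketch<<<(chunk + 255) / 256, 256, 0, s[c]>>>(d_A + off, d_B + off, d_C + off, chunk);
        cudaMemcpyAsync(h_C + off, d_C + off, chunk * sizeof(float), cudaMemcpyDeviceToHost, s[c]);
    }
    cudaDeviceSynchronize();
    printf("h_C[0] = %f\n", h_C[0]); // expect 3.0

    for (int c = 0; c < chunks; c++) cudaStreamDestroy(s[c]);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    cudaFreeHost(h_A); cudaFreeHost(h_B); cudaFreeHost(h_C);
    return 0;
}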

/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda")
#include <cuda_runtime.h>

#include <helper_cuda.h>

// for load balancing between 3 different GPUs
#include "LoadBalancerX.h"

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}


#include <iostream>
#include <map>
int
main(void)
{

    int numElements = 15000000;       // 15M elements in total
    int numElementsPerGrain = 500000; // 30 grains of 500k elements each
    size_t size = numElements * sizeof(float);

    float *h_A; cudaMallocHost((void**)&h_A,size);
    float *h_B; cudaMallocHost((void**)&h_B,size);
    float *h_C; cudaMallocHost((void**)&h_C,size);


    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }



    /*
     * default single-GPU tutorial vecAdd logic, kept for reference
     * (d_A, d_B, d_C would need to be allocated with cudaMalloc):

    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);


    int threadsPerBlock = 256;
    int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    cudaGetLastError();


    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
	*/

    /* load-balanced 3-GPU version setup */
    class GrainState
    {
    public:
    	int offset;
    	int range;
        std::map<int,float *> d_A;
        std::map<int,float *> d_B;
        std::map<int,float *> d_C;
        std::map<int,cudaStream_t> stream;
        ~GrainState(){
        	for(auto a:d_A)
        		cudaFree(a.second);
        	for(auto b:d_B)
        	    cudaFree(b.second);
        	for(auto c:d_C)
        	    cudaFree(c.second);
        	for(auto s:stream)
        		cudaStreamDestroy(s.second);
        }
    };

    class DeviceState
    {
    public:
    	int gpuId;
    	int amIgpu;
    };

    LoadBalanceLib::LoadBalancerX<DeviceState,GrainState> lb;
    lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({0,1})); // 1st CUDA GPU in the system
    lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({1,1})); // 2nd CUDA GPU in the system
    lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({2,1})); // 3rd CUDA GPU in the system
    //lb.addDevice(LoadBalanceLib::ComputeDevice<DeviceState>({3,0})); // CPU single core

    // split the job into grains of 500k elements (30 grains for 15M elements)
    for(int i=0;i<numElements;i+=numElementsPerGrain)
    {
    	lb.addWork(LoadBalanceLib::GrainOfWork<DeviceState,GrainState>(
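    			// phase 1 (init): create this grain's stream and device buffers on the assigned GPU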
    			[&,i](DeviceState gpu, GrainState& grain){
    				if(gpu.amIgpu)
    				{
						cudaSetDevice(gpu.gpuId);
						cudaStreamCreate(&grain.stream[gpu.gpuId]);
						cudaMalloc((void **)&grain.d_A[gpu.gpuId], numElementsPerGrain*sizeof(float));
						cudaMalloc((void **)&grain.d_B[gpu.gpuId], numElementsPerGrain*sizeof(float));
						cudaMalloc((void **)&grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float));
    				}
    			},
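    			// phase 2 (input): async host-to-device copy of this grain's slice of A and B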
    			[&,i](DeviceState gpu, GrainState& grain){
    				if(gpu.amIgpu)
    				{
						cudaSetDevice(gpu.gpuId);
						cudaMemcpyAsync(grain.d_A[gpu.gpuId], h_A+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
						cudaMemcpyAsync(grain.d_B[gpu.gpuId], h_B+i, numElementsPerGrain*sizeof(float), cudaMemcpyHostToDevice,grain.stream[gpu.gpuId]);
    				}
    			},
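    			// phase 3 (compute): launch the kernel on the GPU, or add on the host for a CPU device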
    			[&,i](DeviceState gpu, GrainState& grain){
    				if(gpu.amIgpu)
    				{
						int threadsPerBlock = 1000; // below the 1024-threads-per-block limit; divides the grain evenly
						int blocksPerGrid = numElementsPerGrain/threadsPerBlock;
						// the grid spans exactly numElementsPerGrain threads, so the numElements-i
						// bound (>= numElementsPerGrain for every grain) never cuts work off
						vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, grain.stream[gpu.gpuId]>>>(grain.d_A[gpu.gpuId], grain.d_B[gpu.gpuId], grain.d_C[gpu.gpuId], numElements-i);
    				}
    				else
    				{
    					for(int j=0;j<numElementsPerGrain;j++)
    					{
    						const int index = j+i;
    						h_C[index]=h_A[index]+h_B[index];
    					}
    				}
    			},
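    			// phase 4 (output): async device-to-host copy of this grain's slice of C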
    			[&,i](DeviceState gpu, GrainState& grain){
    				if(gpu.amIgpu)
    				{
    				   cudaMemcpyAsync(h_C+i, grain.d_C[gpu.gpuId], numElementsPerGrain*sizeof(float), cudaMemcpyDeviceToHost,grain.stream[gpu.gpuId]);
    				}
    			},
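    			// phase 5 (sync): wait for this grain's stream so its results are complete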
    			[&,i](DeviceState gpu, GrainState& grain){
    				if(gpu.amIgpu)
    				{
    					cudaStreamSynchronize(grain.stream[gpu.gpuId]);
    				}
    			}
    	));
    }

    /* load-balance setup end*/

    // run the benchmark 250 times and report the average
    size_t nanoseconds=0;
    std::vector<double> de(3); // leftover debug vector; prints as "0 0 0" in the output below
    constexpr int repeat = 250;
    for(int i=0;i<repeat;i++)
    {
    	nanoseconds += lb.run(true);

    }

    for(auto v:de)
        std::cout<<v<<" ";
    std::cout<<std::endl;

    // 12 bytes move per element: 4 each for A and B (host-to-device) + 4 for C (device-to-host)
    std::cout<<nanoseconds/(double)repeat<<"ns  ("<<((numElements*12.0/(nanoseconds/(double)repeat)))<<"GB/s)"<<std::endl;


    std::cout<<"??"<<std::endl;

    for (int i = 0; i < numElements; i+=numElementsPerGrain)
    {
    	std::cout<<h_A[i]<<" + "<<h_B[i]<<" = "<<h_C[i]<<std::endl;
    }
    auto z = lb.getRelativePerformancesOfDevices();
    std::cout<<"work distribution to devices:"<<std::endl;
    for(auto zz:z)
    {
    	std::cout<<zz<<"% ";
    }
    std::cout<<std::endl;

    // verify every element within a small floating-point tolerance
    for(int i=0;i<numElements;i++)
    {
    	if(h_A[i]+h_B[i]<h_C[i]-0.001f || h_A[i]+h_B[i]>h_C[i]+0.001f)
    	{
    		std::cout<<"ERROR!"<<std::endl;
    		return 1;
    	}
    }
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);

    return 0;
}
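
To build and run (illustrative paths; helper_cuda.h ships with the CUDA samples' common headers and LoadBalancerX.h comes from this repository):

nvcc -O3 -std=c++14 vectorAdd.cu -o vectorAdd \
    -I/path/to/cuda-samples/Common -I/path/to/LoadBalancerX
./vectorAdd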

output:

0 0 0 
3.18283e+07ns  (5.65535GB/s)
??
0.840188 + 0.394383 = 1.23457
0.359084 + 0.7904 = 1.14948
0.13322 + 0.590877 = 0.724097
0.191662 + 0.487925 = 0.679586
0.0225955 + 0.286374 = 0.30897
0.427484 + 0.386663 = 0.814146
0.257601 + 0.838376 = 1.09598
0.23368 + 0.483369 = 0.717049
0.436181 + 0.883478 = 1.31966
0.24532 + 0.665559 = 0.910879
0.112266 + 0.41295 = 0.525217
0.849061 + 0.718937 = 1.568
0.46875 + 0.167666 = 0.636416
0.135784 + 0.79809 = 0.933874
0.628612 + 0.548775 = 1.17739
0.384567 + 0.222167 = 0.606735
0.32274 + 0.265794 = 0.588534
0.0502425 + 0.36232 = 0.412563
0.955184 + 0.523567 = 1.47875
0.539564 + 0.2765 = 0.816064
0.212554 + 0.0864721 = 0.299027
0.231759 + 0.352527 = 0.584286
0.351006 + 0.802613 = 1.15362
0.569777 + 0.0815185 = 0.651295
0.150883 + 0.402225 = 0.553108
0.754437 + 0.0613455 = 0.815782
0.381379 + 0.643216 = 1.02459
0.580856 + 0.336609 = 0.917465
0.36155 + 0.27649 = 0.638041
0.937536 + 0.227747 = 1.16528
work distribution to devices:
30.1787% 47.7554% 22.0659%
30.1787%  GT 1030 on PCIe 4x v2.0 (dual async copy engines)
47.7554%  K420 on PCIe 8x v2.0
22.0659%  K420 on PCIe 4x v2.0

The motherboard splits its single 16x v2.0 link into two 8x links, but the GT 1030 supports only 4 lanes.

Pipelining lets the copy operations of different grains overlap, surpassing the test system's 4 GB/s one-way PCIe bandwidth and reaching 5.65 GB/s of aggregate throughput.
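As a sanity check of that figure, each element moves 12 bytes in total (4 bytes each for A and B host-to-device, 4 bytes for C device-to-host):

bytes per run = 15,000,000 elements x 12 B = 180 MB
average time  = 3.18283e+07 ns ≈ 31.83 ms
throughput    = 0.18 GB / 0.0318283 s ≈ 5.655 GB/s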
