Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu

/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <cstdio>
#include <vector>

#include <helper_cuda.h>
#include <helper_timer.h>

using namespace std;

const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test";

typedef enum {
  P2P_WRITE = 0,
  P2P_READ = 1,
} P2PDataTransfer;

typedef enum {
  CE = 0,
  SM = 1,
} P2PEngine;

P2PEngine p2p_mechanism = CE;  // By default use Copy Engine

// Macro for checking cuda errors following a cuda launch or api call
#define cudaCheckError()                                       \
  {                                                            \
    cudaError_t e = cudaGetLastError();                        \
    if (e != cudaSuccess) {                                    \
      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
             cudaGetErrorString(e));                           \
      exit(EXIT_FAILURE);                                      \
    }                                                          \
  }
__global__ void delay(volatile int *flag,
                      unsigned long long timeout_clocks = 10000000) {
  // Wait until the application notifies us that it has completed queuing up the
  // experiment, or timeout and exit, allowing the application to make progress
  long long int start_clock, sample_clock;
  start_clock = clock64();

  while (!*flag) {
    sample_clock = clock64();

    if (sample_clock - start_clock > timeout_clocks) {
      break;
    }
  }
}

// This kernel is for demonstration purposes only, not a performant kernel for
// p2p transfers.
__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src,
                        size_t num_elems) {
  size_t globalId = blockIdx.x * blockDim.x + threadIdx.x;
  size_t gridSize = blockDim.x * gridDim.x;

#pragma unroll(5)
  for (size_t i = globalId; i < num_elems; i += gridSize) {
    dest[i] = src[i];
  }
}

///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void) {
  printf("Usage:  p2pBandwidthLatencyTest [OPTION]...\n");
  printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
  printf("\n");

  printf("Options:\n");
  printf("--help\t\tDisplay this help menu\n");
  printf(
      "--p2p_read\tUse P2P reads for data transfers between GPU pairs and show "
      "corresponding results.\n \t\tDefault used is P2P write operation.\n");
  printf("--sm_copy\t\tUse SM intiated p2p transfers instead of Copy Engine\n");
}

void checkP2Paccess(int numGPUs) {
  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);
    cudaCheckError();

    for (int j = 0; j < numGPUs; j++) {
      int access;
      if (i != j) {
        cudaDeviceCanAccessPeer(&access, i, j);
        cudaCheckError();
        printf("Device=%d %s Access Peer Device=%d\n", i,
               access ? "CAN" : "CANNOT", j);
      }
    }
  }
  printf(
      "\n***NOTE: In case a device doesn't have P2P access to other one, it "
      "falls back to normal memcopy procedure.\nSo you can see lesser "
      "Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n");
}

void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice,
                    int num_elems, int repeat, bool p2paccess,
                    cudaStream_t streamToRun) {
  int blockSize = 0;
  int numBlocks = 0;

  cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);
  cudaCheckError();

  if (p2p_mechanism == SM && p2paccess) {
    for (int r = 0; r < repeat; r++) {
      copyp2p<<<numBlocks, blockSize, 0, streamToRun>>>(
          (int4 *)dest, (int4 *)src, num_elems / 4);
    }
  } else {
    for (int r = 0; r < repeat; r++) {
      cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice,
                          sizeof(int) * num_elems, streamToRun);
    }
  }
}

void outputBandwidthMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
  int numElems = 10000000;
  int repeat = 5;
  volatile int *flag = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);
  vector<cudaStream_t> stream(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
    cudaMalloc(&buffers[d], numElems * sizeof(int));
    cudaCheckError();
    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
  }

  vector<double> bandwidthMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaCheckError();
          cudaDeviceEnablePeerAccess(i, 0);
          cudaCheckError();
          cudaSetDevice(i);
          cudaCheckError();
        }
      }

      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repeatitions
      // relatively low.  Higher repeatitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      delay<<<1, 1, 0, stream[i]>>>(flag);
      cudaCheckError();
      cudaEventRecord(start[i], stream[i]);
      cudaCheckError();

      if (i == j) {
        // Perform intra-GPU, D2D copies
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream[i]);

      } else {
        if (p2p_method == P2P_WRITE) {
          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                         stream[i]);
        } else {
          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                         stream[i]);
        }
      }

      cudaEventRecord(stop[i], stream[i]);
      cudaCheckError();

      // Release the queued events
      *flag = 1;
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      float time_ms;
      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
      double time_s = time_ms / 1e3;

      double gb = numElems * sizeof(int) * repeat / (double)1e9;
      if (i == j) {
        gb *= 2;  // must count both the read and the write here
      }
      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
      if (p2p && access) {
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaCheckError();
      }
    }
  }

  printf("   D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream[d]);
    cudaCheckError();
  }

  cudaFreeHost((void *)flag);
  cudaCheckError();
}

void outputBidirectionalBandwidthMatrix(int numGPUs, bool p2p) {
  int numElems = 10000000;
  int repeat = 5;
  volatile int *flag = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);
  vector<cudaStream_t> stream0(numGPUs);
  vector<cudaStream_t> stream1(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaMalloc(&buffers[d], numElems * sizeof(int));
    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
    cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking);
    cudaCheckError();
    cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking);
    cudaCheckError();
  }

  vector<double> bandwidthMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaSetDevice(i);
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaDeviceEnablePeerAccess(i, 0);
          cudaCheckError();
        }
      }

      cudaSetDevice(i);
      cudaStreamSynchronize(stream0[i]);
      cudaStreamSynchronize(stream1[j]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repeatitions
      // relatively low.  Higher repeatitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      cudaSetDevice(i);
      // No need to block stream1 since it'll be blocked on stream0's event
      delay<<<1, 1, 0, stream0[i]>>>(flag);
      cudaCheckError();

      // Force stream1 not to start until stream0 does, in order to ensure
      // the events on stream0 fully encompass the time needed for all
      // operations
      cudaEventRecord(start[i], stream0[i]);
      cudaStreamWaitEvent(stream1[j], start[i], 0);

      if (i == j) {
        // For intra-GPU perform 2 memcopies buffersD2D <-> buffers
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream0[i]);
        performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat,
                       access, stream1[i]);
      } else {
        if (access && p2p_mechanism == SM) {
          cudaSetDevice(j);
        }
        performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                       stream1[j]);
        if (access && p2p_mechanism == SM) {
          cudaSetDevice(i);
        }
        performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                       stream0[i]);
      }

      // Notify stream0 that stream1 is complete and record the time of
      // the total transaction
      cudaEventRecord(stop[j], stream1[j]);
      cudaStreamWaitEvent(stream0[i], stop[j], 0);
      cudaEventRecord(stop[i], stream0[i]);

      // Release the queued operations
      *flag = 1;
      cudaStreamSynchronize(stream0[i]);
      cudaStreamSynchronize(stream1[j]);
      cudaCheckError();

      float time_ms;
      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
      double time_s = time_ms / 1e3;

      double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9;
      if (i == j) {
        gb *= 2;  // must count both the read and the write here
      }
      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
      if (p2p && access) {
        cudaSetDevice(i);
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
      }
    }
  }

  printf("   D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream0[d]);
    cudaCheckError();
    cudaStreamDestroy(stream1[d]);
    cudaCheckError();
  }

  cudaFreeHost((void *)flag);
  cudaCheckError();
}

void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
  int repeat = 100;
  int numElems = 4;  // perform 1-int4 transfer.
  volatile int *flag = NULL;
  StopWatchInterface *stopWatch = NULL;
  vector<int *> buffers(numGPUs);
  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
  vector<cudaStream_t> stream(numGPUs);
  vector<cudaEvent_t> start(numGPUs);
  vector<cudaEvent_t> stop(numGPUs);

  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
  cudaCheckError();

  if (!sdkCreateTimer(&stopWatch)) {
    printf("Failed to create stop watch\n");
    exit(EXIT_FAILURE);
  }
  sdkStartTimer(&stopWatch);

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
    cudaMalloc(&buffers[d], sizeof(int) * numElems);
    cudaMalloc(&buffersD2D[d], sizeof(int) * numElems);
    cudaCheckError();
    cudaEventCreate(&start[d]);
    cudaCheckError();
    cudaEventCreate(&stop[d]);
    cudaCheckError();
  }

  vector<double> gpuLatencyMatrix(numGPUs * numGPUs);
  vector<double> cpuLatencyMatrix(numGPUs * numGPUs);

  for (int i = 0; i < numGPUs; i++) {
    cudaSetDevice(i);

    for (int j = 0; j < numGPUs; j++) {
      int access = 0;
      if (p2p) {
        cudaDeviceCanAccessPeer(&access, i, j);
        if (access) {
          cudaDeviceEnablePeerAccess(j, 0);
          cudaCheckError();
          cudaSetDevice(j);
          cudaDeviceEnablePeerAccess(i, 0);
          cudaSetDevice(i);
          cudaCheckError();
        }
      }
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      // Block the stream until all the work is queued up
      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
      // room to push the operation, so keep the number of repeatitions
      // relatively low.  Higher repeatitions will cause the delay kernel
      // to timeout and lead to unstable results.
      *flag = 0;
      delay<<<1, 1, 0, stream[i]>>>(flag);
      cudaCheckError();
      cudaEventRecord(start[i], stream[i]);

      sdkResetTimer(&stopWatch);
      if (i == j) {
        // Perform intra-GPU, D2D copies
        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
                       access, stream[i]);
      } else {
        if (p2p_method == P2P_WRITE) {
          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
                         stream[i]);
        } else {
          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
                         stream[i]);
        }
      }
      float cpu_time_ms = sdkGetTimerValue(&stopWatch);

      cudaEventRecord(stop[i], stream[i]);
      // Now that the work has been queued up, release the stream
      *flag = 1;
      cudaStreamSynchronize(stream[i]);
      cudaCheckError();

      float gpu_time_ms;
      cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]);

      gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat;
      cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat;
      if (p2p && access) {
        cudaDeviceDisablePeerAccess(j);
        cudaSetDevice(j);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaCheckError();
      }
    }
  }

  printf("   GPU");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  printf("\n   CPU");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d ", j);
  }

  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d ", i);

    for (int j = 0; j < numGPUs; j++) {
      printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]);
    }

    printf("\n");
  }

  for (int d = 0; d < numGPUs; d++) {
    cudaSetDevice(d);
    cudaFree(buffers[d]);
    cudaFree(buffersD2D[d]);
    cudaCheckError();
    cudaEventDestroy(start[d]);
    cudaCheckError();
    cudaEventDestroy(stop[d]);
    cudaCheckError();
    cudaStreamDestroy(stream[d]);
    cudaCheckError();
  }

  sdkDeleteTimer(&stopWatch);

  cudaFreeHost((void *)flag);
  cudaCheckError();
}

int main(int argc, char **argv) {
  int numGPUs;
  P2PDataTransfer p2p_method = P2P_WRITE;

  cudaGetDeviceCount(&numGPUs);
  cudaCheckError();

  // process command line args
  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
    printHelp();
    return 0;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) {
    p2p_method = P2P_READ;
  }

  if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) {
    p2p_mechanism = SM;
  }

  printf("[%s]\n", sSampleName);

  // output devices
  for (int i = 0; i < numGPUs; i++) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    cudaCheckError();
    printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i,
           prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);
  }

  checkP2Paccess(numGPUs);

  // Check peer-to-peer connectivity
  printf("P2P Connectivity Matrix\n");
  printf("     D\\D");

  for (int j = 0; j < numGPUs; j++) {
    printf("%6d", j);
  }
  printf("\n");

  for (int i = 0; i < numGPUs; i++) {
    printf("%6d\t", i);
    for (int j = 0; j < numGPUs; j++) {
      if (i != j) {
        int access;
        cudaDeviceCanAccessPeer(&access, i, j);
        cudaCheckError();
        printf("%6d", (access) ? 1 : 0);
      } else {
        printf("%6d", 1);
      }
    }
    printf("\n");
  }

  printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
  outputBandwidthMatrix(numGPUs, false, P2P_WRITE);
  printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n");
  outputBandwidthMatrix(numGPUs, true, P2P_WRITE);
  if (p2p_method == P2P_READ) {
    printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n");
    outputBandwidthMatrix(numGPUs, true, p2p_method);
  }
  printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
  outputBidirectionalBandwidthMatrix(numGPUs, false);
  printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n");
  outputBidirectionalBandwidthMatrix(numGPUs, true);

  printf("P2P=Disabled Latency Matrix (us)\n");
  outputLatencyMatrix(numGPUs, false, P2P_WRITE);
  printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n");
  outputLatencyMatrix(numGPUs, true, P2P_WRITE);
  if (p2p_method == P2P_READ) {
    printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n");
    outputLatencyMatrix(numGPUs, true, p2p_method);
  }

  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");

  exit(EXIT_SUCCESS);
}