Vector addition using SYCL for CUDA #1

Merged: 10 commits, Feb 25, 2020
8 changes: 8 additions & 0 deletions README
@@ -0,0 +1,8 @@
SYCL for CUDA examples
==========================

This repository contains examples that demonstrate how to use the CUDA backend
in SYCL.

The examples are built and tested on Linux with GCC 7.4, NVCC 10.1 and the
experimental support for CUDA in the DPC++ SYCL implementation.
30 changes: 30 additions & 0 deletions example-01/CMakeLists.txt
@@ -0,0 +1,30 @@
cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
project(cmake_and_cuda LANGUAGES CXX CUDA)

include(CTest)

# SYCL installation
if (NOT SYCL_ROOT)
message(FATAL_ERROR "No SYCL installation detected")
endif(NOT SYCL_ROOT)

set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
set(SYCL_FLAGS "-fsycl"
"-fsycl-targets=nvptx64-nvidia-cuda-sycldevice,spir64-unknown-linux-sycldevice"
"-fsycl-unnamed-lambda")

# Build the CUDA code
add_executable(vector_addition vector_addition.cu)
target_compile_features(vector_addition PUBLIC cxx_std_11)
set_target_properties(vector_addition PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET vector_addition PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")

# Build the SYCL code
add_executable (sycl_vector_addition vector_addition.cpp)
target_compile_features(sycl_vector_addition PUBLIC cxx_std_17)
target_compile_options(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
target_include_directories(sycl_vector_addition PUBLIC ${SYCL_INCLUDE_DIR})
target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_LIB})

104 changes: 104 additions & 0 deletions example-01/README.md
@@ -0,0 +1,104 @@
Example 01: Vector addition
===============================

This trivial example can be used to compare a simple vector addition in
CUDA to an equivalent implementation in SYCL for CUDA.
The aim of the example is also to highlight how to build an application
with SYCL for CUDA using DPC++ support, for which an example CMakefile is
provided.
For detailed documentation on how to migrate from CUDA to SYCL, see
[SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers).

Note that the CUDA backend does not currently support the
[USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc)
extension, so we use `sycl::buffer` and `sycl::accessor` instead.

Pre-requisites
---------------

You will need an installation of DPC++ with CUDA support;
see the [Getting Started Guide](https://github.com/codeplaysoftware/sycl-for-cuda/blob/cuda/sycl/doc/GetStartedWithSYCLCompiler.md)
for details on how to build it.

The example has been built with CMake 3.13.3 and nvcc 10.1.243.

Building the example
---------------------

```sh
$ mkdir build && cd build
$ cmake ../ -DSYCL_ROOT=/path/to/dpc++/install \
-DCMAKE_CXX_COMPILER=/path/to/dpc++/install/bin/clang++
$ make -j 8
```

This should produce two binaries, `vector_addition` and `sycl_vector_addition`.
The former is the unmodified CUDA source and the second is the SYCL for CUDA
version.

Running the example
--------------------

The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
A simple way of running the app is as follows:

```
$ LD_LIBRARY_PATH=$HOME/open-source/sycl4cuda/lib ./sycl_vector_addition
```

Note the `SYCL_BE` env variable is not required, since we use a custom
device selector.

CMake Build script
------------------------

The provided CMake build script uses CMake's native CUDA support to build the
CUDA application. This also serves as a check that the CUDA requirements
(such as a working CUDA installation) are available on the system.

Two flags are required: `-DSYCL_ROOT`, which must point to the place where the
DPC++ compiler is installed, and `-DCMAKE_CXX_COMPILER`, which must point to
the Clang compiler provided by DPC++.

The CMake target `sycl_vector_addition` will build the SYCL version of
the application.
Note that the variable `SYCL_FLAGS` stores the Clang flags that enable
the compilation of a SYCL application (`-fsycl`) as well as the flag that
specifies which targets are built (`-fsycl-targets`).
In this case, we build the example for both NVPTX and SPIR64.
This means the vector addition kernel is compiled for both backends, and
the device selected at runtime determines which variant is used.

Note the project is built with C++17 support, which enables the usage of
[deduction guides](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) to reduce the number of template parameters used.

SYCL Vector Addition code
--------------------------

The vector addition example uses a straightforward approach: a plain kernel
performs the addition, with the vectors stored directly in buffers.
Data is initialized on the host using host accessors.
This approach avoids creating unnecessary storage on the host and makes it
easier for the SYCL runtime to use optimized memory paths.

The SYCL queue created later on uses a custom `CUDASelector` to select
a CUDA device, or to fail if none is present.
The CUDA selector uses `info::device::driver_version` to identify the
device exported by the CUDA backend.
This is important because, if the NVIDIA OpenCL implementation is available
on the system, it will be reported as another SYCL device, so checking the
driver version is the best way to differentiate between the two.

The command group is created as a lambda expression that takes the
`sycl::handler` parameter. Accessors are obtained from buffers using the
`get_access` method.
Finally, the `parallel_for` with the SYCL kernel is invoked as usual.

The command group is submitted to a queue, which converts the operations
into CUDA commands; these are executed once the host accessor is
encountered later on.

The host accessor will trigger a copy of the data back to the host, and
then the values are reduced into a single sum element.

75 changes: 75 additions & 0 deletions example-01/vector_addition.cpp
@@ -0,0 +1,75 @@

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

#include <CL/sycl.hpp>


class CUDASelector : public cl::sycl::device_selector {
public:
int operator()(const cl::sycl::device &Device) const override {
using namespace cl::sycl::info;

const std::string DriverVersion = Device.get_info<device::driver_version>();

if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
std::cout << " CUDA device found " << std::endl;
return 1;
}
return -1;
}
};

class vec_add;
int main( int argc, char* argv[] ) {
constexpr const size_t N = 100000;
const sycl::range VecSize{N};

sycl::buffer<double> bufA{VecSize};
sycl::buffer<double> bufB{VecSize};
sycl::buffer<double> bufC{VecSize};

// Initialize input data
{
const auto dwrite_t = sycl::access::mode::discard_write;

auto h_a = bufA.get_access<dwrite_t>();
auto h_b = bufB.get_access<dwrite_t>();
for ( int i = 0; i < N; i++ ) {
h_a[i] = std::sin(i) * std::sin(i);
h_b[i] = std::cos(i) * std::cos(i);
}
}

sycl::queue myQueue{CUDASelector()};

// Command Group creation
auto cg = [&](sycl::handler& h) {
const auto read_t = sycl::access::mode::read;
const auto write_t = sycl::access::mode::write;

auto a = bufA.get_access<read_t>(h);
auto b = bufB.get_access<read_t>(h);
auto c = bufC.get_access<write_t>(h);

h.parallel_for<vec_add>(VecSize, [=](sycl::id<1> i) {
c[i] = a[i] + b[i];
});
};

myQueue.submit(cg);

{
const auto read_t = sycl::access::mode::read;
auto h_c = bufC.get_access<read_t>();
double sum = 0.0;
for(int i = 0; i < N; i++ ) {
sum += h_c[i];
}
std::cout << "Sum is : " << sum << std::endl;
}

return 0;
}

88 changes: 88 additions & 0 deletions example-01/vector_addition.cu
@@ -0,0 +1,88 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
// Get our global thread ID
int id = blockIdx.x*blockDim.x+threadIdx.x;

// Make sure we do not go out of bounds
if (id < n)
c[id] = a[id] + b[id];
}

int main( int argc, char* argv[] )
{
// Size of vectors
int n = 100000;

// Host input vectors
double *h_a;
double *h_b;
//Host output vector
double *h_c;

// Device input vectors
double *d_a;
double *d_b;
//Device output vector
double *d_c;

// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);

// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);

// Allocate memory for each vector on GPU
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);

int i;
// Initialize vectors on host
for( i = 0; i < n; i++ ) {
h_a[i] = sin(i)*sin(i);
h_b[i] = cos(i)*cos(i);
}

// Copy host vectors to device
cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

int blockSize, gridSize;

// Number of threads in each thread block
blockSize = 1024;

// Number of thread blocks in grid
gridSize = (int)ceil((float)n/blockSize);

// Execute the kernel
vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);

// Copy array back to host
cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );

// Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);

// Release device memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

// Release host memory
free(h_a);
free(h_b);
free(h_c);

return 0;
}