diff --git a/README b/README
new file mode 100644
index 0000000..2b00e2e
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+SYCL for CUDA examples
+==========================
+
+This repository contains examples that demonstrate how to use the CUDA backend
+in SYCL.
+
+The examples are built and tested in Linux with GCC 7.4, NVCC 10.1 and the
+experimental support for CUDA in the DPC++ SYCL implementation.
diff --git a/example-01/CMakeLists.txt b/example-01/CMakeLists.txt
new file mode 100644
index 0000000..fbde766
--- /dev/null
+++ b/example-01/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+project(cmake_and_cuda LANGUAGES CXX CUDA)
+
+include(CTest)
+
+# SYCL installation
+if (NOT SYCL_ROOT)
+  message(FATAL_ERROR "No SYCL installation detected")
+endif(NOT SYCL_ROOT)
+
+set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
+set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
+set(SYCL_FLAGS "-fsycl"
+  "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice,spir64-unknown-linux-sycldevice"
+  "-fsycl-unnamed-lambda")
+
+# Build the CUDA code
+add_executable(vector_addition vector_addition.cu)
+target_compile_features(vector_addition PUBLIC cxx_std_11)
+set_target_properties(vector_addition PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+set_property(TARGET vector_addition PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
+
+# Build the SYCL code
+add_executable (sycl_vector_addition vector_addition.cpp)
+target_compile_features(sycl_vector_addition PUBLIC cxx_std_17)
+target_compile_options(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
+target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
+target_include_directories(sycl_vector_addition PUBLIC ${SYCL_INCLUDE_DIR})
+target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_LIB})
+
diff --git a/example-01/README.md b/example-01/README.md
new file mode 100644
index 0000000..4f609b5
--- /dev/null
+++ b/example-01/README.md
@@ -0,0 +1,103 @@
+Example 01: Vector addition
+===============================
+
+This trivial example can be used to compare a simple vector addition in
+CUDA to an equivalent implementation in SYCL for CUDA.
+The aim of the example is also to highlight how to build an application
+with SYCL for CUDA using DPC++ support, for which an example CMakefile is
+provided.
+For detailed documentation on how to migrate from CUDA to SYCL, see
+[SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers).
+
+Note currently the CUDA backend does not support the
+[USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc)
+extension, so we use `sycl::buffer` and `sycl::accessors` instead.
+
+Pre-requisites
+---------------
+
+You would need an installation of DPC++ with CUDA support,
+see [Getting Started Guide](https://github.com/codeplaysoftware/sycl-for-cuda/blob/cuda/sycl/doc/GetStartedWithSYCLCompiler.md)
+for details on how to build it.
+
+The example has been built on CMake 3.13.3 and nvcc 10.1.243.
+
+Building the example
+---------------------
+
+```sh
+$ mkdir build && cd build
+$ cmake ../ -DSYCL_ROOT=/path/to/dpc++/install \
+    -DCMAKE_CXX_COMPILER=/path/to/dpc++/install/bin/clang++
+$ make -j 8
+```
+
+This should produce two binaries, `vector_addition` and `sycl_vector_addition`.
+The former is the unmodified CUDA source and the second is the SYCL for CUDA
+version.
+
+Running the example
+--------------------
+
+The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
+A simple way of running the app is as follows:
+
+```
+$ LD_LIBRARY_PATH=$HOME/open-source/sycl4cuda/lib ./sycl_vector_addition
+```
+
+Note the `SYCL_BE` env variable is not required, since we use a custom
+device selector.
+
+CMake Build script
+------------------------
+
+The provided CMake build script uses the native CUDA support to build the
+CUDA application.
It also serves as a check that all CUDA requirements
+on the system are available (such as an installation of CUDA on the system).
+
+Two flags are required: `-DSYCL_ROOT`, which must point to the place where the
+DPC++ compiler is installed, and `-DCMAKE_CXX_COMPILER`, which must point to
+the Clang compiler provided by DPC++.
+
+The CMake target `sycl_vector_addition` will build the SYCL version of
+the application.
+Note the variable `SYCL_FLAGS` is used to store the Clang flags that enable
+the compilation of a SYCL application (`-fsycl`) but also the flag that specifies
+which targets are built (`-fsycl-targets`).
+In this case, we will build the example for both NVPTX and SPIR64.
+This means the kernel for the vector addition will be compiled for both
+backends, and runtime selection to the right queue will decide which variant
+to use.
+
+Note the project is built with C++17 support, which enables the usage of
+[deduction guides](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) to reduce the number of template parameters used.
+
+SYCL Vector Addition code
+--------------------------
+
+The vector addition example uses a simple approach to implement with a plain
+kernel that performs the add. Vectors are stored directly in buffers.
+Data is initialized on the host using host accessors.
+This approach avoids creating unnecessary storage on the host, and facilitates
+the SYCL runtime to use optimized memory paths.
+
+The SYCL queue created later on uses a custom `CUDASelector` to select
+a CUDA device, or bail out if it's not there.
+The CUDA selector uses the `info::device::driver_version` to identify the
+device exported by the CUDA backend.
+If the NVIDIA OpenCL implementation is available on the
+system, it will be reported as another SYCL device. The driver
+version is the best way to differentiate between the two.
+
+The command group is created as a lambda expression that takes the
+`sycl::handler` parameter. Accessors are obtained from buffers using the
+`get_access` method.
+Finally, the `parallel_for` with the SYCL kernel is invoked as usual.
+
+The command group is submitted to a queue which will convert all the
+operations into CUDA commands that will be executed once the host accessor
+is encountered later on.
+
+The host accessor will trigger a copy of the data back to the host, and
+then the values are reduced into a single sum element.
diff --git a/example-01/vector_addition.cpp b/example-01/vector_addition.cpp
new file mode 100644
index 0000000..53f1928
--- /dev/null
+++ b/example-01/vector_addition.cpp
@@ -0,0 +1,91 @@
+/**
+ * SYCL FOR CUDA : Vector Addition Example
+ *
+ * Copyright 2020 Codeplay Software Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * + * @File: vector_addition.cpp + */ + +#include +#include +#include + +#include + +class CUDASelector : public sycl::device_selector { +public: + int operator()(const sycl::device &Device) const override { + using namespace sycl::info; + + const std::string DriverVersion = Device.get_info(); + + if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) { + std::cout << " CUDA device found " << std::endl; + return 1; + }; + return -1; + } +}; + +class vec_add; +int main(int argc, char *argv[]) { + constexpr const size_t N = 100000; + const sycl::range VecSize{N}; + + sycl::buffer bufA{VecSize}; + sycl::buffer bufB{VecSize}; + sycl::buffer bufC{VecSize}; + + // Initialize input data + { + const auto dwrite_t = sycl::access::mode::discard_write; + + auto h_a = bufA.get_access(); + auto h_b = bufB.get_access(); + for (int i = 0; i < N; i++) { + h_a[i] = sin(i) * sin(i); + h_b[i] = cos(i) * cos(i); + } + } + + sycl::queue myQueue{CUDASelector()}; + + // Command Group creation + auto cg = [&](sycl::handler &h) { + const auto read_t = sycl::access::mode::read; + const auto write_t = sycl::access::mode::write; + + auto a = bufA.get_access(h); + auto b = bufB.get_access(h); + auto c = bufC.get_access(h); + + h.parallel_for(VecSize, + [=](sycl::id<1> i) { c[i] = a[i] + b[i]; }); + }; + + myQueue.submit(cg); + + { + const auto write_t = sycl::access::mode::read; + auto h_c = bufC.get_access(); + double sum = 0.0f; + for (int i = 0; i < N; i++) { + sum += h_c[i]; + } + std::cout << "Sum is : " << sum << std::endl; + } + + return 0; +} diff --git a/example-01/vector_addition.cu b/example-01/vector_addition.cu new file mode 100644 index 0000000..4155f8a --- /dev/null +++ b/example-01/vector_addition.cu @@ -0,0 +1,90 @@ +// Original source reproduced unmodified here from: +// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu + +#include +#include +#include + +// CUDA kernel. 
+// Each thread takes care of one element of c
+__global__ void vecAdd(double *a, double *b, double *c, int n) {
+  // Get our global thread ID
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Make sure we do not go out of bounds
+  if (id < n)
+    c[id] = a[id] + b[id];
+}
+
+int main(int argc, char *argv[]) {
+  // Size of vectors
+  int n = 100000;
+
+  // Host input vectors
+  double *h_a;
+  double *h_b;
+  // Host output vector
+  double *h_c;
+
+  // Device input vectors
+  double *d_a;
+  double *d_b;
+  // Device output vector
+  double *d_c;
+
+  // Size, in bytes, of each vector
+  size_t bytes = n * sizeof(double);
+
+  // Allocate memory for each vector on host
+  h_a = (double *)malloc(bytes);
+  h_b = (double *)malloc(bytes);
+  h_c = (double *)malloc(bytes);
+
+  // Allocate memory for each vector on GPU
+  cudaMalloc(&d_a, bytes);
+  cudaMalloc(&d_b, bytes);
+  cudaMalloc(&d_c, bytes);
+
+  int i;
+  // Initialize vectors on host
+  for (i = 0; i < n; i++) {
+    h_a[i] = sin(i) * sin(i);
+    h_b[i] = cos(i) * cos(i);
+  }
+
+  // Copy host vectors to device
+  cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
+
+  int blockSize, gridSize;
+
+  // Number of threads in each thread block
+  blockSize = 1024;
+
+  // Number of thread blocks in grid
+  gridSize = (int)ceil((float)n / blockSize);
+
+  // Execute the kernel
+  // NOTE(review): the launch configuration was garbled to `<<>>` when this
+  // patch was extracted; restored from the upstream OLCF tutorial source.
+  vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+
+  // Copy array back to host
+  cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
+
+  // Sum up vector c and print result divided by n, this should equal 1 within
+  // error
+  double sum = 0;
+  for (i = 0; i < n; i++)
+    sum += h_c[i];
+  printf("final result: %f\n", sum / n);
+
+  // Release device memory
+  cudaFree(d_a);
+  cudaFree(d_b);
+  cudaFree(d_c);
+
+  // Release host memory
+  free(h_a);
+  free(h_b);
+  free(h_c);
+
+  return 0;
+}