-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Vector addition using SYCL for CUDA (#1)
* Vector addition using SYCL for CUDA Added trivial example of vector addition using SYCL for CUDA support in DPC++. Includes a simple CMake build configuration to call the DPC++ compiler and build with both CUDA and SPIR support. Co-Authored-By: John Lawson <john@codeplay.com>
- Loading branch information
Showing
5 changed files
with
322 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
SYCL for CUDA examples | ||
========================== | ||
|
||
This repository contains examples that demonstrate how to use the CUDA backend | ||
in SYCL. | ||
|
||
The examples are built and tested on Linux with GCC 7.4, NVCC 10.1 and the
experimental support for CUDA in the DPC++ SYCL implementation. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
project(cmake_and_cuda LANGUAGES CXX CUDA)

include(CTest)

# Locate the DPC++ installation; the user must pass -DSYCL_ROOT on the
# cmake command line.
if(NOT SYCL_ROOT)
  message(FATAL_ERROR "No SYCL installation detected")
endif()

# Paths and compiler flags derived from the DPC++ install tree.
set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
set(SYCL_FLAGS
    "-fsycl"
    "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice,spir64-unknown-linux-sycldevice"
    "-fsycl-unnamed-lambda")

# Reference implementation, built with the native CUDA toolchain.
add_executable(vector_addition vector_addition.cu)
target_compile_features(vector_addition PUBLIC cxx_std_11)
set_target_properties(vector_addition PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET vector_addition
             PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")

# SYCL version of the same example, compiled with the DPC++ clang driver.
add_executable(sycl_vector_addition vector_addition.cpp)
target_compile_features(sycl_vector_addition PUBLIC cxx_std_17)
target_compile_options(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
# The SYCL flags must also reach the link line so device code is linked.
target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
target_include_directories(sycl_vector_addition PUBLIC ${SYCL_INCLUDE_DIR})
target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_LIB})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
Example 01: Vector addition | ||
=============================== | ||
|
||
This trivial example can be used to compare a simple vector addition in | ||
CUDA to an equivalent implementation in SYCL for CUDA. | ||
The aim of the example is also to highlight how to build an application | ||
with SYCL for CUDA using DPC++ support, for which an example CMakefile is | ||
provided. | ||
For detailed documentation on how to migrate from CUDA to SYCL, see | ||
[SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers). | ||
|
||
Note currently the CUDA backend does not support the | ||
[USM](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc) | ||
extension, so we use `sycl::buffer` and `sycl::accessors` instead. | ||
|
||
Pre-requisites | ||
--------------- | ||
|
||
You would need an installation of DPC++ with CUDA support, | ||
see [Getting Started Guide](https://github.com/codeplaysoftware/sycl-for-cuda/blob/cuda/sycl/doc/GetStartedWithSYCLCompiler.md) | ||
for details on how to build it. | ||
|
||
The example has been built on CMake 3.13.3 and nvcc 10.1.243. | ||
|
||
Building the example | ||
--------------------- | ||
|
||
```sh | ||
$ mkdir build && cd build
$ cmake ../ -DSYCL_ROOT=/path/to/dpc++/install \ | ||
-DCMAKE_CXX_COMPILER=/path/to/dpc++/install/bin/clang++ | ||
$ make -j 8 | ||
``` | ||
|
||
This should produce two binaries, `vector_addition` and `sycl_vector_addition`. | ||
The former is the unmodified CUDA source and the second is the SYCL for CUDA | ||
version. | ||
|
||
Running the example | ||
-------------------- | ||
|
||
The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`. | ||
A simple way of running the app is as follows: | ||
|
||
``` | ||
$ LD_LIBRARY_PATH=$HOME/open-source/sycl4cuda/lib ./sycl_vector_addition | ||
``` | ||
Note the `SYCL_BE` env variable is not required, since we use a custom | ||
device selector. | ||
CMake Build script | ||
------------------------ | ||
The provided CMake build script uses the native CUDA support to build the | ||
CUDA application. It also serves as a check that all CUDA requirements | ||
on the system are available (such as an installation of CUDA on the system). | ||
Two flags are required: `-DSYCL_ROOT`, which must point to the place where the | ||
DPC++ compiler is installed, and `-DCMAKE_CXX_COMPILER`, which must point to | ||
the Clang compiler provided by DPC++. | ||
The CMake target `sycl_vector_addition` will build the SYCL version of | ||
the application. | ||
Note the variable `SYCL_FLAGS` is used to store the Clang flags that enable | ||
the compilation of a SYCL application (`-fsycl`) but also the flags that specify
which targets are built (`-fsycl-targets`). | ||
In this case, we will build the example for both NVPTX and SPIR64. | ||
This means the kernel for the vector addition will be compiled for both | ||
backends, and runtime selection to the right queue will decide which variant | ||
to use. | ||
Note the project is built with C++17 support, which enables the usage of | ||
[deduction guides](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) to reduce the number of template parameters used. | ||
SYCL Vector Addition code | ||
-------------------------- | ||
The vector addition example uses a simple approach to implement with a plain | ||
kernel that performs the add. Vectors are stored directly in buffers. | ||
Data is initialized on the host using host accessors. | ||
This approach avoids creating unnecessary storage on the host, and facilitates | ||
the SYCL runtime to use optimized memory paths. | ||
The SYCL queue created later on uses a custom `CUDASelector` to select | ||
a CUDA device, or bail out if it's not there.
The CUDA selector uses the `info::device::driver_version` to identify the | ||
device exported by the CUDA backend. | ||
If the NVIDIA OpenCL implementation is available on the | ||
system, it will be reported as another SYCL device. The driver | ||
version is the best way to differentiate between the two. | ||
The command group is created as a lambda expression that takes the | ||
`sycl::handler` parameter. Accessors are obtained from buffers using the | ||
`get_access` method. | ||
Finally the `parallel_for` with the SYCL kernel is invoked as usual. | ||
The command group is submitted to a queue which will convert all the | ||
operations into CUDA commands that will be executed once the host accessor | ||
is encountered later on. | ||
The host accessor will trigger a copy of the data back to the host, and | ||
then the values are reduced into a single sum element. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
/** | ||
* SYCL FOR CUDA : Vector Addition Example | ||
* | ||
* Copyright 2020 Codeplay Software Ltd. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
* | ||
* @File: vector_addition.cpp | ||
*/ | ||
|
||
#include <algorithm> | ||
#include <iostream> | ||
#include <vector> | ||
|
||
#include <CL/sycl.hpp> | ||
|
||
// Device selector that scores only CUDA-backed GPUs positively.
//
// The driver-version string is used to tell the DPC++ CUDA backend apart
// from NVIDIA's OpenCL implementation, which may expose the same physical
// GPU as a separate SYCL device.
class CUDASelector : public sycl::device_selector {
public:
  // Returns 1 for a GPU whose driver version mentions "CUDA", -1 otherwise
  // (negative scores exclude a device from selection).
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string Version = Device.get_info<device::driver_version>();
    const bool IsCudaGpu =
        Device.is_gpu() && Version.find("CUDA") != std::string::npos;

    if (!IsCudaGpu) {
      return -1;
    }

    std::cout << " CUDA device found " << std::endl;
    return 1;
  }
};
|
||
class vec_add; | ||
int main(int argc, char *argv[]) { | ||
constexpr const size_t N = 100000; | ||
const sycl::range VecSize{N}; | ||
|
||
sycl::buffer<double> bufA{VecSize}; | ||
sycl::buffer<double> bufB{VecSize}; | ||
sycl::buffer<double> bufC{VecSize}; | ||
|
||
// Initialize input data | ||
{ | ||
const auto dwrite_t = sycl::access::mode::discard_write; | ||
|
||
auto h_a = bufA.get_access<dwrite_t>(); | ||
auto h_b = bufB.get_access<dwrite_t>(); | ||
for (int i = 0; i < N; i++) { | ||
h_a[i] = sin(i) * sin(i); | ||
h_b[i] = cos(i) * cos(i); | ||
} | ||
} | ||
|
||
sycl::queue myQueue{CUDASelector()}; | ||
|
||
// Command Group creation | ||
auto cg = [&](sycl::handler &h) { | ||
const auto read_t = sycl::access::mode::read; | ||
const auto write_t = sycl::access::mode::write; | ||
|
||
auto a = bufA.get_access<read_t>(h); | ||
auto b = bufB.get_access<read_t>(h); | ||
auto c = bufC.get_access<write_t>(h); | ||
|
||
h.parallel_for<vec_add>(VecSize, | ||
[=](sycl::id<1> i) { c[i] = a[i] + b[i]; }); | ||
}; | ||
|
||
myQueue.submit(cg); | ||
|
||
{ | ||
const auto write_t = sycl::access::mode::read; | ||
auto h_c = bufC.get_access<write_t>(); | ||
double sum = 0.0f; | ||
for (int i = 0; i < N; i++) { | ||
sum += h_c[i]; | ||
} | ||
std::cout << "Sum is : " << sum << std::endl; | ||
} | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
// Original source reproduced unmodified here from: | ||
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu | ||
|
||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
|
||
// CUDA kernel. Each thread takes care of one element of c | ||
// CUDA kernel: element-wise c = a + b. One thread per element; expects a
// 1-D launch where gridDim.x * blockDim.x >= n.
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Flat global index for this thread.
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;

  // Guard the grid tail: the last block may extend past n elements.
  if (idx >= n)
    return;

  c[idx] = a[idx] + b[idx];
}
|
||
// Abort with a diagnostic if a CUDA runtime call failed. CUDA API errors
// are otherwise silent and surface only as garbage results later.
static void checkCuda(cudaError_t err, const char *what) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Host driver: allocates vectors, runs vecAdd on the GPU, and prints the
// mean of the result (expected: 1.0, since sin^2 + cos^2 == 1).
int main(int argc, char *argv[]) {
  // Size of vectors
  int n = 100000;

  // Host input vectors
  double *h_a;
  double *h_b;
  // Host output vector
  double *h_c;

  // Device input vectors
  double *d_a;
  double *d_b;
  // Device output vector
  double *d_c;

  // Size, in bytes, of each vector
  size_t bytes = n * sizeof(double);

  // Allocate memory for each vector on host, checking for failure.
  h_a = (double *)malloc(bytes);
  h_b = (double *)malloc(bytes);
  h_c = (double *)malloc(bytes);
  if (h_a == NULL || h_b == NULL || h_c == NULL) {
    fprintf(stderr, "host allocation failed\n");
    return EXIT_FAILURE;
  }

  // Allocate memory for each vector on GPU.
  checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc d_a");
  checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc d_b");
  checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc d_c");

  int i;
  // Initialize vectors on host
  for (i = 0; i < n; i++) {
    h_a[i] = sin(i) * sin(i);
    h_b[i] = cos(i) * cos(i);
  }

  // Copy host vectors to device
  checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "H2D a");
  checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "H2D b");

  // Number of threads in each thread block
  int blockSize = 1024;

  // Number of thread blocks in grid: integer ceil-division. (The original
  // float-based ceil() can lose precision for large n.)
  int gridSize = (n + blockSize - 1) / blockSize;

  // Execute the kernel; launch-configuration errors are only reported via
  // cudaGetLastError(), so check it explicitly.
  vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
  checkCuda(cudaGetLastError(), "vecAdd launch");

  // Copy array back to host (blocking, so it also surfaces any
  // asynchronous kernel execution error).
  checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "D2H c");

  // Sum up vector c and print result divided by n, this should equal 1
  // within error
  double sum = 0;
  for (i = 0; i < n; i++)
    sum += h_c[i];
  printf("final result: %f\n", sum / n);

  // Release device memory
  checkCuda(cudaFree(d_a), "cudaFree d_a");
  checkCuda(cudaFree(d_b), "cudaFree d_b");
  checkCuda(cudaFree(d_c), "cudaFree d_c");

  // Release host memory
  free(h_a);
  free(h_b);
  free(h_c);

  return 0;
}