forked from intel/llvm
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SYCL][ESIMD][Doc] Add invoke_simd SLM example (intel#10058)
Similar to my previous PR, this change adds an example of using SLM with invoke_simd. We set up SLM in SYCL code and then do our computation in ESIMD code. --------- Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
- Loading branch information
Showing
2 changed files
with
151 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
103 changes: 103 additions & 0 deletions
103
sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/invoke_simd_slm.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
### Call ESIMD from SYCL using invoke_simd and Shared Local Memory (SLM) | ||
|
||
In this example, we will manipulate Shared Local Memory (SLM) with SYCL and `invoke_simd`. | ||
|
||
Compile and run: | ||
```bash | ||
> clang++ -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr invoke_simd_slm.cpp | ||
|
||
> IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 ONEAPI_DEVICE_SELECTOR=level_zero:gpu ./a.out | ||
Running on Intel(R) UHD Graphics 630 | ||
Passed | ||
``` | ||
Source code: | ||
```c++ | ||
#include <sycl/ext/intel/esimd.hpp> | ||
#include <sycl/ext/oneapi/experimental/invoke_simd.hpp> | ||
#include <sycl/sycl.hpp> | ||
#include <iostream> | ||
|
||
using namespace sycl; | ||
using namespace sycl::ext::oneapi::experimental; | ||
namespace esimd = sycl::ext::intel::esimd; | ||
|
||
constexpr int VL = 16; | ||
constexpr uint32_t LOCAL_RANGE = VL * 2; // 2 sub-groups per 1 group. | ||
constexpr uint32_t GLOBAL_RANGE = LOCAL_RANGE * 2; // 2 groups. | ||
|
||
[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall invoke_slm_load_store( | ||
local_accessor<int, 1> *local_acc, uint32_t slm_byte_offset, int *in, int *out, | ||
simd<uint32_t, VL> global_byte_offsets) SYCL_ESIMD_FUNCTION { | ||
esimd::simd<uint32_t, VL> esimd_global_byte_offsets = global_byte_offsets; | ||
// Read SLM in ESIMD context. | ||
auto local1 = esimd::block_load<int, VL>(*local_acc, slm_byte_offset); | ||
auto local2 = esimd::block_load<int, VL>(*local_acc, slm_byte_offset + | ||
LOCAL_RANGE * sizeof(int)); | ||
auto global = esimd::gather(in, esimd_global_byte_offsets); | ||
auto res = global + local1 + local2; | ||
esimd::scatter(out, esimd_global_byte_offsets, res); | ||
} | ||
|
||
int main(void) { | ||
auto q = queue{gpu_selector_v}; | ||
auto dev = q.get_device(); | ||
std::cout << "Running on " << dev.get_info<sycl::info::device::name>() | ||
<< std::endl; | ||
|
||
sycl::nd_range<1> nd_range{range<1>{GLOBAL_RANGE}, range<1>{LOCAL_RANGE}}; | ||
|
||
auto *in = malloc_shared<int>(GLOBAL_RANGE, q); | ||
auto *out = malloc_shared<int>(GLOBAL_RANGE, q); | ||
|
||
for (auto i = 0; i < GLOBAL_RANGE; i++) { | ||
in[i] = i; | ||
out[i] = 0; | ||
} | ||
try { | ||
q.submit([&](handler &cgh) { | ||
auto local_acc = local_accessor<int, 1>(LOCAL_RANGE * 2, cgh); | ||
cgh.parallel_for(nd_range, [=](nd_item<1> item) { | ||
uint32_t global_id = item.get_global_id(0); | ||
uint32_t local_id = item.get_local_id(0); | ||
// Write/initialize SLM in SYCL context. | ||
auto local_acc_copy = local_acc; | ||
local_acc_copy[local_id] = global_id * 2; | ||
local_acc_copy[local_id + LOCAL_RANGE] = global_id * 3; | ||
item.barrier(); | ||
|
||
uint32_t la_byte_offset = (local_id / VL) * VL * sizeof(int); | ||
uint32_t global_byte_offset = global_id * sizeof(int); | ||
sycl::sub_group sg = item.get_sub_group(); | ||
// Pass the local-accessor to initialized SLM memory to ESIMD context. | ||
// Pointer to a local copy of the local accessor is passed instead of a local-accessor value now | ||
// to work-around a known/temporary issue in GPU driver. | ||
auto local_acc_arg = uniform{&local_acc_copy}; | ||
invoke_simd(sg, invoke_slm_load_store, local_acc_arg, | ||
uniform{la_byte_offset}, uniform{in}, uniform{out}, | ||
global_byte_offset); | ||
}); | ||
}).wait(); | ||
} catch (sycl::exception const &e) { | ||
std::cout << "SYCL exception caught: " << e.what() << '\n'; | ||
free(in, q); | ||
free(out, q); | ||
return e.code().value(); | ||
} | ||
|
||
bool pass = true; | ||
for (auto i = 0; i < GLOBAL_RANGE; i++) { | ||
int expected = in[i] + i * 5; | ||
if (out[i] != expected) { | ||
std::cout << "Error: out[" << i << "]:" << out[i] | ||
<< " != [expected]:" << expected << std::endl; | ||
pass = false; | ||
} | ||
} | ||
|
||
free(in, q); | ||
free(out, q); | ||
|
||
std::cout << (pass ? "Passed" : "FAILED") << std::endl; | ||
return pass ? 0 : 1; | ||
} | ||
``` |