Skip to content

Commit

Permalink
[SYCL][ESIMD][Doc] Add invoke_simd SLM example (intel#10058)
Browse files Browse the repository at this point in the history
Similar to my previous PR, this change adds an example of using SLM with
invoke_simd. We set up SLM in SYCL code and then do our computation in
ESIMD code.

---------

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
  • Loading branch information
sarnex authored Jun 29, 2023
1 parent 1bb5648 commit c99668e
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ is to show the basic ESIMD APIs in well known examples.
}).wait_and_throw();
```
2) Calling ESIMD from SYCL using invoke_simd - ["invoke_simd"](./invoke_simd.md).
Please see the full source code here: ["invoke_simd"](./invoke_simd.md)
```c++
[[intel::device_indirectly_callable]] simd<int, VL> __regcall scale(
Expand Down Expand Up @@ -65,4 +66,50 @@ is to show the basic ESIMD APIs in well known examples.
});
});
```
3) TODO: Add more examples here.
3) Calling ESIMD from SYCL using invoke_simd and Shared Local Memory (SLM) - ["invoke_simd_slm"](./invoke_simd_slm.md).

Please see the full source code here: ["invoke_simd_slm"](./invoke_simd_slm.md)
```c++
[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall invoke_slm_load_store(
local_accessor<int, 1> *local_acc, uint32_t slm_byte_offset, int *in, int *out,
simd<uint32_t, VL> global_byte_offsets) SYCL_ESIMD_FUNCTION {
esimd::simd<uint32_t, VL> esimd_global_byte_offsets = global_byte_offsets;
// Read SLM in ESIMD context.
auto local1 = esimd::block_load<int, VL>(*local_acc, slm_byte_offset);
auto local2 = esimd::block_load<int, VL>(*local_acc, slm_byte_offset +
LOCAL_RANGE * sizeof(int));
auto global = esimd::gather(in, esimd_global_byte_offsets);
auto res = global + local1 + local2;
esimd::scatter(out, esimd_global_byte_offsets, res);
}

int main(void) {
auto *in = malloc_shared<int>(GLOBAL_RANGE, q);
auto *out = malloc_shared<int>(GLOBAL_RANGE, q);
...
q.submit([&](handler &cgh) {
auto local_acc = local_accessor<int, 1>(LOCAL_RANGE * 2, cgh);
cgh.parallel_for(nd_range, [=](nd_item<1> item) {
uint32_t global_id = item.get_global_id(0);
uint32_t local_id = item.get_local_id(0);
// Write/initialize SLM in SYCL context.
auto local_acc_copy = local_acc;
local_acc_copy[local_id] = global_id * 2;
local_acc_copy[local_id + LOCAL_RANGE] = global_id * 3;
item.barrier();

uint32_t la_byte_offset = (local_id / VL) * VL * sizeof(int);
uint32_t global_byte_offset = global_id * sizeof(int);
sycl::sub_group sg = item.get_sub_group();
// Pass the local-accessor to initialized SLM memory to ESIMD context.
// Pointer to a local copy of the local accessor is passed instead of a local-accessor value now
// to work-around a known/temporary issue in GPU driver.
auto local_acc_arg = uniform{&local_acc_copy};
invoke_simd(sg, invoke_slm_load_store, local_acc_arg,
uniform{la_byte_offset}, uniform{in}, uniform{out},
global_byte_offset);
});
})
```

6) TODO: Add more examples here.
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
### Call ESIMD from SYCL using invoke_simd and Shared Local Memory (SLM)

In this example, we will manipulate Shared Local Memory (SLM) with SYCL and `invoke_simd`.

Compile and run:
```bash
> clang++ -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr invoke_simd_slm.cpp

> IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 ONEAPI_DEVICE_SELECTOR=level_zero:gpu ./a.out
Running on Intel(R) UHD Graphics 630
Passed
```
Source code:
```c++
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/oneapi/experimental/invoke_simd.hpp>
#include <sycl/sycl.hpp>
#include <iostream>

using namespace sycl;
using namespace sycl::ext::oneapi::experimental;
namespace esimd = sycl::ext::intel::esimd;

constexpr int VL = 16;
constexpr uint32_t LOCAL_RANGE = VL * 2; // 2 sub-groups per 1 group.
constexpr uint32_t GLOBAL_RANGE = LOCAL_RANGE * 2; // 2 groups.

[[intel::device_indirectly_callable]] SYCL_EXTERNAL void __regcall invoke_slm_load_store(
local_accessor<int, 1> *local_acc, uint32_t slm_byte_offset, int *in, int *out,
simd<uint32_t, VL> global_byte_offsets) SYCL_ESIMD_FUNCTION {
esimd::simd<uint32_t, VL> esimd_global_byte_offsets = global_byte_offsets;
// Read SLM in ESIMD context.
auto local1 = esimd::block_load<int, VL>(*local_acc, slm_byte_offset);
auto local2 = esimd::block_load<int, VL>(*local_acc, slm_byte_offset +
LOCAL_RANGE * sizeof(int));
auto global = esimd::gather(in, esimd_global_byte_offsets);
auto res = global + local1 + local2;
esimd::scatter(out, esimd_global_byte_offsets, res);
}

int main(void) {
auto q = queue{gpu_selector_v};
auto dev = q.get_device();
std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
<< std::endl;

sycl::nd_range<1> nd_range{range<1>{GLOBAL_RANGE}, range<1>{LOCAL_RANGE}};

auto *in = malloc_shared<int>(GLOBAL_RANGE, q);
auto *out = malloc_shared<int>(GLOBAL_RANGE, q);

for (auto i = 0; i < GLOBAL_RANGE; i++) {
in[i] = i;
out[i] = 0;
}
try {
q.submit([&](handler &cgh) {
auto local_acc = local_accessor<int, 1>(LOCAL_RANGE * 2, cgh);
cgh.parallel_for(nd_range, [=](nd_item<1> item) {
uint32_t global_id = item.get_global_id(0);
uint32_t local_id = item.get_local_id(0);
// Write/initialize SLM in SYCL context.
auto local_acc_copy = local_acc;
local_acc_copy[local_id] = global_id * 2;
local_acc_copy[local_id + LOCAL_RANGE] = global_id * 3;
item.barrier();

uint32_t la_byte_offset = (local_id / VL) * VL * sizeof(int);
uint32_t global_byte_offset = global_id * sizeof(int);
sycl::sub_group sg = item.get_sub_group();
// Pass the local-accessor to initialized SLM memory to ESIMD context.
// Pointer to a local copy of the local accessor is passed instead of a local-accessor value now
// to work-around a known/temporary issue in GPU driver.
auto local_acc_arg = uniform{&local_acc_copy};
invoke_simd(sg, invoke_slm_load_store, local_acc_arg,
uniform{la_byte_offset}, uniform{in}, uniform{out},
global_byte_offset);
});
}).wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(in, q);
free(out, q);
return e.code().value();
}

bool pass = true;
for (auto i = 0; i < GLOBAL_RANGE; i++) {
int expected = in[i] + i * 5;
if (out[i] != expected) {
std::cout << "Error: out[" << i << "]:" << out[i]
<< " != [expected]:" << expected << std::endl;
pass = false;
}
}

free(in, q);
free(out, q);

std::cout << (pass ? "Passed" : "FAILED") << std::endl;
return pass ? 0 : 1;
}
```

0 comments on commit c99668e

Please sign in to comment.