Skip to content

Commit

Permalink
[SYCL][Graph] Update design doc for copy optimization and add test
Browse files Browse the repository at this point in the history
- Update UR tag to include L0 command-buffer copy engine optimization
- Add test which mixes copy and kernel commands
- Update design doc to detail copy engine optimization
  • Loading branch information
mfrancepillois authored and Bensuo committed Jun 11, 2024
1 parent a4c3019 commit 020a5ff
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 4 deletions.
18 changes: 18 additions & 0 deletions sycl/doc/design/CommandGraph.md
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,24 @@ Level Zero:
Future work will include exploring L0 API extensions to improve the mapping of
UR command-buffer to L0 command-list.

#### Copy Engine

For performance considerations, the Unified Runtime Level Zero adapter uses
different Level Zero command-queues to submit compute kernels and memory
operations when the device has a dedicated copy engine. To take advantage of the
copy engine when available, the graph workload can also be split between memory
operations and compute kernels. To achieve this, two graph workload
command-lists live simultaneously in a command-buffer.

When the command-buffer is finalized, memory operations (e.g. buffer copy,
buffer fill, ...) are enqueued in the *copy* command-list while the other
commands are enqueued in the compute command-list. On submission, if not empty,
the *copy* command-list is sent to the main copy command-queue while the compute
command-list is sent to the compute command-queue.

Both are executed concurrently. Synchronization between the command-lists is
handled by Level Zero events.

### CUDA

The SYCL Graph CUDA backend relies on the
Expand Down
8 changes: 4 additions & 4 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 4f105262c30ac231b8db1e250f36e88ef9f0a36d
# Merge: 0f118d75 92fce2ee
set(UNIFIED_RUNTIME_REPO "https://github.com/bensuo/unified-runtime.git")
# commit f06bc02a24418990ecb9d41471b729f32aebd804
# Merge: 42c0b025 9868e3b0
# Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
# Date: Mon Jun 10 13:23:16 2024 +0100
# Merge pull request #1409 from omarahmed1111/Add-CTS-tests-for-image-format
Expand All @@ -110,7 +110,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)

fetch_adapter_source(level_zero
${UNIFIED_RUNTIME_REPO}
${UNIFIED_RUNTIME_TAG}
cmd-buf-copy-queue
)

fetch_adapter_source(opencl
Expand Down
102 changes: 102 additions & 0 deletions sycl/test-e2e/Graph/ValidUsage/linear_graph_l0_copy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
// Extra run to check for immediate-command-list in Level Zero
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
//

// Tests that the optimization to use the L0 Copy Engine for memory commands
// does not interfere with the linear graph optimization

#include "../graph_common.hpp"

#include <sycl/properties/queue_properties.hpp>

int main() {
queue Queue{{sycl::property::queue::in_order{}}};

using T = int;

const T ModValue = 7;
std::vector<T> DataA(Size), DataB(Size), DataC(Size);

std::iota(DataA.begin(), DataA.end(), 1);
std::iota(DataB.begin(), DataB.end(), 10);
std::iota(DataC.begin(), DataC.end(), 1000);

// Create reference data for output
std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
for (size_t i = 0; i < Iterations; i++) {
for (size_t j = 0; j < Size; j++) {
ReferenceA[j] += ModValue;
ReferenceB[j] = ReferenceA[j];
ReferenceB[j] -= ModValue;
ReferenceC[j] = ReferenceB[j];
ReferenceC[j] += ModValue;
}
}

ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
Queue.get_device()};

T *PtrA = malloc_device<T>(Size, Queue);
T *PtrB = malloc_device<T>(Size, Queue);
T *PtrC = malloc_device<T>(Size, Queue);

Queue.copy(DataA.data(), PtrA, Size);
Queue.copy(DataB.data(), PtrB, Size);
Queue.copy(DataC.data(), PtrC, Size);
Queue.wait_and_throw();

Graph.begin_recording(Queue);
Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrA[LinID] += ModValue;
});
});

Queue.submit([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); });

Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrB[LinID] -= ModValue;
});
});

Queue.submit([&](handler &CGH) { CGH.memcpy(PtrC, PtrB, Size * sizeof(T)); });

Queue.submit([&](handler &CGH) {
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
auto LinID = id.get_linear_id();
PtrC[LinID] += ModValue;
});
});

Graph.end_recording();

auto GraphExec = Graph.finalize();

event Event;
for (unsigned n = 0; n < Iterations; n++) {
Event =
Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
}

Queue.copy(PtrA, DataA.data(), Size, Event);
Queue.copy(PtrB, DataB.data(), Size, Event);
Queue.copy(PtrC, DataC.data(), Size, Event);
Queue.wait_and_throw();

free(PtrA, Queue);
free(PtrB, Queue);
free(PtrC, Queue);

for (size_t i = 0; i < Size; i++) {
assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
}
}

0 comments on commit 020a5ff

Please sign in to comment.