This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Add execution policy thrust::cuda::par_nosync #1568

Merged: 5 commits into NVIDIA:main on Dec 14, 2021

Conversation

fkallen
Contributor

@fkallen fkallen commented Nov 12, 2021

This PR adds functionality requested in #1515

  • add cuda_cub::synchronize_optional(policy) which may or may not synchronize the stream, depending on the policy
  • add execution policy thrust::cuda::par_nosync which does not perform optional synchronization
  • replace each call to cuda_cub::synchronize at the end of an algorithm with cuda_cub::synchronize_optional

Open question: synchronize_optional currently does not skip synchronization in device code. Should it stay like this?

I profiled the following example program to verify that no optional synchronization is performed.

#include <iostream>
#include <cstdint>
#include <cassert>

#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <thrust/reverse.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

template<class ExecPolicy>
void executeHost(ExecPolicy policy, std::size_t N){
    thrust::device_vector<std::size_t> d_vec(N);
    thrust::sequence(policy, d_vec.begin(), d_vec.end(), 1);
    thrust::reverse(policy, d_vec.begin(), d_vec.end());

    constexpr int numiters = 15;
    for(int i = 0; i < numiters; i++){
        thrust::for_each(policy, d_vec.begin(), d_vec.end(), [] __host__ __device__ (std::size_t& x){ x = x + 1;});
    }

    std::size_t x = thrust::reduce(policy, d_vec.begin(), d_vec.end());

    cudaDeviceSynchronize();
    assert(cudaSuccess == cudaGetLastError());
    std::size_t expected = (numiters * N) + (N * (N+1)) / 2;
    assert(x == expected);
}

__global__ 
void synckernel(std::size_t* d_vec, std::size_t N){
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    auto policy = thrust::cuda::par.on(stream);
    thrust::sequence(policy, d_vec, d_vec + N, 1);
    thrust::reverse(policy, d_vec, d_vec + N);

    constexpr int numiters = 15;
    for(int i = 0; i < numiters; i++){
        thrust::for_each(policy, d_vec, d_vec + N, [] (std::size_t& x){ x = x + 1;});
    }

    std::size_t x = thrust::reduce(policy, d_vec, d_vec + N);

    cudaDeviceSynchronize();
    assert(cudaSuccess == cudaGetLastError());
    std::size_t expected = (numiters * N) + (N * (N+1)) / 2;
    assert(x == expected);

    cudaStreamDestroy(stream);
}

__global__ 
void nosynckernel(std::size_t* d_vec, std::size_t N){
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    auto policy = thrust::cuda::par_nosync.on(stream);
    thrust::sequence(policy, d_vec, d_vec + N, 1);
    thrust::reverse(policy, d_vec, d_vec + N);

    constexpr int numiters = 15;
    for(int i = 0; i < numiters; i++){
        thrust::for_each(policy, d_vec, d_vec + N, [] (std::size_t& x){ x = x + 1;});
    }

    std::size_t x = thrust::reduce(policy, d_vec, d_vec + N);

    cudaDeviceSynchronize();
    assert(cudaSuccess == cudaGetLastError());
    std::size_t expected = (numiters * N) + (N * (N+1)) / 2;
    assert(x == expected);

    cudaStreamDestroy(stream);
}

void executesynckernel(std::size_t N, cudaStream_t stream){
    thrust::device_vector<std::size_t> d_vec(N);

    synckernel<<<1,1, 0, stream>>>(
        thrust::raw_pointer_cast(d_vec.data()),
        N
    );

    cudaDeviceSynchronize();
    assert(cudaSuccess == cudaGetLastError());
}

void executenosynckernel(std::size_t N, cudaStream_t stream){
    thrust::device_vector<std::size_t> d_vec(N);

    nosynckernel<<<1,1, 0, stream>>>(
        thrust::raw_pointer_cast(d_vec.data()),
        N
    );

    cudaDeviceSynchronize();
    assert(cudaSuccess == cudaGetLastError());
}

int main(){
    std::size_t N = 1'000'000;

    cudaStream_t stream = cudaStreamPerThread;

    auto blockingpolicy = thrust::cuda::par.on(stream);
    executeHost(blockingpolicy, N);

    auto nosyncpolicy = thrust::cuda::par_nosync.on(stream);
    executeHost(nosyncpolicy, N);

    auto nosyncpolicydefaultstream = thrust::cuda::par_nosync;
    executeHost(nosyncpolicydefaultstream, N);

    cudaDeviceSynchronize();
    assert(cudaSuccess == cudaGetLastError());

    executenosynckernel(N, stream);
    executesynckernel(N, stream);
}

(edit: updated code)
[Screenshot: Screenshot_thrust_nosync (profiler output)]

@GPUtester
Collaborator

Can one of the admins verify this patch?

@alliepiper
Collaborator

Thanks for the PR! I'll review it soon.

Open question: synchronize_optional currently does not skip synchronization in device code. Should it stay like this?

I believe it should behave the same as on host. Is there a reason that it can't?

@alliepiper alliepiper linked an issue Nov 15, 2021 that may be closed by this pull request
@alliepiper alliepiper self-assigned this Nov 15, 2021
@alliepiper alliepiper added the P1: should have Necessary, but not critical. label Nov 15, 2021
@alliepiper alliepiper added this to the 1.16.0 milestone Nov 15, 2021
@fkallen
Contributor Author

fkallen commented Nov 22, 2021

I guess it's just that I am not very familiar with dynamic parallelism. For example, I have never seen __cudaDeviceSynchronizeDeprecationAvoidance before.

However, I got it working and also found a bug. Synchronization can't be skipped when followed by a call to get_value. While stream ordering ensures on the host that the result has been calculated before the transfer, that may not be the case on the device, since get_value there only dereferences a pointer. So I reverted to the usual synchronization method for the corresponding algorithms.

This of course means that there is an "unnecessary" synchronization call in the host path. One could instead add synchronization in the device path of get_value before dereferencing, but I cannot assess the implications of this, so I am leaving it as is.
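The distinction can be illustrated with a hedged sketch (the vector, stream, and function name are assumptions for the example, not code from the PR): algorithms that only enqueue work can skip the final synchronization under par_nosync, while algorithms that return a value to the caller, such as thrust::reduce, must still wait for the result before reading it back.

```cpp
// Sketch only: shows which calls may skip synchronization under par_nosync.
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

void sketch(thrust::device_vector<int>& v, cudaStream_t stream) {
    auto policy = thrust::cuda::par_nosync.on(stream);

    // No value is returned: work is merely enqueued on `stream`, and the
    // blocking synchronization at the end of the algorithm can be skipped.
    thrust::for_each(policy, v.begin(), v.end(),
                     [] __device__ (int& x) { ++x; });

    // A value is returned to the host: the implementation must wait for the
    // result before transferring it back (the get_value path), so this call
    // synchronizes even under par_nosync.
    int sum = thrust::reduce(policy, v.begin(), v.end());
    (void)sum;
}
```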

Judging from the kernel execution times, dynamic parallelism shows the same performance improvement as the host path. However, my profiler does not show in-kernel synchronization events, so it's hard to tell for certain whether synchronization is actually skipped.

[Screenshot: thrust_nosync_dp (profiler output)]

@brycelelbach
Collaborator

This is great, thanks for the PR.

@brycelelbach
Collaborator

@allisonvacanti we should probably prioritize this fairly highly, as a lot of folks have been asking for this.

@fkallen does this include documentation for this as well?

@fkallen
Contributor Author

fkallen commented Nov 24, 2021

@brycelelbach Currently it's only code changes. No separate documentation, and no tests either except for the toy example above.
Is there documentation for the usual thrust::cuda::par which I could use as a reference?

@alliepiper
Collaborator

I'll do a full review soon, hopefully this week. I'm planning to include this in the next release.

I guess it's just that I am not very familiar with dynamic parallelism. For example, I have never seen __cudaDeviceSynchronizeDeprecationAvoidance before.

We should avoid syncing for CDP + par_nosync use cases. The "DeprecationAvoidance" function just supports some testing use cases; you can safely ignore it -- just call the cuda::detail::device_synchronize() method and it should do the right thing.

However, I got it working and also found a bug. Synchronization can't be skipped when followed by a call to get_value. While stream ordering ensures on the host that the result has been calculated before the transfer, that may not be the case on the device, since get_value there only dereferences a pointer. So I reverted to the usual synchronization method for the corresponding algorithms.

This of course means that there is an "unnecessary" synchronization call in the host path. One could instead add synchronization in the device path of get_value before dereferencing, but I cannot assess the implications of this, so I am leaving it as is.

Good catch! I think most folks are interested in using this policy with thrust::transform, so this workaround sounds good to me. We can look at further optimizations later if needed.

Is there documentation for the usual thrust::cuda::par which I could use as reference?

The existing execution policies are documented here: https://github.com/NVIDIA/thrust/blob/main/thrust/execution_policy.h

I think it would be sufficient to just say that par_nosync behaves like par, but with some exceptions, and list out the ways in which it differs. Consider adding a new example that demonstrates how to use this safely, too; that would be very useful.

@alliepiper
Collaborator

The existing execution policies are documented here

Just to be clear, we don't need to expose the par_nosync policy in thrust/execution_policy.h, it can stay in thrust/system/cuda/... and just needs a doxygen comment.
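A doxygen comment along these lines might work (a sketch only; the wording below is an assumption, not the documentation that was merged):

```cpp
/*! \p thrust::cuda::par_nosync is a parallel execution policy targeting
 *  Thrust's CUDA device backend. It behaves like \p thrust::cuda::par,
 *  except that algorithms may return to the caller before the enqueued
 *  work completes, skipping the blocking stream synchronization normally
 *  performed at the end of each algorithm. The caller is responsible for
 *  synchronizing (e.g. with \p cudaStreamSynchronize) before reading any
 *  results. Algorithms that return a value to the caller still
 *  synchronize internally.
 */
```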

@alliepiper
Collaborator

LGTM -- thanks for submitting this! I tested it out locally and it works nicely.

I'll start CI and get this merged in the next week or so.

DVS CL: 30735025

run tests

@alliepiper alliepiper added testing: gpuCI in progress Started gpuCI testing. testing: internal ci in progress Currently testing on internal NVIDIA CI (DVS). labels Dec 3, 2021
@alliepiper
Collaborator

Rebased to resolve conflicts -- this is ready to merge! Thanks again @fkallen!

@alliepiper alliepiper merged commit a462ff6 into NVIDIA:main Dec 14, 2021
@jedbrown

Thank you!

petscbot pushed a commit to petsc/petsc that referenced this pull request Jan 4, 2022
Version 1.16 of Thrust adds the policy thrust::cuda::par_nosync, which
can be combined with a stream via .on(stream) and does not synchronize,
thus preventing a stall in which the CPU waits to learn that a kernel
has completed before launching its next operation.

NVIDIA/thrust#1568

This feature (not blocking for kernels that don't need to) had been
removed (breaking change) in Thrust-1.9.4 to simplify error handling
behavior and because a futures-based async interface had been deemed
sufficient. This issue describes the history and rationale for the new
par_nosync feature.

NVIDIA/thrust#1515
@upsj upsj mentioned this pull request Aug 3, 2022