diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh index c8572ba5e0cb..8960798c7a0c 100644 --- a/src/operator/tensor/dot-inl.cuh +++ b/src/operator/tensor/dot-inl.cuh @@ -71,7 +71,7 @@ struct DotCsrDnsDnsVectorKernel { for (int j = low+lane; j < high; j+=32) { sum += data_l[j] * data_r[col_idx_l[j]*num_cols_r + kcol]; } - vals[threadIdx.x] = sum; + vals[threadIdx.x] = sum; __syncwarp(); // Parallel reduction in shared memory if (lane < 16) {vals[threadIdx.x] += vals[threadIdx.x+16];} __syncwarp();