diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh
index c8572ba5e0cb..8960798c7a0c 100644
--- a/src/operator/tensor/dot-inl.cuh
+++ b/src/operator/tensor/dot-inl.cuh
@@ -71,7 +71,7 @@ struct DotCsrDnsDnsVectorKernel {
     for (int j = low+lane; j < high; j+=32) {
       sum += data_l[j] * data_r[col_idx_l[j]*num_cols_r + kcol];
     }
-    vals[threadIdx.x] = sum;
+    vals[threadIdx.x] = sum; __syncwarp();
 
     // Parallel reduction in shared memory
     if (lane < 16) {vals[threadIdx.x] += vals[threadIdx.x+16];} __syncwarp();