diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 632b579c55afa..7b997f2a58ae1 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -175,7 +175,7 @@ DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank]; auto self_counter_ptr = &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x]; - if constexpr (need_fence) { + if constexpr (false) { st_flag_release(peer_counter_ptr, val); while (ld_flag_acquire(self_counter_ptr) != val); } else { @@ -240,6 +240,17 @@ __global__ void __launch_bounds__(512, 1) tmps[i] = get_tmp_buf
(sg.signals[target]);
}
auto tmp_out = tmps[0];
+// Uncomment to populate cache of peer values, which are currently in a indeterminate state
+// for (int idx = tid; idx < largest_part; idx += stride) {
+// #pragma unroll
+// for (int i = 0; i < ngpus; i++) {
+// int gather_from_rank = ((rank + i) % ngpus);
+// if (gather_from_rank == ngpus - 1 || idx < part) {
+// int dst_idx = gather_from_rank * part + idx;
+// ((P*)result)[dst_idx] = tmps[i][idx];
+// }
+// }
+// }
multi_gpu_barrier (&tmp);
}
}
}