From 3fa80b3d593cf479567b7d24cc517e1a076c6e63 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Wed, 16 Aug 2023 20:22:22 +0000 Subject: [PATCH] [BE][PG NCCL] Improve input mismatch error msg (#107281) Test Plan: CI Differential Revision: D48363238 Pull Request resolved: https://github.com/pytorch/pytorch/pull/107281 Approved by: https://github.com/awgu, https://github.com/H-Huang, https://github.com/fegin --- torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index e6ed85c7574a67..d95b3ef6a49cd1 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1502,8 +1502,16 @@ std::vector<at::Tensor> flatten_for_scatter_gather( if (tensor_lists[i].size() != world_size * num_devices) { TORCH_CHECK( false, - "Tensor list input to scatter/gather must match number of collective" - " participants"); + c10::str( + "Tensor list input to scatter/gather must match number of collective participants ", + "but got ", + tensor_lists[i].size(), + " inputs", + " with world_size ", + world_size, + " and ", + num_devices, + " devices.")); } // Only check device match for the first tensor in the list; the call to