pytorch · yeounoh · Jan 19, 2024 · Jan 18, 2024
diff --git a/test/pjrt/test_collective_ops_tpu.py b/test/pjrt/test_collective_ops_tpu.py
@@ -38,11 +38,13 @@ def _all_reduce(pin_layout):
     device = xm.xla_device()
     # Prevent 0 and 1 from being converted to constants
     ordinal = xm.send_cpu_data_to_device(
-        torch.tensor(xm.get_ordinal()), device=device)
+        torch.tensor(xm.get_ordinal(), dtype=torch.float32, requires_grad=True),
+        device=device)
     out = xm.all_reduce(xm.REDUCE_SUM, ordinal, pin_layout=pin_layout)[0]
+    assert out.requires_grad
     xm.mark_step()
 
-    return out.cpu().numpy()
+    return out.cpu().detach().numpy()
 
   @parameterized.named_parameters(('pinned', True), ('unpinned', False))
   def test_all_reduce(self, pin_layout):

diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp
@@ -1215,7 +1215,8 @@ void InitXlaModuleBindings(py::module m) {
       NoGilSection nogil;
       result = AllReduce(reduce_type, input, scale, replica_groups, pin_layout);
     }
-    return result;
+    return torch::autograd::make_variable(
+        result, /*requires_grad=*/input.requires_grad());
   });
   m.def("_xla_quantize_tensor",
         [](const at::Tensor& input, const std::vector<float>& scale_list,