Add IsVirtualDevice() and refactor #6726

Merged: 1 commit, Mar 13, 2024
6 changes: 6 additions & 0 deletions torch_xla/csrc/device.cpp

@@ -91,6 +91,12 @@ bool UseVirtualDevice(bool force_spmd) {
   return use_virtual_device;
 }
 
+bool IsVirtualDevice(const std::string& device) {
+  XlaDeviceType hw_type =
+      static_cast<XlaDeviceType>(ParseDeviceString(device).type());
+  return hw_type == XlaDeviceType::SPMD;
+}
+
 bool GetLockSpmdConfig() { return spmd_config_is_locked; }
 
 bool CheckTpuDevice(XlaDeviceType hw_type) {
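For orientation, here is a minimal standalone sketch of the behavior IsVirtualDevice() encodes. The real implementation goes through ParseDeviceString() and XlaDeviceType; the prefix check below is an illustrative simplification, assuming the virtual device is always addressed as "SPMD:<ordinal>":

// Standalone sketch, not the torch_xla implementation.
#include <cassert>
#include <string>

bool IsVirtualDeviceSketch(const std::string& device) {
  // The virtual SPMD device stands in for all physical devices, so the
  // check reduces to "does the device string name the SPMD device type?".
  return device.rfind("SPMD:", 0) == 0;
}

int main() {
  assert(IsVirtualDeviceSketch("SPMD:0"));   // the virtual device
  assert(!IsVirtualDeviceSketch("TPU:0"));   // a real device
  assert(!IsVirtualDeviceSketch("CUDA:1"));  // a real device
  return 0;
}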
3 changes: 3 additions & 0 deletions torch_xla/csrc/device.h

@@ -45,6 +45,9 @@ torch::lazy::BackendDevice GetVirtualDevice();
 // Optionally, `force_spmd` to set `use_virtual_device` to true.
 bool UseVirtualDevice(bool force_spmd = false);
 
+// Return true if `device` is of SPMD device type.
+bool IsVirtualDevice(const std::string& device);
+
 // Return true if the SPMD config can be switched. That is, no device has
 // been initialized, yet.
 bool GetLockSpmdConfig();
12 changes: 2 additions & 10 deletions torch_xla/csrc/init_python_bindings.cpp

@@ -1939,16 +1939,8 @@ void InitXlaModuleBindings(py::module m) {
         std::vector<std::string> sharding_specs;
         sharding_specs.reserve(tensors.size());
         for (const at::Tensor& tensor : tensors) {
-          XLATensorPtr xtensor = bridge::GetXlaTensor(tensor);
-          XLATensor::ShardingSpecPtr sharding_spec =
-              xtensor ? xtensor->sharding_spec() : nullptr;
-          if (sharding_spec != nullptr) {
-            sharding_specs.push_back(
-                xla::HloSharding::FromProto(sharding_spec->sharding)
-                    ->ToString());
-          } else {
-            sharding_specs.push_back("");
-          }
+          sharding_specs.push_back(
+              GetXLAShardingSpec(bridge::GetXlaTensor(tensor)));
         }
         return sharding_specs;
       });

Collaborator: What will GetXLAShardingSpec return when sharding_spec is null?

Contributor Author: An empty string (""), as expected.
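From the inline logic this call replaces, and the answer above, the helper presumably behaves like the following reconstruction. This is a sketch inferred from the removed lines, not copied from GetXLAShardingSpec's actual definition:

// Reconstructed sketch: the HloSharding string for a sharded tensor,
// "" when there is no XLA tensor or no sharding spec.
std::string GetXLAShardingSpecSketch(const XLATensorPtr& xtensor) {
  XLATensor::ShardingSpecPtr sharding_spec =
      xtensor ? xtensor->sharding_spec() : nullptr;
  if (sharding_spec == nullptr) {
    return "";  // unsharded -> empty string, matching the old else-branch
  }
  return xla::HloSharding::FromProto(sharding_spec->sharding)->ToString();
}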
4 changes: 1 addition & 3 deletions torch_xla/csrc/ops/device_data.h

@@ -36,12 +36,10 @@ class DeviceData : public XlaNode {
   // backend data with a partitioned one in the node operands. Note that
   // this is permitted only if the node holds a placeholder.
   void Assign(std::shared_ptr<torch::lazy::BackendData> data) {
-    // TODO(yeounoh) check if the existing data is a placeholder after we
-    // address the issue where some of the sync tensors spill with device node.
     XLA_CHECK(data->shape() == data_->shape())
         << "Shape mismatch: expected (" << data_->shape().to_string()
         << "), actual (" << data->shape().to_string() << ")";
-    data_ = data;
+    data_.reset(data.get());
   }
 
   static DeviceData* Cast(const torch::lazy::Node* node);

Comment on lines -44 to +42

Collaborator: What's the difference between these two?

Contributor Author: Better memory management; this refactoring reuses the existing data_ to hold the new data.
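To unpack the reviewer's question: the two lines differ in how std::shared_ptr ownership is tracked. Below is a self-contained illustration of the general semantics involved (plain C++, not torch_xla code; the no-op deleter exists only to keep the demo safe to run):

#include <iostream>
#include <memory>

struct Payload {
  int value = 0;
};

int main() {
  auto incoming = std::make_shared<Payload>();

  // `data_ = data;` -- copy assignment shares the existing control
  // block, so both pointers co-own the object and the count rises.
  std::shared_ptr<Payload> assigned = incoming;
  std::cout << incoming.use_count() << "\n";  // prints 2

  // `data_.reset(data.get());` -- reset with a raw pointer starts a
  // *new* control block for the same object. The no-op deleter makes
  // that safe here; without one, two independent owners would each try
  // to delete the Payload when they go out of scope.
  std::shared_ptr<Payload> reset_owner(incoming.get(),
                                       [](Payload*) { /* no-op */ });
  std::cout << incoming.use_count() << " "       // still 2
            << reset_owner.use_count() << "\n";  // 1: separate count
  return 0;
}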
11 changes: 7 additions & 4 deletions torch_xla/csrc/tensor_util.cpp

@@ -689,10 +689,13 @@ std::vector<torch::lazy::BackendDataPtr> CreateTensorsData(
     return {};
   }
 
-  // We assume that caller can't mix virtual device and real device.
-  if (devices[0] == "SPMD:0") {
-    // When running in SPMD mode, tensors here in the unsharded
-    // CreateTensorsData should be implicitly replicated to all devices.
+  // When running in SPMD mode, tensors here in the unsharded
+  // CreateTensorsData should be implicitly replicated to all devices.
+  if (IsVirtualDevice(devices[0])) {
+    XLA_CHECK(
+        std::all_of(devices.begin(), devices.end(),
+                    [&](const std::string& s) { return s == devices[0]; }))
+        << "can't mix virtual device and real device.";
 
     std::vector<std::string> local_devices =
         runtime::GetComputationClient()->GetLocalDevices();
     std::vector<runtime::ComputationClient::DataPtr> handles;
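The XLA_CHECK turns the old "we assume" comment into an enforced invariant: every requested device must be the same (virtual) device. A standalone sketch of the same std::all_of homogeneity check:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Mirrors the check in CreateTensorsData: every entry must equal
// devices[0], so virtual (SPMD) and real devices cannot be mixed.
bool AllSameDevice(const std::vector<std::string>& devices) {
  if (devices.empty()) return true;  // vacuously homogeneous
  return std::all_of(devices.begin(), devices.end(),
                     [&](const std::string& s) { return s == devices[0]; });
}

int main() {
  assert(AllSameDevice({"SPMD:0", "SPMD:0"}));  // all virtual: ok
  assert(!AllSameDevice({"SPMD:0", "TPU:0"}));  // mixed: rejected
  return 0;
}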
6 changes: 3 additions & 3 deletions torch_xla/csrc/xla_graph_executor.cpp

@@ -1296,9 +1296,9 @@ XLAGraphExecutor::BuildInputOutputAliases(
 }
 
 XLAGraphExecutor::CompilationResult XLAGraphExecutor::Compile(
-    const std::vector<XLATensorPtr>& tensors,
-    absl::Span<const std::string> devices, const SyncTensorCollection& coll,
-    PostOrderData* po_data, const std::vector<torch::lazy::Value>& ir_values) {
+    std::vector<XLATensorPtr>& tensors, absl::Span<const std::string> devices,
+    const SyncTensorCollection& coll, PostOrderData* po_data,
+    const std::vector<torch::lazy::Value>& ir_values) {
   tsl::profiler::TraceMe activity(
       [&] {
         return tsl::profiler::TraceMeEncode(
4 changes: 3 additions & 1 deletion torch_xla/csrc/xla_graph_executor.h

@@ -346,7 +346,9 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
   std::vector<size_t> SetBufferDonors(LoweringContext* lowering_ctx);
 
   // We don't use upstream Compile to have BuildInputOutputAliases.
-  CompilationResult Compile(const std::vector<XLATensorPtr>& tensors,
+  // TODO(yeounoh) auto-sharding can change tensors shardings, which needs to be
+  // accounted for in Dynamo integration.
+  CompilationResult Compile(std::vector<XLATensorPtr>& tensors,
                             absl::Span<const std::string> devices,
                             const SyncTensorCollection& coll,
                             PostOrderData* po_data,

Collaborator: I can see how this will become problematic for Dynamo, since in Dynamo we first dry-run the compilation and don't execute the compiled program. Please leave a TODO somewhere for the Dynamo integration with auto-sharding.

Contributor Author: Makes sense. I will also add a section in the design doc.

Contributor Author:

  // We don't use upstream Compile to have BuildInputOutputAliases.
  // TODO(yeounoh) auto-sharding can change tensors shardings, which needs to be
  // accounted for in Dynamo integration.
  CompilationResult Compile(std::vector<XLATensorPtr>& tensors,
                            absl::Span<const std::string> devices,
                            const SyncTensorCollection& coll,
                            PostOrderData* po_data,
                            const std::vector<torch::lazy::Value>& ir_values);
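Why the const had to go: with auto-sharding, compilation itself may rewrite each tensor's sharding, so the input vector is mutated even during a dry run. A self-contained sketch of that shape of API, using hypothetical Tensor and ChooseSharding names that are not torch_xla's:

#include <string>
#include <vector>

// Hypothetical stand-ins, not torch_xla types.
struct Tensor {
  std::string sharding;  // e.g. "replicated", "tiled"
};

std::string ChooseSharding(const Tensor&) {
  return "tiled";  // placeholder for an auto-sharding decision
}

// Sketch: a non-const `tensors` lets compilation record the shardings it
// picked -- exactly the side effect a Dynamo dry run must account for.
void CompileSketch(std::vector<Tensor>& tensors) {
  for (Tensor& t : tensors) {
    t.sharding = ChooseSharding(t);
  }
}

int main() {
  std::vector<Tensor> tensors(2);
  CompileSketch(tensors);  // tensors[i].sharding now set by "compilation"
  return 0;
}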