From 4c3c1725a9992cf9d3822eaa66735bf50f378c6c Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 20 Jun 2023 12:06:50 -0700
Subject: [PATCH] Update DORT to follow PyTorch changes (#16394)

Fix #16355. The root cause change in PyTorch is
[#103302](https://github.com/pytorch/pytorch/pull/103302), which seems to
block calling `make_fx` inside a dynamo backend.

Changes:
1. Move the decomposition table to `register_backend.py`, so we no longer
   have to call `make_fx` inside DORT; that call now triggers a number of
   new exceptions.
2. Remove the FakeTensorProp-based shape inference, since the FX graph
   received from dynamo now contains all shapes.
3. Fix a macro bug so that DORT can build without CUDA.

Before (3):
```
#if defined(USE_CUDA) || defined(USE_ROCM)
virtual PhiloxGenerator& PhiloxGenerator__Default() = 0;
#ifdef ENABLE_TRAINING_TORCH_INTEROP
...
#endif
#endif
```
After (3):
```
#if defined(USE_CUDA) || defined(USE_ROCM)
virtual PhiloxGenerator& PhiloxGenerator__Default() = 0;
#endif
#ifdef ENABLE_TRAINING_TORCH_INTEROP
...
#endif
```
The latter looks better, since `ENABLE_TRAINING_TORCH_INTEROP` guards the
Python bridge code, not the random-number-generating `PhiloxGenerator` used
by kernels.
---
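A minimal usage sketch of DORT after this change, adapted from the example in
`register_backend.py`'s comment block; the import path assumes the
onnxruntime-training wheel layout and the model/shapes are illustrative only:
```
import torch
from onnxruntime.training.torchdynamo.register_backend import aot_ort

model = torch.nn.Linear(2, 2)
# aot_ort now hands ATEN2ATEN_DECOMP to aot_autograd, so the graph reaching
# OrtBackend.compile is already decomposed and no make_fx call is needed there.
compiled_model = torch._dynamo.optimize(aot_ort)(model)
result = compiled_model(torch.rand(2, 2, dtype=torch.float))
result.sum().backward()
```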
 .../shared_library/provider_interfaces.h           |  3 +--
 onnxruntime/core/session/provider_bridge_ort.cc    |  3 +--
 .../python/training/torchdynamo/ort_backend.py     | 15 +++------------
 .../training/torchdynamo/register_backend.py       |  6 ++++--
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index e24dcc08d6627..759fa22843ccd 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -924,8 +924,8 @@ struct ProviderHost {
 #endif
 
 #if defined(USE_CUDA) || defined(USE_ROCM)
-
   virtual PhiloxGenerator& PhiloxGenerator__Default() = 0;
+#endif
 
 #ifdef ENABLE_TRAINING_TORCH_INTEROP
   virtual void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) = 0;
@@ -940,7 +940,6 @@ struct ProviderHost {
   virtual language_interop_ops::torch::RefCountTracker& GetRefCountTrackerInstance() = 0;
   virtual void RefCountTracker__DumpDetails(const language_interop_ops::torch::RefCountTracker* p, const std::string& phase_name) = 0;
 #endif
-#endif
 
 #if defined(USE_CANN)
   virtual RandomGenerator& RandomGenerator__Default() = 0;
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index c79de8105c039..6b2fad8441d86 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -1068,8 +1068,8 @@ struct ProviderHostImpl : ProviderHost {
 #endif
 
 #if defined(USE_CUDA) || defined(USE_ROCM)
-
   PhiloxGenerator& PhiloxGenerator__Default() override { return PhiloxGenerator::Default(); }
+#endif
 
 #ifdef ENABLE_TRAINING_TORCH_INTEROP
   void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) override { p->PythonOpBase::Init(info); }
@@ -1092,7 +1092,6 @@ struct ProviderHostImpl : ProviderHost {
     return p->language_interop_ops::torch::RefCountTracker::DumpDetails(phase_name);
   }
 #endif
-#endif
 
 #if defined(USE_CANN)
   RandomGenerator& RandomGenerator__Default() override { return RandomGenerator::Default(); }
diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py
index 4f2ec745199b7..718ee84cf72a2 100644
--- a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py
+++ b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py
@@ -18,9 +18,7 @@
 import torch.onnx
 import torch.onnx._onnx_supported_ops
 from torch._decomp import decomposition_table
-from torch._dynamo.utils import detect_fake_mode
 from torch._subclasses.fake_tensor import FakeTensor
-from torch.fx.experimental.proxy_tensor import make_fx
 from torch.fx.passes.fake_tensor_prop import FakeTensorProp
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
 from torch.fx.passes.operator_support import OperatorSupport
@@ -182,8 +180,8 @@ def _get_support_dictionaries_and_decomposition_tables() -> (
 (
     _SUPPORT_DICT,
     _EXTRA_SUPPORT_DICT,
-    _ATEN2ATEN_DECOMP,
-    _ATEN2PRIM_DECOMP,
+    ATEN2ATEN_DECOMP,
+    ATEN2PRIM_DECOMP,
 ) = _get_support_dictionaries_and_decomposition_tables()
 
 
@@ -628,15 +626,8 @@ def compile(self, graph_module: torch.fx.GraphModule, args) -> torch.fx.GraphMod
         if graph_module in self._partitioner_cache:
             partitioned_prim_graph_module = self._partitioner_cache[graph_module]
         else:
-            prim_graph_module = make_fx(
-                graph_module, tracing_mode="fake", _allow_non_fake_inputs=True, decomposition_table=_ATEN2ATEN_DECOMP
-            )(*args)
+            prim_graph_module = graph_module
             # TODO(wechi): this is required for removing aten::_to_copy in _replace_to_copy_with_to.
-            # We need input and output tensors' devices to decide if aten::_to_copy is just a Cast.
-            fake_mode = detect_fake_mode(args)
-            if not fake_mode:
-                fake_mode = torch._subclasses.FakeTensorMode()
-            FakeTensorProp(prim_graph_module, mode=fake_mode).propagate(*args)
             _replace_to_copy_with_to(prim_graph_module)
             partitioner = CapabilityBasedPartitioner(
                 prim_graph_module, self._supported_ops, allows_single_node_partition=False
diff --git a/orttraining/orttraining/python/training/torchdynamo/register_backend.py b/orttraining/orttraining/python/training/torchdynamo/register_backend.py
index 6f6c0f6575b0b..ae9a1522a3547 100644
--- a/orttraining/orttraining/python/training/torchdynamo/register_backend.py
+++ b/orttraining/orttraining/python/training/torchdynamo/register_backend.py
@@ -6,7 +6,7 @@
 from functorch.compile import min_cut_rematerialization_partition
 from torch._dynamo.backends.common import aot_autograd
 
-from .ort_backend import OrtBackend
+from .ort_backend import ATEN2ATEN_DECOMP, OrtBackend
 
 # This should be the underlying compiler for ALL graphs if
 # the user uses ORT to accelerate PyTorch via Dynamo.
@@ -28,7 +28,9 @@
 # compiled_model = torch._dynamo.optimize(aot_ort)(model)
 # result = compiled_model(torch.rand(2, 2, dtype=torch.float)
 # result.sum().backward()
-aot_ort = aot_autograd(fw_compiler=DEFAULT_BACKEND, partition_fn=min_cut_rematerialization_partition)
+aot_ort = aot_autograd(
+    fw_compiler=DEFAULT_BACKEND, partition_fn=min_cut_rematerialization_partition, decompositions=ATEN2ATEN_DECOMP
+)
 
 # Declare ORT as a compiler in Dynamo for inference (i.e., when .backward is NOT called).
 #
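The key design point above is that the Aten-to-Aten decomposition table is now
applied by `aot_autograd` before `OrtBackend.compile` ever runs, rather than by
a `make_fx` call inside DORT. The sketch below is a hypothetical stand-alone
toy, not part of this patch: it wires `aot_autograd` the same way, but swaps in
a debug printer for `DEFAULT_BACKEND` and `core_aten_decompositions()` for
`ATEN2ATEN_DECOMP`, to show that the compiler already receives a decomposed
graph.
```
import torch
from functorch.compile import make_boxed_func, min_cut_rematerialization_partition
from torch._decomp import core_aten_decompositions
from torch._dynamo.backends.common import aot_autograd


def debug_compiler(gm: torch.fx.GraphModule, example_inputs):
    # By the time this runs, aot_autograd has already applied the decomposition
    # table, which is why OrtBackend.compile no longer calls make_fx itself.
    print(gm.graph)
    return make_boxed_func(gm.forward)


toy_backend = aot_autograd(
    fw_compiler=debug_compiler,
    partition_fn=min_cut_rematerialization_partition,
    decompositions=core_aten_decompositions(),
)

model = torch.nn.Linear(2, 2)
compiled_model = torch._dynamo.optimize(toy_backend)(model)
compiled_model(torch.rand(2, 2, dtype=torch.float)).sum().backward()
```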