alibaba · eedalong · May 24, 2024 · Mar 14, 2024 · Mar 22, 2024 · May 20, 2024
diff --git a/.github/workflows/pytorch113_gpu.yml b/.github/workflows/pytorch113_gpu.yml
@@ -15,8 +15,8 @@ jobs:
       cuda_version: cu116
       runner_tag: gpu-a10
       remote_runtime_docker: bladedisc:latest-runtime-torch1.13.1-cu116
-      develop_base_image: nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04
-      runtime_base_image: nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04
+      develop_base_image: nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
+      runtime_base_image: nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
       extra_build_args: --build-arg PYTHON_VERSION=PYTHON3.8 --build-arg ENABLE_FIND_FASTEST_APT_SOURCE=OFF
       extra_envs: -e TORCH_BLADE_BUILD_TENSORRT_STATIC=OFF
           -e TORCH_BLADE_CI_BUILD_TORCH_VERSION=1.13.1+cu116

diff --git a/tao_compiler/mlir/disc/transforms/disc_op_schedule.cc b/tao_compiler/mlir/disc/transforms/disc_op_schedule.cc
@@ -354,7 +354,7 @@ class ScheduleGraph {
   explicit ScheduleGraph(std::vector<Operation*>& post_order_instructions,
                          LatencyEstimator* latency_estimator,
                          AsyncTracker* async_tracker) {
-    InitilizeGrpahTopology(post_order_instructions, latency_estimator,
+    InitilizeGraphTopology(post_order_instructions, latency_estimator,
                            async_tracker);
     InitializeGraphAnalysis(latency_estimator, async_tracker);
   }
@@ -497,7 +497,7 @@ class ScheduleGraph {
     }
   }
 
-  void InitilizeGrpahTopology(std::vector<Operation*>& post_order_instructions,
+  void InitilizeGraphTopology(std::vector<Operation*>& post_order_instructions,
                               LatencyEstimator* latency_estimator,
                               AsyncTracker* async_tracker) {
     original_order_ = post_order_instructions;

diff --git a/tao_compiler/mlir/disc/transforms/lhlo_legalize_roots_to_loops.cc b/tao_compiler/mlir/disc/transforms/lhlo_legalize_roots_to_loops.cc
@@ -5712,9 +5712,11 @@ struct DiscLhloLegalizeRootsToParallelLoops
     // TODO: We should put even single nodes into a fusion by fusion pass
     // Revisit this and walk lmhlo::FusionOp only after the revision done.
     func.walk([&](lmhlo::LmhloOp op) {
-      // Skip the embedded ops in lmhlo.fusion or lmhlo.reduce/scatter
+      // Skip lmhlo_disc.args_mutation itself, as well as ops embedded in
+      // another lmhlo op (e.g. lmhlo.fusion or lmhlo.reduce/scatter)
       lmhlo::LmhloOp parent = op->getParentOfType<lmhlo::LmhloOp>();
-      if (parent && !isa<lmhlo::FusionOp>(op)) {
+      if (isa<lmhlo_disc::ArgsMutationOp>(op) ||
+          parent && !isa<lmhlo::FusionOp>(op)) {
         return;
       }
       if (isFusionType<FusionType::kStitch>(op) &&

diff --git a/tao_compiler/mlir/disc/transforms/mhlo_placer.cc b/tao_compiler/mlir/disc/transforms/mhlo_placer.cc
@@ -418,7 +418,7 @@ void OpsPlacer::placeI32Ops() {
     if (isa<mhlo_disc::CustomCallV2Op>(op)) return;
 
     if (isa<mhlo::TupleOp, mhlo::GetTupleElementOp, mhlo::WhileOp, mhlo::IfOp,
-            mhlo::ReturnOp>(op)) {
+            mhlo::ReturnOp, mhlo_disc::ArgsMutationOp>(op)) {
       return;
     }
     // Skip the Op that is already placed on CPU