taichi-dev · ailzhang · Dec 29, 2021 · Dec 27, 2021 · Dec 27, 2021 · Dec 29, 2021
diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp
@@ -531,6 +531,7 @@ class KernelGen : public IRVisitor {
       used.buf_args = true;
       used.int32 = true;
       std::string var_name = fmt::format("_s{}_{}{}", i, "arr", arg_id);
+
       if (!loaded_args_.count(var_name)) {
         emit("int {} = _args_i32_[{} + {} * {} + {}];", var_name,
              taichi_opengl_extra_args_base / sizeof(int), arg_id,
@@ -964,7 +965,17 @@ class KernelGen : public IRVisitor {
       gen->emit("}}");
     }
   };
-
+  void gen_array_range(Stmt *stmt) {
+    if (auto val = stmt->cast<ExternalTensorShapeAlongAxisStmt>()) {
+      val->accept(this);
+    } else {
+      TI_ASSERT(stmt->is<BinaryOpStmt>());
+      auto bop = stmt->cast<BinaryOpStmt>();
+      gen_array_range(bop->lhs);
+      gen_array_range(bop->rhs);
+      bop->accept(this);
+    }
+  }
   void generate_range_for_kernel(OffloadedStmt *stmt) {
     TI_ASSERT(stmt->task_type == OffloadedStmt::TaskType::range_for);
     const std::string glsl_kernel_name = make_kernel_name();
@@ -1005,13 +1016,22 @@ class KernelGen : public IRVisitor {
       stmt->body->accept(this);
     } else {
       ScopedIndent _s(line_appender_);
-      emit("// range known at runtime");
-      auto begin_expr = stmt->const_begin ? std::to_string(stmt->begin_value)
-                                          : fmt::format("_gtmp_i32_[{} >> 2]",
-                                                        stmt->begin_offset);
-      auto end_expr = stmt->const_end ? std::to_string(stmt->end_value)
-                                      : fmt::format("_gtmp_i32_[{} >> 2]",
-                                                    stmt->end_offset);
+      std::string begin_expr, end_expr;
+      if (stmt->end_stmt) {
+        emit("// range from args buffer");
+        TI_ASSERT(stmt->const_begin);
+        begin_expr = std::to_string(stmt->begin_value);
+        gen_array_range(stmt->end_stmt);
+        end_expr = stmt->end_stmt->short_name();
+      } else {
+        emit("// range known at runtime");
+        begin_expr = stmt->const_begin ? std::to_string(stmt->begin_value)
+                                       : fmt::format("_gtmp_i32_[{} >> 2]",
+                                                     stmt->begin_offset);
+        end_expr = stmt->const_end
+                       ? std::to_string(stmt->end_value)
+                       : fmt::format("_gtmp_i32_[{} >> 2]", stmt->end_offset);
+      }
       workgroup_size_ = stmt->block_dim;
       num_workgroups_ = stmt->grid_dim;
       emit("int _beg = {}, _end = {};", begin_expr, end_expr);

diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp
@@ -240,15 +240,17 @@ RangeForStmt::RangeForStmt(Stmt *begin,
                            int bit_vectorize,
                            int num_cpu_threads,
                            int block_dim,
-                           bool strictly_serialized)
+                           bool strictly_serialized,
+                           bool end_is_array_axis)
     : begin(begin),
       end(end),
       body(std::move(body)),
       vectorize(vectorize),
       bit_vectorize(bit_vectorize),
       num_cpu_threads(num_cpu_threads),
       block_dim(block_dim),
-      strictly_serialized(strictly_serialized) {
+      strictly_serialized(strictly_serialized),
+      end_is_array_axis(end_is_array_axis) {
   reversed = false;
   this->body->parent_stmt = this;
   TI_STMT_REG_FIELDS;

diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
@@ -378,6 +378,10 @@ class ExternalTensorShapeAlongAxisStmt : public Stmt {
 
   ExternalTensorShapeAlongAxisStmt(int axis, int arg_id);
 
+  bool has_global_side_effect() const override {
+    return false;
+  }
+
   TI_STMT_DEF_FIELDS(ret_type, axis, arg_id);
   TI_DEFINE_ACCEPT_AND_CLONE
 };
@@ -729,6 +733,7 @@ class RangeForStmt : public Stmt {
   int num_cpu_threads;
   int block_dim;
   bool strictly_serialized;
+  bool end_is_array_axis{false};
 
   RangeForStmt(Stmt *begin,
                Stmt *end,
@@ -737,7 +742,8 @@ class RangeForStmt : public Stmt {
                int bit_vectorize,
                int num_cpu_threads,
                int block_dim,
-               bool strictly_serialized);
+               bool strictly_serialized,
+               bool end_is_array_axis = false);
 
   bool is_container_statement() const override {
     return true;
@@ -756,7 +762,8 @@ class RangeForStmt : public Stmt {
                      bit_vectorize,
                      num_cpu_threads,
                      block_dim,
-                     strictly_serialized);
+                     strictly_serialized,
+                     end_is_array_axis);
   TI_DEFINE_ACCEPT
 };
 
@@ -1123,6 +1130,7 @@ class OffloadedStmt : public Stmt {
   int block_dim{1};
   bool reversed{false};
   int num_cpu_threads{1};
+  Stmt *end_stmt{nullptr};
 
   mesh::Mesh *mesh{nullptr};
   mesh::MeshElementType major_from_type;

diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp
@@ -549,6 +549,9 @@ class IRPrinter : public IRVisitor {
       }
       if (stmt->const_end) {
         end_str = std::to_string(stmt->end_value);
+      } else if (stmt->end_stmt && !stmt->end_stmt->is<ConstStmt>()) {
+        // range_for end is a non-const stmt (e.g. ndarray axis)
+        end_str = stmt->end_stmt->name();
       } else {
         end_str = fmt::format("tmp(offset={}B)", stmt->end_offset);
       }

diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp
@@ -343,10 +343,12 @@ class LowerAST : public IRVisitor {
       for (int i = 0; i < (int)shape.size(); i++) {
         end = fctx.push_back<BinaryOpStmt>(BinaryOpType::mul, end, shape[i]);
       }
+      // TODO: add a note explaining why shape might be empty.
       auto &&new_for = std::make_unique<RangeForStmt>(
           begin, end, std::move(stmt->body), stmt->vectorize,
           stmt->bit_vectorize, stmt->num_cpu_threads, stmt->block_dim,
-          stmt->strictly_serialized);
+          stmt->strictly_serialized,
+          /*end_is_array_axis=*/!end->is<ConstStmt>());
       VecStatement new_statements;
       Stmt *loop_index =
           new_statements.push_back<LoopIndexStmt>(new_for.get(), 0);

diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp
@@ -85,19 +85,35 @@ class Offloader {
         } else {
           offloaded->block_dim = s->block_dim;
         }
-        if (auto val = s->begin->cast<ConstStmt>()) {
+
+        // TODO: We need to update codegen for each backend gradually so let's
+        // limit it to opengl backend for now.
+        if (arch == Arch::opengl && s->end_is_array_axis) {
+          // range of array must begin with 0.
+          auto begin = s->begin->cast<ConstStmt>();
+          TI_ASSERT(begin && begin->val[0].val_int32() == 0);
           offloaded->const_begin = true;
-          offloaded->begin_value = val->val[0].val_int32();
-        } else {
-          offloaded_ranges.begin_stmts.insert(
-              std::make_pair(offloaded.get(), s->begin));
-        }
-        if (auto val = s->end->cast<ConstStmt>()) {
-          offloaded->const_end = true;
-          offloaded->end_value = val->val[0].val_int32();
-        } else {
+          offloaded->begin_value = 0;
+
+          offloaded->end_stmt =
+              clone_and_replace_ext_axis(s->end, offloaded.get(), s);
           offloaded_ranges.end_stmts.insert(
-              std::make_pair(offloaded.get(), s->end));
+              std::make_pair(offloaded.get(), offloaded->end_stmt));
+        } else {
+          if (auto val = s->begin->cast<ConstStmt>()) {
+            offloaded->const_begin = true;
+            offloaded->begin_value = val->val[0].val_int32();
+          } else {
+            offloaded_ranges.begin_stmts.insert(
+                std::make_pair(offloaded.get(), s->begin));
+          }
+          if (auto val = s->end->cast<ConstStmt>()) {
+            offloaded->const_end = true;
+            offloaded->end_value = val->val[0].val_int32();
+          } else {
+            offloaded_ranges.end_stmts.insert(
+                std::make_pair(offloaded.get(), s->end));
+          }
         }
         offloaded->num_cpu_threads =
             std::min(s->num_cpu_threads, config.cpu_max_num_threads);
@@ -140,6 +156,28 @@ class Offloader {
   }
 
  private:
+  static Stmt *clone_and_replace_ext_axis(Stmt *stmt,
+                                          OffloadedStmt *offloaded,
+                                          RangeForStmt *range_for) {
+    if (stmt->cast<ExternalTensorShapeAlongAxisStmt>()) {
+      auto new_stmt = stmt->clone();
+      auto new_stmt_ptr = new_stmt.get();
+      offloaded->body->insert(std::move(new_stmt));
+      replace_all_usages_with(range_for, stmt, new_stmt_ptr);
+      return new_stmt_ptr;
+    } else {
+      auto val = stmt->cast<BinaryOpStmt>();
+      TI_ASSERT(val && val->op_type == BinaryOpType::mul);
+      auto new_stmt = stmt->clone();
+      auto new_stmt_ptr = new_stmt.get();
+      auto new_val = new_stmt->cast<BinaryOpStmt>();
+      new_val->lhs = clone_and_replace_ext_axis(val->lhs, offloaded, range_for);
+      new_val->rhs = clone_and_replace_ext_axis(val->rhs, offloaded, range_for);
+      offloaded->body->insert(std::move(new_stmt));
+      replace_all_usages_with(range_for, stmt, new_stmt_ptr);
+      return new_stmt_ptr;
+    }
+  }
   static void emit_struct_for(StructForStmt *for_stmt,
                               Block *root_block,
                               const CompileConfig &config,
@@ -480,15 +518,20 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
                                         ->second];
       }
       if (!stmt->const_end) {
-        TI_ASSERT(offloaded_ranges_->end_stmts.find(stmt) !=
-                  offloaded_ranges_->end_stmts.end())
-        TI_ASSERT_INFO(local_to_global_offset_.find(
-                           offloaded_ranges_->end_stmts.find(stmt)->second) !=
-                           local_to_global_offset_.end(),
-                       "End fails.")
-        stmt->end_offset =
-            local_to_global_offset_[offloaded_ranges_->end_stmts.find(stmt)
-                                        ->second];
+        if (stmt->end_stmt) {
+          TI_ASSERT(stmt->const_begin);
+          stmt->end_offset = 0;
+        } else {
+          TI_ASSERT(offloaded_ranges_->end_stmts.find(stmt) !=
+                    offloaded_ranges_->end_stmts.end())
+          TI_ASSERT_INFO(local_to_global_offset_.find(
+                             offloaded_ranges_->end_stmts.find(stmt)->second) !=
+                             local_to_global_offset_.end(),
+                         "End fails.")
+          stmt->end_offset =
+              local_to_global_offset_[offloaded_ranges_->end_stmts.find(stmt)
+                                          ->second];
+        }
       }
     }
   }

diff --git a/tests/python/test_aot.py b/tests/python/test_aot.py
@@ -202,7 +202,7 @@ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),
 
 @ti.test(arch=ti.opengl)
 def test_opengl_exceed_max_ssbo():
-    # 7 ndarrays + gtmp + args > 8 (maximum allowed)
+    # 8 ndarrays + args > 8 (maximum allowed)
     n = 4
     density1 = ti.ndarray(dtype=ti.f32, shape=(n, n))
     density2 = ti.ndarray(dtype=ti.f32, shape=(n, n))
@@ -211,12 +211,13 @@ def test_opengl_exceed_max_ssbo():
     density5 = ti.ndarray(dtype=ti.f32, shape=(n, n))
     density6 = ti.ndarray(dtype=ti.f32, shape=(n, n))
     density7 = ti.ndarray(dtype=ti.f32, shape=(n, n))
+    density8 = ti.ndarray(dtype=ti.f32, shape=(n, n))
 
     @ti.kernel
     def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),
              density3: ti.any_arr(), density4: ti.any_arr(),
              density5: ti.any_arr(), density6: ti.any_arr(),
-             density7: ti.any_arr()):
+             density7: ti.any_arr(), density8: ti.any_arr()):
         for i, j in density1:
             density1[i, j] = d + 1
             density2[i, j] = d + 2
@@ -225,10 +226,11 @@ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),
             density5[i, j] = d + 5
             density6[i, j] = d + 6
             density7[i, j] = d + 7
+            density8[i, j] = d + 8
 
     with pytest.raises(RuntimeError):
         init(0, density1, density2, density3, density4, density5, density6,
-             density7)
+             density7, density8)
 
 
 @ti.test(arch=ti.opengl)