
Move the allocates of AoT codegen to be TVMBAWs #9065

Merged: 2 commits, Sep 24, 2021
7 changes: 6 additions & 1 deletion src/relay/backend/aot_executor_codegen.cc
@@ -625,8 +625,13 @@ class AOTExecutorCodegen : public MixedModeVisitor {
// Define the storage allocator ids
for (auto kv : storage_device_map_) {
for (auto sid : kv.second->storage_ids) {
// The buffer_var is created with the storage_scope "global.workspace" so that it is
// explicitly serviced by TVMBackendAllocWorkspace (TVMBAW) calls. The reasoning is that
// executor allocates should be serviced by TVMBAWs, since the data could be accessed by
// many devices and should not be lowered to the stack. For more details, see the
// discussion here: https://github.com/apache/tvm/issues/9022
te::Var buffer_var(MakeString("sid_", sid),
-                  PointerType(PrimType(DataType::Int(8)), "global"));
+                  PointerType(PrimType(DataType::Int(8)), "global.workspace"));
sids_table_[sid] = buffer_var;
}
}
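
For illustration, a rough Python-side sketch of the annotation this change attaches (not code from this PR; the sid_8 name and int8 dtype are arbitrary examples):

    import tvm

    # An AoT storage-id buffer variable: an int8 pointer whose storage scope is
    # tagged "global.workspace" rather than plain "global", so the TIR lowering
    # services it with TVMBackendAllocWorkspace instead of a stack allocation.
    ptr_ty = tvm.ir.PointerType(tvm.ir.PrimType("int8"), "global.workspace")
    sid_var = tvm.tir.Var("sid_8", ptr_ty)
    assert sid_var.type_annotation.storage_scope == "global.workspace"
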
7 changes: 6 additions & 1 deletion src/tir/transforms/lower_tvm_builtin.cc
@@ -113,9 +113,14 @@ class BuiltinLower : public StmtExprMutator {
op = stmt.as<AllocateNode>();
// Get constant allocation bound.
int64_t nbytes = GetVectorBytes(op->dtype);
// If the buffers are for the CPU, have "global" storage scope, and are smaller than the
// runtime::kMaxStackAlloca heuristic, they are not serviced with TVMBackendAllocWorkspace
// calls and are instead placed on the stack.
if (device_type_.defined()) {
if (const auto* dev_type = device_type_.as<IntImmNode>()) {
-  if (dev_type->value == kDLCPU) {
+  auto storage_scope = Downcast<PointerType>(op->buffer_var->type_annotation)->storage_scope;
+  if (dev_type->value == kDLCPU && storage_scope == "global") {
int32_t constant_size = op->constant_allocation_size();
if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) {
return stmt;
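
To summarize the heuristic this hunk implements, a minimal Python sketch (the constant is an illustrative placeholder; the real kMaxStackAlloca value lives in TVM's runtime headers):

    K_MAX_STACK_ALLOCA = 1024  # illustrative placeholder, not TVM's real constant

    def stays_on_stack(device_is_cpu, storage_scope, constant_size, nbytes):
        # Only small, constant-sized CPU allocations in plain "global" scope
        # avoid TVMBackendAllocWorkspace; "global.workspace" buffers always
        # become TVMBAW calls.
        return (
            device_is_cpu
            and storage_scope == "global"
            and constant_size > 0
            and constant_size * nbytes < K_MAX_STACK_ALLOCA
        )

    # The AoT executor's sid buffers are tagged "global.workspace", so:
    assert not stays_on_stack(True, "global.workspace", 16, 4)
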
11 changes: 8 additions & 3 deletions src/tir/transforms/storage_rewrite.cc
@@ -478,6 +478,11 @@ class StoragePlanRewriter : public StmtExprMutator {
uint64_t bits_offset{0};
};

// Checks whether the storage_scope is specially tagged for a specific memory.
bool IsSpecialTaggedMemory(const StorageScope& scope) {
return scope.tag.length() != 0 && scope.tag != ".dyn" && scope.tag != ".workspace";
}
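
For reference, a Python sketch of the same predicate (assuming scope strings of the form "rank.tag", e.g. "global.texture"):

    def is_special_tagged_memory(scope):
        # "global" -> tag "", "global.workspace" -> tag ".workspace", etc.
        _, _, tag = scope.partition(".")
        tag = "." + tag if tag else ""
        return tag not in ("", ".dyn", ".workspace")

    assert is_special_tagged_memory("global.texture")
    assert not is_special_tagged_memory("global.workspace")
    assert not is_special_tagged_memory("global")
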

// Allocate entry of node.
// Event entry in liveness analysis
struct EventEntry {
@@ -516,7 +521,7 @@ class StoragePlanRewriter : public StmtExprMutator {
// try to find merge, for tagged memory
for (size_t i = 0; i < vec.size(); ++i) {
StorageEntry* e = vec[i];
-if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+if (IsSpecialTaggedMemory(e->scope)) {
ICHECK_NE(e->const_nbits, 0U) << "Special tagged memory must be const size";
for (size_t j = 0; j < i; ++j) {
if (e->scope == vec[j]->scope) {
@@ -550,7 +555,7 @@ class StoragePlanRewriter : public StmtExprMutator {
make_const(DataType::Int(32), 1), e->allocs[0]->extents);
e->new_alloc =
Allocate(e->alloc_var, alloc_type, {sz}, e->allocs[0]->condition, Evaluate(0));
-if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+if (IsSpecialTaggedMemory(e->scope)) {
MemoryInfo info = GetMemoryInfo(e->scope.to_string());
uint64_t total_elem = e->const_nbits / e->elem_type.bits();
ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
@@ -591,7 +596,7 @@ class StoragePlanRewriter : public StmtExprMutator {
combo_size = analyzer_.Simplify(combo_size);
e->new_alloc =
Allocate(e->alloc_var, alloc_type, {combo_size}, const_true(), Evaluate(0));
-if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+if (IsSpecialTaggedMemory(e->scope)) {
MemoryInfo info = GetMemoryInfo(e->scope.to_string());
uint64_t total_elem = e->const_nbits / e->elem_type.bits();
ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
84 changes: 64 additions & 20 deletions tests/python/relay/aot/aot_test_utils.py
@@ -471,37 +471,67 @@ def extract_main_workspace_size_bytes(extract_dir):
    return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"]


-def compile_and_run(
+def compile_models(
    models: Union[List[AOTTestModel], AOTTestModel],
-   runner: AOTTestRunner,
    interface_api,
    use_unpacked_api,
-   debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    enable_op_fusion=True,
):
    """
-   This method verifies the generated source
+   This method generates runtime.Modules for the tests
    """

    base_target = "c -runtime=c --link-params --executor=aot"
    extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}"
    target = f"{base_target} {extra_target}"
-   cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "

    if not isinstance(models, list):
        models = [models]

-   # The calculated workspaces will not account for stack allocator tags used for debugging
-   if debug_calculated_workspaces:
-       cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    config = {"tir.disable_vectorize": True}
    if not enable_op_fusion:
        config["relay.FuseOps.max_depth"] = 1

    compiled_runtime_mods = list()
    for model in models:
        with tvm.transform.PassContext(opt_level=3, config=config):
            compiled_runtime_mods.append(
                tvm.relay.build(
                    model.module,
                    target,
                    target_host=target,
                    params=model.params,
                    mod_name=model.name,
                )
            )
    return compiled_runtime_mods


def run_and_check(
    models: Union[List[AOTTestModel], AOTTestModel],
    runner: AOTTestRunner,
    interface_api,
    compiled_runtime_mods: List[tvm.runtime.Module],
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
):
    """
    This method runs the original test data through the compiled
    runtime.Modules in the test runner and verifies the results.
    """

    if not isinstance(models, list):
        models = [models]

    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
    # The calculated workspaces will not account for stack allocator tags used for debugging
    if debug_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)
@@ -515,18 +545,9 @@ def compile_and_run(
    )

    workspace_bytes = 0
-   for model in models:
-       with tvm.transform.PassContext(opt_level=3, config=config):
-           lib = tvm.relay.build(
-               model.module,
-               target,
-               target_host=target,
-               params=model.params,
-               mod_name=model.name,
-           )

+   for runtime_module, model in zip(compiled_runtime_mods, models):
        tar_file = os.path.join(base_path, f"{model.name}.tar")
-       export_model_library_format(lib, tar_file)
+       export_model_library_format(runtime_module, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

@@ -592,6 +613,29 @@ def compile_and_run(
        assert AOT_SUCCESS_TOKEN in run_log.read()


def compile_and_run(
    models: Union[List[AOTTestModel], AOTTestModel],
    runner: AOTTestRunner,
    interface_api,
    use_unpacked_api,
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    enable_op_fusion=True,
):
    """This is a wrapper API that compiles and runs models as AoT tests."""
    compiled_runtime_mods = compile_models(
        models, interface_api, use_unpacked_api, workspace_byte_alignment, enable_op_fusion
    )
    run_and_check(
        models,
        runner,
        interface_api,
        compiled_runtime_mods,
        debug_calculated_workspaces,
        workspace_byte_alignment,
    )
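
A hedged usage sketch of the split API (relay_mod, inputs, and outputs are assumed to be defined by the caller; AOT_DEFAULT_RUNNER is assumed to be the default runner defined elsewhere in this file):

    model = AOTTestModel(module=relay_mod, inputs=inputs, outputs=outputs)
    # Compile once...
    compiled = compile_models(model, interface_api="c", use_unpacked_api=True)
    # ...optionally inspect the generated C source...
    source = compiled[0].lib.imported_modules[0].get_source()
    # ...then run against the reference outputs in the test harness.
    run_and_check(model, AOT_DEFAULT_RUNNER, "c", compiled)
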


def generate_ref_data(mod, input_data, params=None, target="llvm"):
    """Generate reference data through executing the relay module"""
    compile_engine.get().clear()
41 changes: 41 additions & 0 deletions tests/python/relay/aot/test_crt_aot.py
@@ -33,6 +33,7 @@
    generate_ref_data,
    convert_to_relay,
    compile_and_run,
+   compile_models,
    parametrize_aot_options,
)

@@ -643,5 +644,45 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work
    )


def test_aot_codegen_backend_alloc_workspace_calls():
    """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls"""

    # The %data and %weight shapes in the following primitive Relay should create
    # small tensors that would get lowered to stack allocations in the CPU PrimFuncs.
    # However, the AoT executor codegen should retain them as TVMBAW calls.
    relay_mod = tvm.parser.fromtext(
        """
#[version = "0.0.5"]
def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), float32], src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 4, 4, 4), float32] {
%0 = fn (%p02: Tensor[(1, 4, 4, 4), float32], Primitive=1, hash="9332b3872fb5292c", src_layout="NCHW", dst_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
layout_transform(%p02, src_layout="NCHW", dst_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
};
%1 = fn (%p03: Tensor[(4, 4, 3, 3), float32], Primitive=1, hash="9f0b2b8a24a4dab3", src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 1, 3, 3, 4, 4), float32] {
layout_transform(%p03, src_layout="OIHW", dst_layout="OIHW4i4o") /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */
};
%2 = %0(%data) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
%3 = %1(%weight) /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */;
%4 = fn (%p01: Tensor[(1, 1, 4, 4, 4), float32], %p1: Tensor[(1, 1, 3, 3, 4, 4), float32], out_layout="NCHW4c", kernel_layout="OIHW4i4o", Primitive=1, data_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
nn.contrib_conv2d_NCHWc(%p01, %p1, padding=[1, 1, 1, 1], channels=4, kernel_size=[3, 3], data_layout="NCHW4c", kernel_layout="OIHW4i4o", out_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
};
%5 = %4(%2, %3) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
%6 = fn (%p0: Tensor[(1, 1, 4, 4, 4), float32], Primitive=1, src_layout="NCHW4c", dst_layout="NCHW") -> Tensor[(1, 4, 4, 4), float32] {
layout_transform(%p0, src_layout="NCHW4c", dst_layout="NCHW") /* ty=Tensor[(1, 4, 4, 4), float32] */
};
%6(%5) /* ty=Tensor[(1, 4, 4, 4), float32] */
}
"""
)
compiled_runtime_modules = compile_models(

[Review thread]
Contributor: want to assert that all the tir.allocate nodes are correctly tagged somewhere?
Contributor (Author): Ah, it is a bit cumbersome to do that :). Instead I used Relay in primitive form, so it is clear that the main function should only have three allocates.
Contributor: @mbs-octoml @denise-k can we add a tracking/cleanup task to make this kind of assert easier to write? And flag to clean up this test?
Collaborator: @areusch roadmap item and task tracking have been created.

        AOTTestModel(module=relay_mod, inputs=None, outputs=None),
        "c",
        True,
    )
    source = compiled_runtime_modules[0].lib.imported_modules[0].get_source()

[Review thread]
Contributor: The AoT codegen for main ends up as an imported module? Naively I would expect the TVMBackendAllocWorkspace calls in the imported_modules list to be intra-op only, e.g. for the conv2d, not AoT main.
Contributor (Author): I can understand the reasoning, but the current flow creates per-target IRModules just before the runtime.Modules are created. Therefore all host_target (i.e. CPU) PrimFuncs end up in a single runtime.Module.
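
A short sketch of what that means in practice (illustrative, not part of the PR; type_key values depend on the target):

    factory = compiled_runtime_modules[0]
    # With the C runtime, the AoT main and all CPU operator PrimFuncs are
    # emitted into the same C-source module, reachable via imported_modules.
    for imported in factory.lib.imported_modules:
        print(imported.type_key)  # e.g. "c"
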

    # There should be three allocates created for the three primitive Relay
    # function calls in main for the above Relay snippet.
    assert source.count("TVMBackendAllocWorkspace") == 3


if __name__ == "__main__":
    sys.exit(pytest.main([__file__] + sys.argv[1:]))