diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index f1398786b93b..deca3b5a4c5a 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -625,8 +625,13 @@ class AOTExecutorCodegen : public MixedModeVisitor {
     // Define the storage allocator ids
     for (auto kv : storage_device_map_) {
       for (auto sid : kv.second->storage_ids) {
+        // The buffer_var is created with the storage_scope "global.workspace" so that it is
+        // explicitly serviced by TVMBackendAllocWorkspace (TVMBAW) calls. The reasoning is that
+        // executor allocations should be serviced by TVMBAW calls, since the data could be
+        // accessed by many devices and should not be lowered to the stack. For more details,
+        // please refer to the discussion here: https://github.com/apache/tvm/issues/9022
         te::Var buffer_var(MakeString("sid_", sid),
-                           PointerType(PrimType(DataType::Int(8)), "global"));
+                           PointerType(PrimType(DataType::Int(8)), "global.workspace"));
         sids_table_[sid] = buffer_var;
       }
     }
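For reference, the scoped pointer created above can also be constructed from the Python API. A minimal sketch, assuming a TVM build where `tvm.ir.PointerType` accepts a `storage_scope` argument (the C++ above builds the equivalent `te::Var`):

```python
import tvm
from tvm import tir

# An int8 pointer annotated with the "global.workspace" storage scope,
# mirroring the sid_* buffer vars the AoT codegen now creates. The scope
# marks the allocation as one that must be serviced by
# TVMBackendAllocWorkspace rather than lowered to the stack.
sid_var = tir.Var("sid_0", tvm.ir.PointerType(tvm.ir.PrimType("int8"), "global.workspace"))
assert sid_var.type_annotation.storage_scope == "global.workspace"
```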
diff --git a/src/tir/transforms/lower_tvm_builtin.cc b/src/tir/transforms/lower_tvm_builtin.cc
index 99d71ebe15bd..062d67eef165 100644
--- a/src/tir/transforms/lower_tvm_builtin.cc
+++ b/src/tir/transforms/lower_tvm_builtin.cc
@@ -113,9 +113,14 @@ class BuiltinLower : public StmtExprMutator {
     op = stmt.as<AllocateNode>();
     // Get constant allocation bound.
     int64_t nbytes = GetVectorBytes(op->dtype);
+    // If the buffers are for the CPU, have "global" scope,
+    // and are smaller than the runtime::kMaxStackAlloca heuristic,
+    // they are not serviced with TVMBackendAllocWorkspace calls
+    // and are instead placed on the stack.
     if (device_type_.defined()) {
       if (const auto* dev_type = device_type_.as<IntImmNode>()) {
-        if (dev_type->value == kDLCPU) {
+        auto storage_scope = Downcast<PointerType>(op->buffer_var->type_annotation)->storage_scope;
+        if (dev_type->value == kDLCPU && storage_scope == "global") {
           int32_t constant_size = op->constant_allocation_size();
           if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) {
             return stmt;
diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc
index 592a6a33375e..409b7c262954 100644
--- a/src/tir/transforms/storage_rewrite.cc
+++ b/src/tir/transforms/storage_rewrite.cc
@@ -478,6 +478,11 @@ class StoragePlanRewriter : public StmtExprMutator {
     uint64_t bits_offset{0};
   };

+  // Checks whether the storage_scope is specially tagged for a specific memory.
+  bool IsSpecialTaggedMemory(const StorageScope& scope) {
+    return scope.tag.length() != 0 && scope.tag != ".dyn" && scope.tag != ".workspace";
+  }
+
   // Allocate entry of node.
   // Event entry in liveness analysis
   struct EventEntry {
@@ -516,7 +521,7 @@ class StoragePlanRewriter : public StmtExprMutator {
     // try to find merge, for tagged memory
     for (size_t i = 0; i < vec.size(); ++i) {
       StorageEntry* e = vec[i];
-      if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+      if (IsSpecialTaggedMemory(e->scope)) {
         ICHECK_NE(e->const_nbits, 0U) << "Special tagged memory must be const size";
         for (size_t j = 0; j < i; ++j) {
           if (e->scope == vec[j]->scope) {
@@ -550,7 +555,7 @@ class StoragePlanRewriter : public StmtExprMutator {
                             make_const(DataType::Int(32), 1), e->allocs[0]->extents);
         e->new_alloc =
             Allocate(e->alloc_var, alloc_type, {sz}, e->allocs[0]->condition, Evaluate(0));
-        if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+        if (IsSpecialTaggedMemory(e->scope)) {
           MemoryInfo info = GetMemoryInfo(e->scope.to_string());
           uint64_t total_elem = e->const_nbits / e->elem_type.bits();
           ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
@@ -591,7 +596,7 @@ class StoragePlanRewriter : public StmtExprMutator {
         combo_size = analyzer_.Simplify(combo_size);
         e->new_alloc = Allocate(e->alloc_var, alloc_type, {combo_size}, const_true(), Evaluate(0));
-        if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+        if (IsSpecialTaggedMemory(e->scope)) {
           MemoryInfo info = GetMemoryInfo(e->scope.to_string());
           uint64_t total_elem = e->const_nbits / e->elem_type.bits();
           ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
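The predicate reads more easily outside of the three call sites it replaces. A plain-Python restatement, purely illustrative (`is_special_tagged_memory` is not a TVM API):

```python
def is_special_tagged_memory(tag: str) -> bool:
    """Illustrative mirror of StoragePlanRewriter::IsSpecialTaggedMemory.

    A non-empty tag normally marks a special memory that must resolve to a
    MemoryInfo entry, e.g. ".texture" from a "global.texture" scope. The
    ".dyn" tag and the new ".workspace" tag are exempt from that requirement.
    """
    return len(tag) != 0 and tag not in (".dyn", ".workspace")


assert is_special_tagged_memory(".texture")
assert not is_special_tagged_memory(".workspace")  # serviced by TVMBAW calls
assert not is_special_tagged_memory("")  # plain "global" scope has no tag
```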
+ """ + + if not isinstance(models, list): + models = [models] + tmp_path = utils.tempdir() tmp_dir = tmp_path.temp_dir + cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " + # The calculated workspaces will not account for stack allocator tags used for debugging + if debug_calculated_workspaces: + cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK " + base_path = os.path.join(tmp_dir, "test") build_path = os.path.join(base_path, "build") os.makedirs(build_path, exist_ok=True) @@ -515,18 +545,9 @@ def compile_and_run( ) workspace_bytes = 0 - for model in models: - with tvm.transform.PassContext(opt_level=3, config=config): - lib = tvm.relay.build( - model.module, - target, - target_host=target, - params=model.params, - mod_name=model.name, - ) - + for runtime_module, model in zip(compiled_runtime_mods, models): tar_file = os.path.join(base_path, f"{model.name}.tar") - export_model_library_format(lib, tar_file) + export_model_library_format(runtime_module, tar_file) t = tarfile.open(tar_file) t.extractall(base_path) @@ -592,6 +613,29 @@ def compile_and_run( assert AOT_SUCCESS_TOKEN in run_log.read() +def compile_and_run( + models: Union[List[AOTTestModel], AOTTestModel], + runner: AOTTestRunner, + interface_api, + use_unpacked_api, + debug_calculated_workspaces=False, + workspace_byte_alignment=8, + enable_op_fusion=True, +): + """This is a wrapper API to compile and run models as test for AoT""" + compiled_runtime_mods = compile_models( + models, interface_api, use_unpacked_api, workspace_byte_alignment, enable_op_fusion + ) + run_and_check( + models, + runner, + interface_api, + compiled_runtime_mods, + debug_calculated_workspaces, + workspace_byte_alignment, + ) + + def generate_ref_data(mod, input_data, params=None, target="llvm"): """Generate reference data through executing the relay module""" compile_engine.get().clear() diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 73aa385161f6..9961cd567fbe 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -33,6 +33,7 @@ generate_ref_data, convert_to_relay, compile_and_run, + compile_models, parametrize_aot_options, ) @@ -643,5 +644,45 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work ) +def test_aot_codegen_backend_alloc_workspace_calls(): + """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls""" + + # The %data and %weight shapes in the following primitive Relay should create + # small tensors that would get lowered to stack allocations in the CPU PrimFuncs. 
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 73aa385161f6..9961cd567fbe 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -33,6 +33,7 @@
     generate_ref_data,
     convert_to_relay,
     compile_and_run,
+    compile_models,
     parametrize_aot_options,
 )

@@ -643,5 +644,45 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work
     )


+def test_aot_codegen_backend_alloc_workspace_calls():
+    """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls"""
+
+    # The %data and %weight shapes in the following primitive Relay should create
+    # small tensors that would get lowered to stack allocations in the CPU PrimFuncs.
+    # However, the AoT executor codegen should retain them as TVMBAW calls.
+    relay_mod = tvm.parser.fromtext(
+        """
+        #[version = "0.0.5"]
+        def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), float32], src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 4, 4, 4), float32] {
+            %0 = fn (%p02: Tensor[(1, 4, 4, 4), float32], Primitive=1, hash="9332b3872fb5292c", src_layout="NCHW", dst_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
+                layout_transform(%p02, src_layout="NCHW", dst_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
+            };
+            %1 = fn (%p03: Tensor[(4, 4, 3, 3), float32], Primitive=1, hash="9f0b2b8a24a4dab3", src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 1, 3, 3, 4, 4), float32] {
+                layout_transform(%p03, src_layout="OIHW", dst_layout="OIHW4i4o") /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */
+            };
+            %2 = %0(%data) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
+            %3 = %1(%weight) /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */;
+            %4 = fn (%p01: Tensor[(1, 1, 4, 4, 4), float32], %p1: Tensor[(1, 1, 3, 3, 4, 4), float32], out_layout="NCHW4c", kernel_layout="OIHW4i4o", Primitive=1, data_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
+                nn.contrib_conv2d_NCHWc(%p01, %p1, padding=[1, 1, 1, 1], channels=4, kernel_size=[3, 3], data_layout="NCHW4c", kernel_layout="OIHW4i4o", out_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
+            };
+            %5 = %4(%2, %3) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
+            %6 = fn (%p0: Tensor[(1, 1, 4, 4, 4), float32], Primitive=1, src_layout="NCHW4c", dst_layout="NCHW") -> Tensor[(1, 4, 4, 4), float32] {
+                layout_transform(%p0, src_layout="NCHW4c", dst_layout="NCHW") /* ty=Tensor[(1, 4, 4, 4), float32] */
+            };
+            %6(%5) /* ty=Tensor[(1, 4, 4, 4), float32] */
+        }
+        """
+    )
+    compiled_runtime_modules = compile_models(
+        AOTTestModel(module=relay_mod, inputs=None, outputs=None),
+        "c",
+        True,
+    )
+    source = compiled_runtime_modules[0].lib.imported_modules[0].get_source()
+    # There should be three allocates created for the three primitive Relay function
+    # calls in main for the above Relay snippet.
+    assert source.count("TVMBackendAllocWorkspace") == 3
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))