Move the allocates of AoT codegen to be TVMBAWs (apache#9065)
* Move the allocates of AoT codegen to be TVMBAWs

This commit changes aot_executor_codegen.cc so that tir.allocate
nodes are created with storage_scope = 'global.workspace'.
The lower_tvm_builtin pass is modified slightly to generate
TVMBackendAllocWorkspace (TVMBAW) calls for such allocations.

Change-Id: Iba4ba437c1431c5197bf11abb826e03807bbcf66

* Move the allocates of AoT codegen to be TVMBAWs

* Added more comments and descriptions
* Modified the test case to use primitive Relay

Change-Id: Ia18a169d94bded3f81af7b3081c7d1ac29c669bc
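
For context, the net effect on the AoT-generated C source is sketched below. This is a minimal illustration, not the literal generated code: the function name, buffer names, and sizes are hypothetical, and only the TVMBackendAllocWorkspace / TVMBackendFreeWorkspace signatures (from include/tvm/runtime/c_backend_api.h) are taken as given.

#include <tvm/runtime/c_backend_api.h>

/* Hypothetical AoT entry function. Before this change an intermediate
 * tensor could be lowered to a fixed stack array (int8_t sid_1[256]);
 * with storage_scope 'global.workspace' it is serviced by the workspace
 * allocator instead. */
int32_t aot_main(void* input, void* output) {
  /* dtype hints: code 0 (kDLInt), 8 bits, matching the i8 sid buffers. */
  void* sid_1 = TVMBackendAllocWorkspace(kDLCPU, 0, (uint64_t)256, 0, 8);
  if (sid_1 == NULL) { return -1; }
  /* ... calls into the fused operator PrimFuncs using sid_1 ... */
  if (TVMBackendFreeWorkspace(kDLCPU, 0, sid_1) != 0) { return -1; }
  return 0;
}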
manupak authored and ylc committed Sep 29, 2021
1 parent dbf6af6 commit f963b31
Showing 5 changed files with 125 additions and 25 deletions.
7 changes: 6 additions & 1 deletion src/relay/backend/aot_executor_codegen.cc
@@ -625,8 +625,13 @@ class AOTExecutorCodegen : public MixedModeVisitor {
// Define the storage allocator ids
for (auto kv : storage_device_map_) {
for (auto sid : kv.second->storage_ids) {
// The buffer_var is created with storage_scope 'global.workspace' so that it is serviced
// explicitly by TVMBackendAllocWorkspace (TVMBAW) calls. The reasoning is that executor
// allocations should be serviced by TVMBAWs, as the data could be accessed by many devices
// and therefore should not be lowered to the stack. For more details please refer to the
// discussion here: https://github.com/apache/tvm/issues/9022
te::Var buffer_var(MakeString("sid_", sid),
PointerType(PrimType(DataType::Int(8)), "global"));
PointerType(PrimType(DataType::Int(8)), "global.workspace"));
sids_table_[sid] = buffer_var;
}
}
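A condensed sketch of the round trip this hunk sets up, assuming the standard tvm/tir/var.h and tvm/ir/type.h headers: the scope string attached to the pointer type here is exactly what lower_tvm_builtin reads back (see the next file).

#include <tvm/ir/type.h>
#include <tvm/tir/var.h>

using namespace tvm;

// Codegen side: an i8 buffer var tagged for the workspace allocator.
tir::Var buffer_var("sid_1", PointerType(PrimType(DataType::Int(8)), "global.workspace"));

// Consumer side: recover the scope from the type annotation.
String scope = Downcast<PointerType>(buffer_var->type_annotation)->storage_scope;
// scope == "global.workspace" -> excluded from the stack-allocation fast path.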
7 changes: 6 additions & 1 deletion src/tir/transforms/lower_tvm_builtin.cc
@@ -113,9 +113,14 @@ class BuiltinLower : public StmtExprMutator {
op = stmt.as<AllocateNode>();
// Get constant allocation bound.
int64_t nbytes = GetVectorBytes(op->dtype);
// If the buffers are for the CPU, have 'global' storage scope and are
// smaller than the runtime::kMaxStackAlloca heuristic, they are not
// serviced with TVMBackendAllocWorkspace calls and are instead placed
// on the stack.
if (device_type_.defined()) {
if (const auto* dev_type = device_type_.as<IntImmNode>()) {
if (dev_type->value == kDLCPU) {
auto storage_scope = Downcast<PointerType>(op->buffer_var->type_annotation)->storage_scope;
if (dev_type->value == kDLCPU && storage_scope == "global") {
int32_t constant_size = op->constant_allocation_size();
if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) {
return stmt;
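Distilled, the CPU allocate handling after this change behaves as sketched below. This is a paraphrase of the hunk above, not the literal implementation, and it assumes runtime::kMaxStackAlloca keeps its value of 1024 bytes from include/tvm/runtime/device_api.h.

#include <dlpack/dlpack.h>
#include <cstdint>
#include <string>

// Returns true when the allocate is left as-is (and later lowered to a stack
// allocation); everything else -- including the AoT executor's
// 'global.workspace' buffers -- becomes a TVMBackendAllocWorkspace /
// TVMBackendFreeWorkspace pair.
bool StaysOnStack(int device_type, const std::string& storage_scope,
                  int64_t constant_size, int64_t elem_bytes) {
  return device_type == kDLCPU && storage_scope == "global" &&
         constant_size > 0 && constant_size * elem_bytes < 1024;
}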
11 changes: 8 additions & 3 deletions src/tir/transforms/storage_rewrite.cc
@@ -478,6 +478,11 @@ class StoragePlanRewriter : public StmtExprMutator {
uint64_t bits_offset{0};
};

// Checks whether the storage_scope is especially tagged for a specific memory.
bool IsSpecialTaggedMemory(const StorageScope& scope) {
return scope.tag.length() != 0 && scope.tag != ".dyn" && scope.tag != ".workspace";
}

// Allocate entry of node.
// Event entry in liveness analysis
struct EventEntry {
@@ -516,7 +521,7 @@ class StoragePlanRewriter : public StmtExprMutator {
// try to find merge, for tagged memory
for (size_t i = 0; i < vec.size(); ++i) {
StorageEntry* e = vec[i];
if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
if (IsSpecialTaggedMemory(e->scope)) {
ICHECK_NE(e->const_nbits, 0U) << "Special tagged memory must be const size";
for (size_t j = 0; j < i; ++j) {
if (e->scope == vec[j]->scope) {
@@ -550,7 +555,7 @@ class StoragePlanRewriter : public StmtExprMutator {
make_const(DataType::Int(32), 1), e->allocs[0]->extents);
e->new_alloc =
Allocate(e->alloc_var, alloc_type, {sz}, e->allocs[0]->condition, Evaluate(0));
if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
if (IsSpecialTaggedMemory(e->scope)) {
MemoryInfo info = GetMemoryInfo(e->scope.to_string());
uint64_t total_elem = e->const_nbits / e->elem_type.bits();
ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
@@ -591,7 +596,7 @@ class StoragePlanRewriter : public StmtExprMutator {
combo_size = analyzer_.Simplify(combo_size);
e->new_alloc =
Allocate(e->alloc_var, alloc_type, {combo_size}, const_true(), Evaluate(0));
if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
if (IsSpecialTaggedMemory(e->scope)) {
MemoryInfo info = GetMemoryInfo(e->scope.to_string());
uint64_t total_elem = e->const_nbits / e->elem_type.bits();
ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
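As a sanity check on the new ".workspace" case, a sketch assuming runtime::StorageScope from src/runtime/thread_storage_scope.h:

// 'global.workspace' parses into rank kGlobal plus the tag '.workspace', so
// IsSpecialTaggedMemory() returns false for it: the planner no longer demands
// a registered MemoryInfo (max_num_bits, etc.) for workspace buffers, while
// genuinely special tags still get the MemoryInfo size checks above.
tvm::runtime::StorageScope scope = tvm::runtime::StorageScope::Create("global.workspace");
// scope.tag == ".workspace"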
84 changes: 64 additions & 20 deletions tests/python/relay/aot/aot_test_utils.py
@@ -471,37 +471,67 @@ def extract_main_workspace_size_bytes(extract_dir):
return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"]


def compile_and_run(
def compile_models(
models: Union[List[AOTTestModel], AOTTestModel],
runner: AOTTestRunner,
interface_api,
use_unpacked_api,
debug_calculated_workspaces=False,
workspace_byte_alignment=8,
enable_op_fusion=True,
):
"""
This method verifies the generated source
This method generates runtime.Modules for the tests
"""

base_target = "c -runtime=c --link-params --executor=aot"
extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}"
target = f"{base_target} {extra_target}"
cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "

if not isinstance(models, list):
models = [models]

# The calculated workspaces will not account for stack allocator tags used for debugging
if debug_calculated_workspaces:
cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

config = {"tir.disable_vectorize": True}
if not enable_op_fusion:
config["relay.FuseOps.max_depth"] = 1

compiled_runtime_mods = list()
for model in models:
with tvm.transform.PassContext(opt_level=3, config=config):
compiled_runtime_mods.append(
tvm.relay.build(
model.module,
target,
target_host=target,
params=model.params,
mod_name=model.name,
)
)
return compiled_runtime_mods


def run_and_check(
models: Union[List[AOTTestModel], AOTTestModel],
runner: AOTTestRunner,
interface_api,
compiled_runtime_mods: List[tvm.runtime.Module],
debug_calculated_workspaces=False,
workspace_byte_alignment=8,
):
"""
This method runs the compiled runtime.Modules with the original test data
in the test runner and verifies the results.
"""

if not isinstance(models, list):
models = [models]

tmp_path = utils.tempdir()
tmp_dir = tmp_path.temp_dir

cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
# The calculated workspaces will not account for stack allocator tags used for debugging
if debug_calculated_workspaces:
cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

base_path = os.path.join(tmp_dir, "test")
build_path = os.path.join(base_path, "build")
os.makedirs(build_path, exist_ok=True)
@@ -515,18 +545,9 @@ def compile_and_run(
)

workspace_bytes = 0
for model in models:
with tvm.transform.PassContext(opt_level=3, config=config):
lib = tvm.relay.build(
model.module,
target,
target_host=target,
params=model.params,
mod_name=model.name,
)

for runtime_module, model in zip(compiled_runtime_mods, models):
tar_file = os.path.join(base_path, f"{model.name}.tar")
export_model_library_format(lib, tar_file)
export_model_library_format(runtime_module, tar_file)
t = tarfile.open(tar_file)
t.extractall(base_path)

@@ -592,6 +613,29 @@ def compile_and_run(
assert AOT_SUCCESS_TOKEN in run_log.read()


def compile_and_run(
models: Union[List[AOTTestModel], AOTTestModel],
runner: AOTTestRunner,
interface_api,
use_unpacked_api,
debug_calculated_workspaces=False,
workspace_byte_alignment=8,
enable_op_fusion=True,
):
"""This is a wrapper API to compile and run models as test for AoT"""
compiled_runtime_mods = compile_models(
models, interface_api, use_unpacked_api, workspace_byte_alignment, enable_op_fusion
)
run_and_check(
models,
runner,
interface_api,
compiled_runtime_mods,
debug_calculated_workspaces,
workspace_byte_alignment,
)


def generate_ref_data(mod, input_data, params=None, target="llvm"):
"""Generate reference data through executing the relay module"""
compile_engine.get().clear()
41 changes: 41 additions & 0 deletions tests/python/relay/aot/test_crt_aot.py
@@ -33,6 +33,7 @@
generate_ref_data,
convert_to_relay,
compile_and_run,
compile_models,
parametrize_aot_options,
)

@@ -643,5 +644,45 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work
)


def test_aot_codegen_backend_alloc_workspace_calls():
"""This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls"""

# The %data and %weight shapes in the following primitive Relay should create
# small tensors that would get lowered to stack allocations in the CPU PrimFuncs.
# However, the AoT executor codegen should retain them as TVMBAW calls.
relay_mod = tvm.parser.fromtext(
"""
#[version = "0.0.5"]
def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), float32], src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 4, 4, 4), float32] {
%0 = fn (%p02: Tensor[(1, 4, 4, 4), float32], Primitive=1, hash="9332b3872fb5292c", src_layout="NCHW", dst_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
layout_transform(%p02, src_layout="NCHW", dst_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
};
%1 = fn (%p03: Tensor[(4, 4, 3, 3), float32], Primitive=1, hash="9f0b2b8a24a4dab3", src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 1, 3, 3, 4, 4), float32] {
layout_transform(%p03, src_layout="OIHW", dst_layout="OIHW4i4o") /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */
};
%2 = %0(%data) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
%3 = %1(%weight) /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */;
%4 = fn (%p01: Tensor[(1, 1, 4, 4, 4), float32], %p1: Tensor[(1, 1, 3, 3, 4, 4), float32], out_layout="NCHW4c", kernel_layout="OIHW4i4o", Primitive=1, data_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
nn.contrib_conv2d_NCHWc(%p01, %p1, padding=[1, 1, 1, 1], channels=4, kernel_size=[3, 3], data_layout="NCHW4c", kernel_layout="OIHW4i4o", out_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
};
%5 = %4(%2, %3) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
%6 = fn (%p0: Tensor[(1, 1, 4, 4, 4), float32], Primitive=1, src_layout="NCHW4c", dst_layout="NCHW") -> Tensor[(1, 4, 4, 4), float32] {
layout_transform(%p0, src_layout="NCHW4c", dst_layout="NCHW") /* ty=Tensor[(1, 4, 4, 4), float32] */
};
%6(%5) /* ty=Tensor[(1, 4, 4, 4), float32] */
}
"""
)
compiled_runtime_modules = compile_models(
AOTTestModel(module=relay_mod, inputs=None, outputs=None),
"c",
True,
)
source = compiled_runtime_modules[0].lib.imported_modules[0].get_source()
# There should be three allocates created for the three primitive Relay
# function calls in the main function of the above Relay snippet.
assert source.count("TVMBackendAllocWorkspace") == 3


if __name__ == "__main__":
sys.exit(pytest.main([__file__] + sys.argv[1:]))
