
Move the allocates of AoT codegen to be TVMBAWs #9065

Merged: 2 commits, Sep 24, 2021
7 changes: 6 additions & 1 deletion src/relay/backend/aot_executor_codegen.cc
@@ -625,8 +625,13 @@ class AOTExecutorCodegen : public MixedModeVisitor {
// Define the storage allocator ids
for (auto kv : storage_device_map_) {
for (auto sid : kv.second->storage_ids) {
// The buffer_var is created with the storage_scope "global.workspace" so that it is
// explicitly serviced by TVMBackendAllocWorkspace (TVMBAW) calls. The reasoning is that
// executor allocates should be serviced by TVMBAWs, since the data could be accessed by
// many devices and should not be lowered to the stack. For more details, see the
// discussion here: https://github.com/apache/tvm/issues/9022
te::Var buffer_var(MakeString("sid_", sid),
-                  PointerType(PrimType(DataType::Int(8)), "global"));
+                  PointerType(PrimType(DataType::Int(8)), "global.workspace"));
sids_table_[sid] = buffer_var;
}
}
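
For illustration, a rough Python-side sketch of the annotation this change attaches (not code from this PR; the sid_8 name and int8 dtype are arbitrary examples):

    import tvm

    # An AoT storage-id buffer variable: an int8 pointer whose storage scope is
    # tagged "global.workspace" rather than plain "global", so the TIR lowering
    # services it with TVMBackendAllocWorkspace instead of a stack allocation.
    ptr_ty = tvm.ir.PointerType(tvm.ir.PrimType("int8"), "global.workspace")
    sid_var = tvm.tir.Var("sid_8", ptr_ty)
    assert sid_var.type_annotation.storage_scope == "global.workspace"
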
7 changes: 6 additions & 1 deletion src/tir/transforms/lower_tvm_builtin.cc
@@ -113,9 +113,14 @@ class BuiltinLower : public StmtExprMutator {
op = stmt.as<AllocateNode>();
// Get constant allocation bound.
int64_t nbytes = GetVectorBytes(op->dtype);
// If the buffers are for the CPU, have "global" storage scope, and are smaller than the
// runtime::kMaxStackAlloca heuristic, they are not serviced with TVMBackendAllocWorkspace
// calls and are instead placed on the stack.
if (device_type_.defined()) {
if (const auto* dev_type = device_type_.as<IntImmNode>()) {
-  if (dev_type->value == kDLCPU) {
+  auto storage_scope = Downcast<PointerType>(op->buffer_var->type_annotation)->storage_scope;
+  if (dev_type->value == kDLCPU && storage_scope == "global") {
int32_t constant_size = op->constant_allocation_size();
if (constant_size > 0 && constant_size * nbytes < runtime::kMaxStackAlloca) {
return stmt;
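
To summarize the heuristic this hunk implements, a minimal Python sketch (the constant is an illustrative placeholder; the real kMaxStackAlloca value lives in TVM's runtime headers):

    K_MAX_STACK_ALLOCA = 1024  # illustrative placeholder, not TVM's real constant

    def stays_on_stack(device_is_cpu, storage_scope, constant_size, nbytes):
        # Only small, constant-sized CPU allocations in plain "global" scope
        # avoid TVMBackendAllocWorkspace; "global.workspace" buffers always
        # become TVMBAW calls.
        return (
            device_is_cpu
            and storage_scope == "global"
            and constant_size > 0
            and constant_size * nbytes < K_MAX_STACK_ALLOCA
        )

    # The AoT executor's sid buffers are tagged "global.workspace", so:
    assert not stays_on_stack(True, "global.workspace", 16, 4)
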
11 changes: 8 additions & 3 deletions src/tir/transforms/storage_rewrite.cc
@@ -478,6 +478,11 @@ class StoragePlanRewriter : public StmtExprMutator {
uint64_t bits_offset{0};
};

// Checks whether the storage_scope is specially tagged for a specific memory.
bool IsSpecialTaggedMemory(const StorageScope& scope) {
return scope.tag.length() != 0 && scope.tag != ".dyn" && scope.tag != ".workspace";
}
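
For reference, a Python sketch of the same predicate (assuming scope strings of the form "rank.tag", e.g. "global.texture"):

    def is_special_tagged_memory(scope):
        # "global" -> tag "", "global.workspace" -> tag ".workspace", etc.
        _, _, tag = scope.partition(".")
        tag = "." + tag if tag else ""
        return tag not in ("", ".dyn", ".workspace")

    assert is_special_tagged_memory("global.texture")
    assert not is_special_tagged_memory("global.workspace")
    assert not is_special_tagged_memory("global")
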

// Allocate entry of node.
// Event entry in liveness analysis
struct EventEntry {
@@ -516,7 +521,7 @@ class StoragePlanRewriter : public StmtExprMutator {
// try to find merge, for tagged memory
for (size_t i = 0; i < vec.size(); ++i) {
StorageEntry* e = vec[i];
-if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+if (IsSpecialTaggedMemory(e->scope)) {
ICHECK_NE(e->const_nbits, 0U) << "Special tagged memory must be const size";
for (size_t j = 0; j < i; ++j) {
if (e->scope == vec[j]->scope) {
@@ -550,7 +555,7 @@ class StoragePlanRewriter : public StmtExprMutator {
make_const(DataType::Int(32), 1), e->allocs[0]->extents);
e->new_alloc =
Allocate(e->alloc_var, alloc_type, {sz}, e->allocs[0]->condition, Evaluate(0));
-if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+if (IsSpecialTaggedMemory(e->scope)) {
MemoryInfo info = GetMemoryInfo(e->scope.to_string());
uint64_t total_elem = e->const_nbits / e->elem_type.bits();
ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
@@ -591,7 +596,7 @@ class StoragePlanRewriter : public StmtExprMutator {
combo_size = analyzer_.Simplify(combo_size);
e->new_alloc =
Allocate(e->alloc_var, alloc_type, {combo_size}, const_true(), Evaluate(0));
-if (e->scope.tag.length() != 0 && e->scope.tag != ".dyn") {
+if (IsSpecialTaggedMemory(e->scope)) {
MemoryInfo info = GetMemoryInfo(e->scope.to_string());
uint64_t total_elem = e->const_nbits / e->elem_type.bits();
ICHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
84 changes: 64 additions & 20 deletions tests/python/relay/aot/aot_test_utils.py
@@ -471,37 +471,67 @@ def extract_main_workspace_size_bytes(extract_dir):
    return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"]


-def compile_and_run(
+def compile_models(
    models: Union[List[AOTTestModel], AOTTestModel],
-   runner: AOTTestRunner,
    interface_api,
    use_unpacked_api,
-   debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    enable_op_fusion=True,
):
    """
-   This method verifies the generated source
+   This method generates runtime.Modules for the tests
    """

    base_target = "c -runtime=c --link-params --executor=aot"
    extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}"
    target = f"{base_target} {extra_target}"
-   cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "

    if not isinstance(models, list):
        models = [models]

-   # The calculated workspaces will not account for stack allocator tags used for debugging
-   if debug_calculated_workspaces:
-       cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    config = {"tir.disable_vectorize": True}
    if not enable_op_fusion:
        config["relay.FuseOps.max_depth"] = 1

    compiled_runtime_mods = list()
    for model in models:
        with tvm.transform.PassContext(opt_level=3, config=config):
            compiled_runtime_mods.append(
                tvm.relay.build(
                    model.module,
                    target,
                    target_host=target,
                    params=model.params,
                    mod_name=model.name,
                )
            )
    return compiled_runtime_mods


def run_and_check(
    models: Union[List[AOTTestModel], AOTTestModel],
    runner: AOTTestRunner,
    interface_api,
    compiled_runtime_mods: List[tvm.runtime.Module],
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
):
    """
    This method runs the original test data through the compiled
    runtime.Modules in the test runner and verifies the results.
    """

    if not isinstance(models, list):
        models = [models]

    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
    # The calculated workspaces will not account for stack allocator tags used for debugging
    if debug_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)
@@ -515,18 +545,9 @@ def compile_and_run(
    )

    workspace_bytes = 0
-   for model in models:
-       with tvm.transform.PassContext(opt_level=3, config=config):
-           lib = tvm.relay.build(
-               model.module,
-               target,
-               target_host=target,
-               params=model.params,
-               mod_name=model.name,
-           )

+   for runtime_module, model in zip(compiled_runtime_mods, models):
        tar_file = os.path.join(base_path, f"{model.name}.tar")
-       export_model_library_format(lib, tar_file)
+       export_model_library_format(runtime_module, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

@@ -592,6 +613,29 @@ def compile_and_run(
        assert AOT_SUCCESS_TOKEN in run_log.read()


def compile_and_run(
    models: Union[List[AOTTestModel], AOTTestModel],
    runner: AOTTestRunner,
    interface_api,
    use_unpacked_api,
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    enable_op_fusion=True,
):
    """This is a wrapper API that compiles and runs models as AoT tests."""
    compiled_runtime_mods = compile_models(
        models, interface_api, use_unpacked_api, workspace_byte_alignment, enable_op_fusion
    )
    run_and_check(
        models,
        runner,
        interface_api,
        compiled_runtime_mods,
        debug_calculated_workspaces,
        workspace_byte_alignment,
    )
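
A hedged usage sketch of the split API (relay_mod, inputs, and outputs are assumed to be defined by the caller; AOT_DEFAULT_RUNNER is assumed to be the default runner defined elsewhere in this file):

    model = AOTTestModel(module=relay_mod, inputs=inputs, outputs=outputs)
    # Compile once...
    compiled = compile_models(model, interface_api="c", use_unpacked_api=True)
    # ...optionally inspect the generated C source...
    source = compiled[0].lib.imported_modules[0].get_source()
    # ...then run against the reference outputs in the test harness.
    run_and_check(model, AOT_DEFAULT_RUNNER, "c", compiled)
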


def generate_ref_data(mod, input_data, params=None, target="llvm"):
    """Generate reference data through executing the relay module"""
    compile_engine.get().clear()
41 changes: 41 additions & 0 deletions tests/python/relay/aot/test_crt_aot.py
@@ -33,6 +33,7 @@
    generate_ref_data,
    convert_to_relay,
    compile_and_run,
+   compile_models,
    parametrize_aot_options,
)

@@ -643,5 +644,45 @@ def test_memory_planning(workspace_byte_alignment, main_workspace_size, sum_work
    )


def test_aot_codegen_backend_alloc_workspace_calls():
    """This test checks whether AoT lowering creates TVMBackendAllocWorkspace calls"""

    # The %data and %weight shapes in the following primitive Relay should create
    # small tensors that would get lowered to stack allocations in the CPU PrimFuncs.
    # However, the AoT executor codegen should retain them as TVMBAW calls.
    relay_mod = tvm.parser.fromtext(
        """
#[version = "0.0.5"]
def @main(%data: Tensor[(1, 4, 4, 4), float32], %weight: Tensor[(4, 4, 3, 3), float32], src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 4, 4, 4), float32] {
%0 = fn (%p02: Tensor[(1, 4, 4, 4), float32], Primitive=1, hash="9332b3872fb5292c", src_layout="NCHW", dst_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
layout_transform(%p02, src_layout="NCHW", dst_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
};
%1 = fn (%p03: Tensor[(4, 4, 3, 3), float32], Primitive=1, hash="9f0b2b8a24a4dab3", src_layout="OIHW", dst_layout="OIHW4i4o") -> Tensor[(1, 1, 3, 3, 4, 4), float32] {
layout_transform(%p03, src_layout="OIHW", dst_layout="OIHW4i4o") /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */
};
%2 = %0(%data) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
%3 = %1(%weight) /* ty=Tensor[(1, 1, 3, 3, 4, 4), float32] */;
%4 = fn (%p01: Tensor[(1, 1, 4, 4, 4), float32], %p1: Tensor[(1, 1, 3, 3, 4, 4), float32], out_layout="NCHW4c", kernel_layout="OIHW4i4o", Primitive=1, data_layout="NCHW4c") -> Tensor[(1, 1, 4, 4, 4), float32] {
nn.contrib_conv2d_NCHWc(%p01, %p1, padding=[1, 1, 1, 1], channels=4, kernel_size=[3, 3], data_layout="NCHW4c", kernel_layout="OIHW4i4o", out_layout="NCHW4c") /* ty=Tensor[(1, 1, 4, 4, 4), float32] */
};
%5 = %4(%2, %3) /* ty=Tensor[(1, 1, 4, 4, 4), float32] */;
%6 = fn (%p0: Tensor[(1, 1, 4, 4, 4), float32], Primitive=1, src_layout="NCHW4c", dst_layout="NCHW") -> Tensor[(1, 4, 4, 4), float32] {
layout_transform(%p0, src_layout="NCHW4c", dst_layout="NCHW") /* ty=Tensor[(1, 4, 4, 4), float32] */
};
%6(%5) /* ty=Tensor[(1, 4, 4, 4), float32] */
}
"""
)
compiled_runtime_modules = compile_models(

[Review thread]
Contributor: want to assert that all the tir.allocate nodes are correctly tagged somewhere?
Contributor (Author): Ah, it is a bit cumbersome to do that :). Instead I used Relay in primitive form, so it is clear that the main function should only have three allocates.
Contributor: @mbs-octoml @denise-k can we add a tracking/cleanup task to make this kind of assert easier to write? And flag to clean up this test?
Collaborator: @areusch roadmap item and task tracking have been created.

        AOTTestModel(module=relay_mod, inputs=None, outputs=None),
        "c",
        True,
    )
    source = compiled_runtime_modules[0].lib.imported_modules[0].get_source()

[Review thread]
Contributor: The AoT codegen for main ends up as an imported module? Naively I would expect the TVMBackendAllocWorkspace calls in the imported_modules list to be intra-op only, e.g. for the conv2d, not AoT main.
Contributor (Author): I can understand the reasoning, but the current flow creates per-target IRModules just before the runtime.Modules are created. Therefore all host_target (i.e. CPU) PrimFuncs end up in a single runtime.Module.
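
A short sketch of what that means in practice (illustrative, not part of the PR; type_key values depend on the target):

    factory = compiled_runtime_modules[0]
    # With the C runtime, the AoT main and all CPU operator PrimFuncs are
    # emitted into the same C-source module, reachable via imported_modules.
    for imported in factory.lib.imported_modules:
        print(imported.type_key)  # e.g. "c"
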

    # There should be three allocates created for the three primitive Relay
    # function calls in main for the above Relay snippet.
    assert source.count("TVMBackendAllocWorkspace") == 3


if __name__ == "__main__":
    sys.exit(pytest.main([__file__] + sys.argv[1:]))