Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[opengl] Optimize range_for for ndarrays #3884

Merged
merged 4 commits into from
Dec 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 28 additions & 8 deletions taichi/backends/opengl/codegen_opengl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ class KernelGen : public IRVisitor {
used.buf_args = true;
used.int32 = true;
std::string var_name = fmt::format("_s{}_{}{}", i, "arr", arg_id);

if (!loaded_args_.count(var_name)) {
emit("int {} = _args_i32_[{} + {} * {} + {}];", var_name,
taichi_opengl_extra_args_base / sizeof(int), arg_id,
Expand Down Expand Up @@ -964,7 +965,17 @@ class KernelGen : public IRVisitor {
gen->emit("}}");
}
};

void gen_array_range(Stmt *stmt) {
if (auto val = stmt->cast<ExternalTensorShapeAlongAxisStmt>()) {
val->accept(this);
} else {
TI_ASSERT(stmt->is<BinaryOpStmt>());
auto bop = stmt->cast<BinaryOpStmt>();
gen_array_range(bop->lhs);
gen_array_range(bop->rhs);
bop->accept(this);
}
}
void generate_range_for_kernel(OffloadedStmt *stmt) {
TI_ASSERT(stmt->task_type == OffloadedStmt::TaskType::range_for);
const std::string glsl_kernel_name = make_kernel_name();
Expand Down Expand Up @@ -1005,13 +1016,22 @@ class KernelGen : public IRVisitor {
stmt->body->accept(this);
} else {
ScopedIndent _s(line_appender_);
emit("// range known at runtime");
auto begin_expr = stmt->const_begin ? std::to_string(stmt->begin_value)
: fmt::format("_gtmp_i32_[{} >> 2]",
stmt->begin_offset);
auto end_expr = stmt->const_end ? std::to_string(stmt->end_value)
: fmt::format("_gtmp_i32_[{} >> 2]",
stmt->end_offset);
std::string begin_expr, end_expr;
if (stmt->end_stmt) {
emit("// range from args buffer");
TI_ASSERT(stmt->const_begin);
begin_expr = std::to_string(stmt->begin_value);
gen_array_range(stmt->end_stmt);
end_expr = stmt->end_stmt->short_name();
} else {
emit("// range known at runtime");
begin_expr = stmt->const_begin ? std::to_string(stmt->begin_value)
: fmt::format("_gtmp_i32_[{} >> 2]",
stmt->begin_offset);
end_expr = stmt->const_end
? std::to_string(stmt->end_value)
: fmt::format("_gtmp_i32_[{} >> 2]", stmt->end_offset);
}
workgroup_size_ = stmt->block_dim;
num_workgroups_ = stmt->grid_dim;
emit("int _beg = {}, _end = {};", begin_expr, end_expr);
Expand Down
6 changes: 4 additions & 2 deletions taichi/ir/statements.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,15 +240,17 @@ RangeForStmt::RangeForStmt(Stmt *begin,
int bit_vectorize,
int num_cpu_threads,
int block_dim,
bool strictly_serialized)
bool strictly_serialized,
bool end_is_array_axis)
: begin(begin),
end(end),
body(std::move(body)),
vectorize(vectorize),
bit_vectorize(bit_vectorize),
num_cpu_threads(num_cpu_threads),
block_dim(block_dim),
strictly_serialized(strictly_serialized) {
strictly_serialized(strictly_serialized),
end_is_array_axis(end_is_array_axis) {
reversed = false;
this->body->parent_stmt = this;
TI_STMT_REG_FIELDS;
Expand Down
12 changes: 10 additions & 2 deletions taichi/ir/statements.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,10 @@ class ExternalTensorShapeAlongAxisStmt : public Stmt {

ExternalTensorShapeAlongAxisStmt(int axis, int arg_id);

bool has_global_side_effect() const override {
return false;
}

TI_STMT_DEF_FIELDS(ret_type, axis, arg_id);
TI_DEFINE_ACCEPT_AND_CLONE
};
Expand Down Expand Up @@ -729,6 +733,7 @@ class RangeForStmt : public Stmt {
int num_cpu_threads;
int block_dim;
bool strictly_serialized;
bool end_is_array_axis{false};

RangeForStmt(Stmt *begin,
Stmt *end,
Expand All @@ -737,7 +742,8 @@ class RangeForStmt : public Stmt {
int bit_vectorize,
int num_cpu_threads,
int block_dim,
bool strictly_serialized);
bool strictly_serialized,
bool end_is_array_axis = false);

bool is_container_statement() const override {
return true;
Expand All @@ -756,7 +762,8 @@ class RangeForStmt : public Stmt {
bit_vectorize,
num_cpu_threads,
block_dim,
strictly_serialized);
strictly_serialized,
end_is_array_axis);
TI_DEFINE_ACCEPT
};

Expand Down Expand Up @@ -1123,6 +1130,7 @@ class OffloadedStmt : public Stmt {
int block_dim{1};
bool reversed{false};
int num_cpu_threads{1};
Stmt *end_stmt{nullptr};

mesh::Mesh *mesh{nullptr};
mesh::MeshElementType major_from_type;
Expand Down
3 changes: 3 additions & 0 deletions taichi/transforms/ir_printer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,9 @@ class IRPrinter : public IRVisitor {
}
if (stmt->const_end) {
end_str = std::to_string(stmt->end_value);
} else if (stmt->end_stmt && !stmt->end_stmt->is<ConstStmt>()) {
// range_for end is a non-const stmt (e.g. ndarray axis)
end_str = stmt->end_stmt->name();
} else {
end_str = fmt::format("tmp(offset={}B)", stmt->end_offset);
}
Expand Down
4 changes: 3 additions & 1 deletion taichi/transforms/lower_ast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,10 +343,12 @@ class LowerAST : public IRVisitor {
for (int i = 0; i < (int)shape.size(); i++) {
end = fctx.push_back<BinaryOpStmt>(BinaryOpType::mul, end, shape[i]);
}
// TODO: add a note explaining why shape might be empty.
auto &&new_for = std::make_unique<RangeForStmt>(
begin, end, std::move(stmt->body), stmt->vectorize,
stmt->bit_vectorize, stmt->num_cpu_threads, stmt->block_dim,
stmt->strictly_serialized);
stmt->strictly_serialized,
/*end_is_array_axis=*/!end->is<ConstStmt>());
VecStatement new_statements;
Stmt *loop_index =
new_statements.push_back<LoopIndexStmt>(new_for.get(), 0);
Expand Down
83 changes: 63 additions & 20 deletions taichi/transforms/offload.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,19 +85,35 @@ class Offloader {
} else {
offloaded->block_dim = s->block_dim;
}
if (auto val = s->begin->cast<ConstStmt>()) {

// TODO: We need to update codegen for each backend gradually so let's
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When we merge this in let's also notify people working on the vulkan ndarray this change

// limit it to opengl backend for now.
if (arch == Arch::opengl && s->end_is_array_axis) {
// range of array must begin with 0.
auto begin = s->begin->cast<ConstStmt>();
TI_ASSERT(begin && begin->val[0].val_int32() == 0);
offloaded->const_begin = true;
offloaded->begin_value = val->val[0].val_int32();
} else {
offloaded_ranges.begin_stmts.insert(
std::make_pair(offloaded.get(), s->begin));
}
if (auto val = s->end->cast<ConstStmt>()) {
offloaded->const_end = true;
offloaded->end_value = val->val[0].val_int32();
} else {
offloaded->begin_value = 0;

offloaded->end_stmt =
clone_and_replace_ext_axis(s->end, offloaded.get(), s);
offloaded_ranges.end_stmts.insert(
std::make_pair(offloaded.get(), s->end));
std::make_pair(offloaded.get(), offloaded->end_stmt));
} else {
if (auto val = s->begin->cast<ConstStmt>()) {
offloaded->const_begin = true;
offloaded->begin_value = val->val[0].val_int32();
} else {
offloaded_ranges.begin_stmts.insert(
std::make_pair(offloaded.get(), s->begin));
}
if (auto val = s->end->cast<ConstStmt>()) {
offloaded->const_end = true;
offloaded->end_value = val->val[0].val_int32();
} else {
offloaded_ranges.end_stmts.insert(
std::make_pair(offloaded.get(), s->end));
}
}
offloaded->num_cpu_threads =
std::min(s->num_cpu_threads, config.cpu_max_num_threads);
Expand Down Expand Up @@ -140,6 +156,28 @@ class Offloader {
}

private:
static Stmt *clone_and_replace_ext_axis(Stmt *stmt,
OffloadedStmt *offloaded,
RangeForStmt *range_for) {
if (stmt->cast<ExternalTensorShapeAlongAxisStmt>()) {
auto new_stmt = stmt->clone();
auto new_stmt_ptr = new_stmt.get();
offloaded->body->insert(std::move(new_stmt));
replace_all_usages_with(range_for, stmt, new_stmt_ptr);
return new_stmt_ptr;
} else {
auto val = stmt->cast<BinaryOpStmt>();
TI_ASSERT(val && val->op_type == BinaryOpType::mul);
auto new_stmt = stmt->clone();
auto new_stmt_ptr = new_stmt.get();
auto new_val = new_stmt->cast<BinaryOpStmt>();
new_val->lhs = clone_and_replace_ext_axis(val->lhs, offloaded, range_for);
new_val->rhs = clone_and_replace_ext_axis(val->rhs, offloaded, range_for);
offloaded->body->insert(std::move(new_stmt));
replace_all_usages_with(range_for, stmt, new_stmt_ptr);
return new_stmt_ptr;
}
}
static void emit_struct_for(StructForStmt *for_stmt,
Block *root_block,
const CompileConfig &config,
Expand Down Expand Up @@ -480,15 +518,20 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
->second];
}
if (!stmt->const_end) {
TI_ASSERT(offloaded_ranges_->end_stmts.find(stmt) !=
offloaded_ranges_->end_stmts.end())
TI_ASSERT_INFO(local_to_global_offset_.find(
offloaded_ranges_->end_stmts.find(stmt)->second) !=
local_to_global_offset_.end(),
"End fails.")
stmt->end_offset =
local_to_global_offset_[offloaded_ranges_->end_stmts.find(stmt)
->second];
if (stmt->end_stmt) {
TI_ASSERT(stmt->const_begin);
stmt->end_offset = 0;
} else {
TI_ASSERT(offloaded_ranges_->end_stmts.find(stmt) !=
offloaded_ranges_->end_stmts.end())
TI_ASSERT_INFO(local_to_global_offset_.find(
offloaded_ranges_->end_stmts.find(stmt)->second) !=
local_to_global_offset_.end(),
"End fails.")
stmt->end_offset =
local_to_global_offset_[offloaded_ranges_->end_stmts.find(stmt)
->second];
}
}
}
}
Expand Down
8 changes: 5 additions & 3 deletions tests/python/test_aot.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),

@ti.test(arch=ti.opengl)
def test_opengl_exceed_max_ssbo():
# 7 ndarrays + gtmp + args > 8 (maximum allowed)
# 8 ndarrays + args > 8 (maximum allowed)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really want this test? This seems a bit arch specific & a lot of devices support more

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea this test was mainly a self reminder to double check how many ssbos we create in normal cases - we can remove it later :D

n = 4
density1 = ti.ndarray(dtype=ti.f32, shape=(n, n))
density2 = ti.ndarray(dtype=ti.f32, shape=(n, n))
Expand All @@ -211,12 +211,13 @@ def test_opengl_exceed_max_ssbo():
density5 = ti.ndarray(dtype=ti.f32, shape=(n, n))
density6 = ti.ndarray(dtype=ti.f32, shape=(n, n))
density7 = ti.ndarray(dtype=ti.f32, shape=(n, n))
density8 = ti.ndarray(dtype=ti.f32, shape=(n, n))

@ti.kernel
def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),
density3: ti.any_arr(), density4: ti.any_arr(),
density5: ti.any_arr(), density6: ti.any_arr(),
density7: ti.any_arr()):
density7: ti.any_arr(), density8: ti.any_arr()):
for i, j in density1:
density1[i, j] = d + 1
density2[i, j] = d + 2
Expand All @@ -225,10 +226,11 @@ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(),
density5[i, j] = d + 5
density6[i, j] = d + 6
density7[i, j] = d + 7
density8[i, j] = d + 8

with pytest.raises(RuntimeError):
init(0, density1, density2, density3, density4, density5, density6,
density7)
density7, density8)


@ti.test(arch=ti.opengl)
Expand Down