-
Notifications
You must be signed in to change notification settings - Fork 2.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[opengl] Optimize range_for for ndarrays #3884
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,19 +85,35 @@ class Offloader { | |
} else { | ||
offloaded->block_dim = s->block_dim; | ||
} | ||
if (auto val = s->begin->cast<ConstStmt>()) { | ||
|
||
// TODO: We need to update codegen for each backend gradually so let's | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When we merge this in, let's also notify the people working on the Vulkan ndarray about this change. |
||
// limit it to opengl backend for now. | ||
if (arch == Arch::opengl && s->range_of_array) { | ||
// range of array must begin with 0. | ||
auto begin = s->begin->cast<ConstStmt>(); | ||
TI_ASSERT(begin && begin->val[0].val_int32() == 0); | ||
offloaded->const_begin = true; | ||
offloaded->begin_value = val->val[0].val_int32(); | ||
} else { | ||
offloaded_ranges.begin_stmts.insert( | ||
std::make_pair(offloaded.get(), s->begin)); | ||
} | ||
if (auto val = s->end->cast<ConstStmt>()) { | ||
offloaded->const_end = true; | ||
offloaded->end_value = val->val[0].val_int32(); | ||
} else { | ||
offloaded->begin_value = 0; | ||
|
||
offloaded->end_stmt = | ||
clone_and_replace_ext_axis(s->end, offloaded.get(), s); | ||
offloaded_ranges.end_stmts.insert( | ||
std::make_pair(offloaded.get(), s->end)); | ||
std::make_pair(offloaded.get(), offloaded->end_stmt)); | ||
} else { | ||
if (auto val = s->begin->cast<ConstStmt>()) { | ||
offloaded->const_begin = true; | ||
offloaded->begin_value = val->val[0].val_int32(); | ||
} else { | ||
offloaded_ranges.begin_stmts.insert( | ||
std::make_pair(offloaded.get(), s->begin)); | ||
} | ||
if (auto val = s->end->cast<ConstStmt>()) { | ||
offloaded->const_end = true; | ||
offloaded->end_value = val->val[0].val_int32(); | ||
} else { | ||
offloaded_ranges.end_stmts.insert( | ||
std::make_pair(offloaded.get(), s->end)); | ||
} | ||
} | ||
offloaded->num_cpu_threads = | ||
std::min(s->num_cpu_threads, config.cpu_max_num_threads); | ||
|
@@ -140,6 +156,28 @@ class Offloader { | |
} | ||
|
||
private: | ||
// Recursively clones the expression `stmt` into `offloaded->body` and returns
// a non-owning pointer to the clone.
//
// Only two statement shapes are accepted:
//   * an ExternalTensorShapeAlongAxisStmt (leaf case), or
//   * a BinaryOpStmt whose op_type is BinaryOpType::mul, whose lhs and rhs are
//     themselves cloned by recursive calls.
// Any other statement trips the TI_ASSERT in the else-branch.
//
// For every statement cloned, replace_all_usages_with(range_for, stmt, clone)
// is called so that references to the original are redirected to the clone
// (presumably limited to usages under `range_for` -- confirm against the
// helper's definition, which is not visible here).
static Stmt *clone_and_replace_ext_axis(Stmt *stmt, | ||
OffloadedStmt *offloaded, | ||
RangeForStmt *range_for) { | ||
if (stmt->cast<ExternalTensorShapeAlongAxisStmt>()) { | ||
// Leaf: clone the shape query itself.
auto new_stmt = stmt->clone(); | ||
// Keep a raw pointer; `new_stmt` is moved into the body on the next line.
auto new_stmt_ptr = new_stmt.get(); | ||
offloaded->body->insert(std::move(new_stmt)); | ||
replace_all_usages_with(range_for, stmt, new_stmt_ptr); | ||
return new_stmt_ptr; | ||
} else { | ||
// Non-leaf: must be a multiplication; anything else is rejected.
auto val = stmt->cast<BinaryOpStmt>(); | ||
TI_ASSERT(val && val->op_type == BinaryOpType::mul); | ||
auto new_stmt = stmt->clone(); | ||
// Keep a raw pointer; `new_stmt` is moved into the body further below.
auto new_stmt_ptr = new_stmt.get(); | ||
auto new_val = new_stmt->cast<BinaryOpStmt>(); | ||
// The operands are cloned (and inserted into the body) by the recursive
// calls before the product clone itself is inserted, and the clone is
// rewired to point at the cloned operands.
new_val->lhs = clone_and_replace_ext_axis(val->lhs, offloaded, range_for); | ||
new_val->rhs = clone_and_replace_ext_axis(val->rhs, offloaded, range_for); | ||
offloaded->body->insert(std::move(new_stmt)); | ||
replace_all_usages_with(range_for, stmt, new_stmt_ptr); | ||
return new_stmt_ptr; | ||
} | ||
} | ||
static void emit_struct_for(StructForStmt *for_stmt, | ||
Block *root_block, | ||
const CompileConfig &config, | ||
|
@@ -480,15 +518,20 @@ class FixCrossOffloadReferences : public BasicStmtVisitor { | |
->second]; | ||
} | ||
if (!stmt->const_end) { | ||
TI_ASSERT(offloaded_ranges_->end_stmts.find(stmt) != | ||
offloaded_ranges_->end_stmts.end()) | ||
TI_ASSERT_INFO(local_to_global_offset_.find( | ||
offloaded_ranges_->end_stmts.find(stmt)->second) != | ||
local_to_global_offset_.end(), | ||
"End fails.") | ||
stmt->end_offset = | ||
local_to_global_offset_[offloaded_ranges_->end_stmts.find(stmt) | ||
->second]; | ||
if (stmt->end_stmt) { | ||
TI_ASSERT(stmt->const_begin); | ||
stmt->end_offset = 0; | ||
} else { | ||
TI_ASSERT(offloaded_ranges_->end_stmts.find(stmt) != | ||
offloaded_ranges_->end_stmts.end()) | ||
TI_ASSERT_INFO(local_to_global_offset_.find( | ||
offloaded_ranges_->end_stmts.find(stmt)->second) != | ||
local_to_global_offset_.end(), | ||
"End fails.") | ||
stmt->end_offset = | ||
local_to_global_offset_[offloaded_ranges_->end_stmts.find(stmt) | ||
->second]; | ||
} | ||
} | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -202,7 +202,7 @@ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(), | |
|
||
@ti.test(arch=ti.opengl) | ||
def test_opengl_exceed_max_ssbo(): | ||
# 7 ndarrays + gtmp + args > 8 (maximum allowed) | ||
# 8 ndarrays + args > 8 (maximum allowed) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we really want this test? This seems a bit arch-specific, and a lot of devices support more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, this test was mainly a self-reminder to double-check how many SSBOs we create in normal cases - we can remove it later :D |
||
n = 4 | ||
density1 = ti.ndarray(dtype=ti.f32, shape=(n, n)) | ||
density2 = ti.ndarray(dtype=ti.f32, shape=(n, n)) | ||
|
@@ -211,12 +211,13 @@ def test_opengl_exceed_max_ssbo(): | |
density5 = ti.ndarray(dtype=ti.f32, shape=(n, n)) | ||
density6 = ti.ndarray(dtype=ti.f32, shape=(n, n)) | ||
density7 = ti.ndarray(dtype=ti.f32, shape=(n, n)) | ||
density8 = ti.ndarray(dtype=ti.f32, shape=(n, n)) | ||
|
||
@ti.kernel | ||
def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(), | ||
density3: ti.any_arr(), density4: ti.any_arr(), | ||
density5: ti.any_arr(), density6: ti.any_arr(), | ||
density7: ti.any_arr()): | ||
density7: ti.any_arr(), density8: ti.any_arr()): | ||
for i, j in density1: | ||
density1[i, j] = d + 1 | ||
density2[i, j] = d + 2 | ||
|
@@ -225,10 +226,11 @@ def init(d: ti.i32, density1: ti.any_arr(), density2: ti.any_arr(), | |
density5[i, j] = d + 5 | ||
density6[i, j] = d + 6 | ||
density7[i, j] = d + 7 | ||
density8[i, j] = d + 8 | ||
|
||
with pytest.raises(RuntimeError): | ||
init(0, density1, density2, density3, density4, density5, density6, | ||
density7) | ||
density7, density8) | ||
|
||
|
||
@ti.test(arch=ti.opengl) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This parameter name seems slightly confusing?