Skip to content

Commit

Permalink
Add checks for unknown workgroup count when inferring boundaries. (#9258
Browse files Browse the repository at this point in the history
)

The workgroup count is set to zero when we're not able to infer the bounds.

Fixes #9244
  • Loading branch information
hanhanW authored Jun 2, 2022
1 parent 2187016 commit 697396e
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,23 @@ static Optional<std::pair<AffineExpr, AffineExpr>> getWorkgroupRange(
if (auto idOp =
processorValue.getDefiningOp<IREE::HAL::InterfaceWorkgroupIDOp>()) {
OpBuilder builder(processorValue.getContext());

// Can't infer the range when workgroupCount is unknown.
unsigned index = idOp.dimension().getZExtValue();
if (!workgroupCount[index]) return llvm::None;

AffineExpr zero = builder.getAffineConstantExpr(0);
AffineExpr ubExpr = builder.getAffineConstantExpr(workgroupCount[index]);
return std::make_pair(zero, ubExpr - 1);
}
if (auto dimOp = processorValue
.getDefiningOp<IREE::HAL::InterfaceWorkgroupCountOp>()) {
OpBuilder builder(processorValue.getContext());

// Can't infer the range when workgroupCount is unknown.
unsigned index = dimOp.dimension().getZExtValue();
if (!workgroupCount[index]) return llvm::None;

AffineExpr bound = builder.getAffineConstantExpr(workgroupCount[index]);
return std::make_pair(bound, bound);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,69 @@ hal.executable private @both_workgroup_and_workitem {
}
}
}

// -----


// Test case: the entry point below returns an x workgroup count computed from
// a dynamic operand (%arg1 ceildiv 4) rather than a constant, while the loops
// in @simple_mul use the workgroup id/count along x for their bounds and step.
// The CHECK lines following this module expect both scf.for loops to still be
// present in the output.
#config = #iree_codegen.lowering_config<tile_sizes = [[4], [4], [0]]>
#device_target_cpu = #hal.device.target<"cpu", {executable_targets = [#hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]}>
#executable_layout = #hal.executable.layout<push_constants = 0, sets = [#hal.descriptor_set.layout<0, bindings = [#hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
#translation = #iree_codegen.translation_info<CPUDoubleTilingExpert workload_per_wg = [4]>
#map0 = affine_map<()[s0] -> (s0 ceildiv 4)>
#map1 = affine_map<()[s0] -> (s0 * 4)>
#map2 = affine_map<()[s0, s1] -> (-((s0 * -4 + 4) mod (s1 * 4)) + 4)>
#map3 = affine_map<(d0)[s0] -> (d0 + s0)>
module attributes {hal.device.targets = [#device_target_cpu]} {
hal.executable private @simple_mul {
hal.executable.variant public @embedded_elf_x86_64, target = #executable_target_embedded_elf_x86_64_ {
hal.executable.entry_point public @simple_mul ordinal(0) layout(#executable_layout) {translation_info = #translation} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%c1 = arith.constant 1 : index
// x workgroup count = %arg1 ceildiv 4 (via #map0); y and z counts are 1.
%0 = affine.apply #map0()[%arg1]
hal.return %0, %c1, %c1 : index, index, index
}
builtin.module {
func.func @simple_mul() {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %0, 64 : memref<4xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %1, 64 : memref<4xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c0) alignment(64) : memref<4xf32>
memref.assume_alignment %2, 64 : memref<4xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
// %3 = id_x * 4 (lower bound), %4 = count_x * 4 (step), %5 = split point
// computed by #map2 from both id_x and count_x.
%3 = affine.apply #map1()[%workgroup_id_x]
%4 = affine.apply #map1()[%workgroup_count_x]
%5 = affine.apply #map2()[%workgroup_id_x, %workgroup_count_x]
// First loop: iterates from %3 up to the split point %5.
scf.for %arg0 = %3 to %5 step %4 {
%6 = memref.subview %2[%arg0] [4] [1] : memref<4xf32> to memref<4xf32, #map3>
%7 = memref.subview %0[%arg0] [4] [1] : memref<4xf32> to memref<4xf32, #map3>
%8 = memref.subview %1[%arg0] [4] [1] : memref<4xf32> to memref<4xf32, #map3>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : memref<4xf32, #map3>, vector<4xf32>
%10 = vector.transfer_read %8[%c0], %cst {in_bounds = [true]} : memref<4xf32, #map3>, vector<4xf32>
%11 = arith.mulf %9, %10 : vector<4xf32>
vector.transfer_write %11, %6[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32, #map3>
}
// Second loop: same body, iterating from the split point %5 up to %c4.
scf.for %arg0 = %5 to %c4 step %4 {
%6 = memref.subview %2[%arg0] [4] [1] : memref<4xf32> to memref<4xf32, #map3>
%7 = memref.subview %0[%arg0] [4] [1] : memref<4xf32> to memref<4xf32, #map3>
%8 = memref.subview %1[%arg0] [4] [1] : memref<4xf32> to memref<4xf32, #map3>
%9 = vector.transfer_read %7[%c0], %cst {in_bounds = [true]} : memref<4xf32, #map3>, vector<4xf32>
%10 = vector.transfer_read %8[%c0], %cst {in_bounds = [true]} : memref<4xf32, #map3>, vector<4xf32>
%11 = arith.mulf %9, %10 : vector<4xf32>
vector.transfer_write %11, %6[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32, #map3>
}
return
}
}
}
}
}

// CHECK-LABEL: func.func @simple_mul
// CHECK: scf.for
// CHECK: scf.for

0 comments on commit 697396e

Please sign in to comment.