Commit

Address review comments
Giuseppe Rossini committed Oct 29, 2020
1 parent ad63847 commit bdad27a
Showing 2 changed files with 45 additions and 29 deletions.
66 changes: 41 additions & 25 deletions python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -289,17 +289,26 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):

##### space definition begin #####
n, h, w, c = s[out].op.axis
# Split the number of input/output channels
cfg.define_split("tile_c", c, num_outputs=2)
# Split the height of the convolution
_, hi = cfg.define_split("tile_h", h, num_outputs=2)
# Split the width of the convolution
_, wi = cfg.define_split("tile_w", w, num_outputs=2)
# Additional output (e.g., requantization, bias addition, etc.)
# 0: locate the output on the second-to-last axis of the main computation
# 1: locate the output closest to the main computation
cfg.define_knob("locate_output", [0, 1])
# Determine if we should unroll the computation of the inner tile
cfg.define_knob("unroll_tile", [True, False])

# fallback support
if cfg.is_fallback:
cfg["tile_c"] = SplitEntity([-1, 8])
cfg["tile_h"] = SplitEntity([-1, 2])
cfg["tile_w"] = SplitEntity([-1, 2])
cfg["locate_output"] = OtherOptionEntity(2)
cfg["locate_output"] = OtherOptionEntity(1)
cfg["unroll_tile"] = OtherOptionEntity(True)
##### space definition end #####

def schedule_conv(conv):
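For context, the space definition above uses AutoTVM's ConfigSpace API: define_split enumerates (outer, inner) factorizations of an axis, define_knob enumerates an explicit list of options, and when no tuning record is available (cfg.is_fallback) each entry is pinned to a hand-picked default. A minimal sketch of that pattern, assuming cfg is the config object passed into the schedule and h, w, c are the output axes:

from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity

def define_tuning_space(cfg, h, w, c):
    # Enumerate (outer, inner) factor pairs for the channel and spatial axes
    cfg.define_split("tile_c", c, num_outputs=2)
    cfg.define_split("tile_h", h, num_outputs=2)
    cfg.define_split("tile_w", w, num_outputs=2)
    # Enumerate discrete choices for output placement and inner-tile unrolling
    cfg.define_knob("locate_output", [0, 1])
    cfg.define_knob("unroll_tile", [True, False])
    if cfg.is_fallback:
        # No tuning log found: pin defaults (-1 lets AutoTVM derive the
        # outer factor from the axis extent)
        cfg["tile_c"] = SplitEntity([-1, 8])
        cfg["tile_h"] = SplitEntity([-1, 2])
        cfg["tile_w"] = SplitEntity([-1, 2])
        cfg["locate_output"] = OtherOptionEntity(1)
        cfg["unroll_tile"] = OtherOptionEntity(True)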
@@ -315,43 +324,49 @@ def schedule_conv(conv):
wo, wi = cfg["tile_w"].apply(s, conv, w)
co, ci = cfg["tile_c"].apply(s, conv, c)

split_val = cfg["tile_c"].size[-1]
use_tensorization = (
(in_type == "int16")
and (split_val == 8)
and (IC % split_val == 0)
and (channel_multiplier == 1)
and is_aarch64_arm()
)

data_pad_value = -1
if conv_data.name == "data_pad":
assert isinstance(conv_data.op, tvm.te.ComputeOp)
# Define a policy for padding computation
cfg.define_knob("data_pad_inline", [1, 2, 3])
# Define a strategy for padding computation
cfg.define_knob("data_pad_strategy", [1, 2, 3])
if cfg.is_fallback:
cfg["data_pad_inline"] = OtherOptionEntity(2)
if cfg["data_pad_inline"].val == 1:
# We cannot inline padding when tensorizing. So, if we can tensorize, let's compute_at the closest axis
cfg["data_pad_strategy"] = (
OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)
)
# Compute padding on the third to last axis of the computation
if cfg["data_pad_strategy"].val == 1:
s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
s[conv_data].compute_at(s[conv], ho)
if cfg["data_pad_inline"].val == 2:
# Compute padding on the second to last axis of the computation
if cfg["data_pad_strategy"].val == 2:
s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
s[conv_data].compute_at(s[conv], wo)
if cfg["data_pad_inline"].val == 3:
# Inline padding during computation
if cfg["data_pad_strategy"].val == 3:
s[conv_data].compute_inline()
data_pad_value = cfg["data_pad_inline"].val

split_val = cfg["tile_c"].size[-1]
use_tensorization = (
(in_type == "int16")
and (split_val == 8)
and (IC % split_val == 0)
and (channel_multiplier == 1)
and (data_pad_value != 3)
and is_aarch64_arm()
)
data_pad_value = cfg["data_pad_strategy"].val

if use_tensorization:
if use_tensorization and data_pad_value != 3:
smlal = smlal_int16_int32()
s[conv].tensorize(ci, smlal)
else:
s[conv].vectorize(ci)

s[conv].unroll(r_h)
s[conv].unroll(r_w)
s[conv].unroll(wi)
s[conv].unroll(hi)
if cfg["unroll_tile"].val:
s[conv].unroll(r_h)
s[conv].unroll(r_w)
s[conv].unroll(wi)
s[conv].unroll(hi)

s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
fused_n_ho = s[conv].fuse(n, ho)
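The three data_pad_strategy values above decide where the padded input is materialized: 1 computes the padded tile once per outer height iteration (compute_at at ho), 2 once per outer width iteration (compute_at at wo), and 3 folds the padding arithmetic into the convolution body (compute_inline), which is incompatible with tensorization. A toy, self-contained sketch of the same trade-off on a 1-D pad-and-sum pipeline; this is illustrative only, assumes the te.create_schedule API of this TVM version, and is not the actual depthwise kernel:

import tvm
from tvm import te

N = 16
A = te.placeholder((N,), name="A", dtype="int16")
# Pad one element on each side, analogous to data_pad for the convolution input
data_pad = te.compute(
    (N + 2,),
    lambda i: tvm.tir.if_then_else(
        tvm.tir.any(i < 1, i >= N + 1), tvm.tir.const(0, "int16"), A[i - 1]
    ),
    name="data_pad",
)
B = te.compute((N,), lambda i: data_pad[i] + data_pad[i + 1] + data_pad[i + 2], name="B")

s = te.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=4)
# Strategy 2: materialize the padded tile once per outer iteration
s[data_pad].compute_at(s[B], xo)
# Strategy 3 would instead be: s[data_pad].compute_inline()
print(tvm.lower(s, [A, B], simple_mode=True))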
@@ -363,8 +378,9 @@ def schedule_conv_out(out):
wo, wi = cfg["tile_w"].apply(s, out, w)
ho, hi = cfg["tile_h"].apply(s, out, h)
s[out].reorder(n, ho, wo, co, hi, wi, ci)
s[out].unroll(wi)
s[out].unroll(hi)
if cfg["unroll_tile"]:
s[out].unroll(wi)
s[out].unroll(hi)

if out.dtype in ["int8", "uint8"]:
# In case of quantized convolution further split the channel in batches of 4 elements
8 changes: 4 additions & 4 deletions python/tvm/topi/arm_cpu/tensor_intrin.py
@@ -885,8 +885,8 @@ def smlal_int16_int32():
them together through a pair of smlal/smlal2 instructions. The pseudo-code
for the algorithm is as follows:
vec_a = vld1q_s16(A)
vec_b = vld1q_s16(B)
vec_a = vload(A, "int16x8")
vec_b = vload(B, "int16x8")
vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction
vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction
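The pseudo-code describes a widening multiply-accumulate: both int16x8 operands are multiplied lane by lane and accumulated into int32 lanes, the low half via smlal and the high half via smlal2. A small numpy sketch of the same semantics (illustrative only):

import numpy as np

vec_a = np.arange(8, dtype=np.int16)
vec_b = np.full(8, 300, dtype=np.int16)
vec_c = np.zeros(8, dtype=np.int32)

# smlal: widen the low halves to int32, multiply and accumulate
vec_c[0:4] += vec_a[0:4].astype(np.int32) * vec_b[0:4].astype(np.int32)
# smlal2: same operation on the high halves
vec_c[4:8] += vec_a[4:8].astype(np.int32) * vec_b[4:8].astype(np.int32)
print(vec_c)  # [0 300 600 ... 2100], accumulated without int16 overflow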
@@ -932,7 +932,7 @@ def _instr(index):
vec_b = ins[1].vload([0, 0], "int16x8")
inst = "llvm.aarch64.neon.smull"

# Lower part of the vector
# Higher part of the vector
vec_c_h = outs[0].vload([4], "int32x4")
vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a)
vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b)
@@ -951,7 +951,7 @@ def _instr(index):
vec_out_l = vec_c_l + vmull_l

# Combine higher and lower part in a single int32x8 vector to store
# (this will require two different STR instructions, since the
# (this will require two different store instructions, since the
# length of a NEON vector is fixed at 128 bits)
vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h)
ib.emit(outs[0].vstore(0, vec_out))
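For reference, this intrinsic is consumed by the depthwise schedule in the first file: when the int16/aarch64 conditions hold, the innermost channel loop is tensorized with it, otherwise that loop is simply vectorized.

# As in schedule_conv above
smlal = smlal_int16_int32()
s[conv].tensorize(ci, smlal)  # ci is the inner split of the channel axis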
