Commit

Address review comments
Giuseppe Rossini committed Oct 29, 2020
1 parent ad63847 commit bdad27a
Showing 2 changed files with 45 additions and 29 deletions.
66 changes: 41 additions & 25 deletions python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -289,17 +289,26 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):

##### space definition begin #####
n, h, w, c = s[out].op.axis
# Split the number of input/output channels
cfg.define_split("tile_c", c, num_outputs=2)
# Split the height of the convolution
_, hi = cfg.define_split("tile_h", h, num_outputs=2)
# Split the width of the convolution
_, wi = cfg.define_split("tile_w", w, num_outputs=2)
# Additional output (e.g., requantization, bias addition, etc.)
# 0: locate the output on the second-to-last axis of the main computation
# 1: locate the output closest to the main computation
cfg.define_knob("locate_output", [0, 1])
# Determine if we should unroll the computation of the inner tile
cfg.define_knob("unroll_tile", [True, False])

# fallback support
if cfg.is_fallback:
cfg["tile_c"] = SplitEntity([-1, 8])
cfg["tile_h"] = SplitEntity([-1, 2])
cfg["tile_w"] = SplitEntity([-1, 2])
cfg["locate_output"] = OtherOptionEntity(2)
cfg["locate_output"] = OtherOptionEntity(1)
cfg["unroll_tile"] = OtherOptionEntity(True)
##### space definition end #####

def schedule_conv(conv):
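For context, the space definition above uses AutoTVM's ConfigSpace API: define_split enumerates (outer, inner) factorizations of an axis, define_knob enumerates an explicit list of options, and when no tuning record is available (cfg.is_fallback) each entry is pinned to a hand-picked default. A minimal sketch of that pattern, assuming cfg is the config object passed into the schedule and h, w, c are the output axes:

from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity

def define_tuning_space(cfg, h, w, c):
    # Enumerate (outer, inner) factor pairs for the channel and spatial axes
    cfg.define_split("tile_c", c, num_outputs=2)
    cfg.define_split("tile_h", h, num_outputs=2)
    cfg.define_split("tile_w", w, num_outputs=2)
    # Enumerate discrete choices for output placement and inner-tile unrolling
    cfg.define_knob("locate_output", [0, 1])
    cfg.define_knob("unroll_tile", [True, False])
    if cfg.is_fallback:
        # No tuning log found: pin defaults (-1 lets AutoTVM derive the
        # outer factor from the axis extent)
        cfg["tile_c"] = SplitEntity([-1, 8])
        cfg["tile_h"] = SplitEntity([-1, 2])
        cfg["tile_w"] = SplitEntity([-1, 2])
        cfg["locate_output"] = OtherOptionEntity(1)
        cfg["unroll_tile"] = OtherOptionEntity(True)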
@@ -315,43 +324,49 @@ def schedule_conv(conv):
wo, wi = cfg["tile_w"].apply(s, conv, w)
co, ci = cfg["tile_c"].apply(s, conv, c)

split_val = cfg["tile_c"].size[-1]
use_tensorization = (
(in_type == "int16")
and (split_val == 8)
and (IC % split_val == 0)
and (channel_multiplier == 1)
and is_aarch64_arm()
)

data_pad_value = -1
if conv_data.name == "data_pad":
assert isinstance(conv_data.op, tvm.te.ComputeOp)
# Define a policy for padding computation
cfg.define_knob("data_pad_inline", [1, 2, 3])
# Define a strategy for padding computation
cfg.define_knob("data_pad_strategy", [1, 2, 3])
if cfg.is_fallback:
cfg["data_pad_inline"] = OtherOptionEntity(2)
if cfg["data_pad_inline"].val == 1:
# We cannot inline padding when tensorizing. So, if we can tensorize, let's compute_at the closest axis
cfg["data_pad_strategy"] = (
OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)
)
# Compute padding on the third to last axis of the computation
if cfg["data_pad_strategy"].val == 1:
s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
s[conv_data].compute_at(s[conv], ho)
if cfg["data_pad_inline"].val == 2:
# Compute padding on the second to last axis of the computation
if cfg["data_pad_strategy"].val == 2:
s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
s[conv_data].compute_at(s[conv], wo)
if cfg["data_pad_inline"].val == 3:
# Inline padding during computation
if cfg["data_pad_strategy"].val == 3:
s[conv_data].compute_inline()
data_pad_value = cfg["data_pad_inline"].val

split_val = cfg["tile_c"].size[-1]
use_tensorization = (
(in_type == "int16")
and (split_val == 8)
and (IC % split_val == 0)
and (channel_multiplier == 1)
and (data_pad_value != 3)
and is_aarch64_arm()
)
data_pad_value = cfg["data_pad_strategy"].val

if use_tensorization:
if use_tensorization and data_pad_value != 3:
smlal = smlal_int16_int32()
s[conv].tensorize(ci, smlal)
else:
s[conv].vectorize(ci)

s[conv].unroll(r_h)
s[conv].unroll(r_w)
s[conv].unroll(wi)
s[conv].unroll(hi)
if cfg["unroll_tile"].val:
s[conv].unroll(r_h)
s[conv].unroll(r_w)
s[conv].unroll(wi)
s[conv].unroll(hi)

s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
fused_n_ho = s[conv].fuse(n, ho)
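The three data_pad_strategy values above decide where the padded input is materialized: 1 computes the padded tile once per outer height iteration (compute_at at ho), 2 once per outer width iteration (compute_at at wo), and 3 folds the padding arithmetic into the convolution body (compute_inline), which is incompatible with tensorization. A toy, self-contained sketch of the same trade-off on a 1-D pad-and-sum pipeline; this is illustrative only, assumes the te.create_schedule API of this TVM version, and is not the actual depthwise kernel:

import tvm
from tvm import te

N = 16
A = te.placeholder((N,), name="A", dtype="int16")
# Pad one element on each side, analogous to data_pad for the convolution input
data_pad = te.compute(
    (N + 2,),
    lambda i: tvm.tir.if_then_else(
        tvm.tir.any(i < 1, i >= N + 1), tvm.tir.const(0, "int16"), A[i - 1]
    ),
    name="data_pad",
)
B = te.compute((N,), lambda i: data_pad[i] + data_pad[i + 1] + data_pad[i + 2], name="B")

s = te.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=4)
# Strategy 2: materialize the padded tile once per outer iteration
s[data_pad].compute_at(s[B], xo)
# Strategy 3 would instead be: s[data_pad].compute_inline()
print(tvm.lower(s, [A, B], simple_mode=True))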
@@ -363,8 +378,9 @@ def schedule_conv_out(out):
wo, wi = cfg["tile_w"].apply(s, out, w)
ho, hi = cfg["tile_h"].apply(s, out, h)
s[out].reorder(n, ho, wo, co, hi, wi, ci)
s[out].unroll(wi)
s[out].unroll(hi)
if cfg["unroll_tile"]:
s[out].unroll(wi)
s[out].unroll(hi)

if out.dtype in ["int8", "uint8"]:
# In case of quantized convolution further split the channel in batches of 4 elements
8 changes: 4 additions & 4 deletions python/tvm/topi/arm_cpu/tensor_intrin.py
@@ -885,8 +885,8 @@ def smlal_int16_int32():
them together through a pair of smlal/smlal2 instructions. The pseudo-code
for the algorithm is as follows:
vec_a = vld1q_s16(A)
vec_b = vld1q_s16(B)
vec_a = vload(A, "int16x8")
vec_b = vload(B, "int16x8")
vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction
vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction
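The pseudo-code describes a widening multiply-accumulate: both int16x8 operands are multiplied lane by lane and accumulated into int32 lanes, the low half via smlal and the high half via smlal2. A small numpy sketch of the same semantics (illustrative only):

import numpy as np

vec_a = np.arange(8, dtype=np.int16)
vec_b = np.full(8, 300, dtype=np.int16)
vec_c = np.zeros(8, dtype=np.int32)

# smlal: widen the low halves to int32, multiply and accumulate
vec_c[0:4] += vec_a[0:4].astype(np.int32) * vec_b[0:4].astype(np.int32)
# smlal2: same operation on the high halves
vec_c[4:8] += vec_a[4:8].astype(np.int32) * vec_b[4:8].astype(np.int32)
print(vec_c)  # [0 300 600 ... 2100], accumulated without int16 overflow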
@@ -932,7 +932,7 @@ def _instr(index):
vec_b = ins[1].vload([0, 0], "int16x8")
inst = "llvm.aarch64.neon.smull"

# Lower part of the vector
# Higher part of the vector
vec_c_h = outs[0].vload([4], "int32x4")
vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a)
vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b)
@@ -951,7 +951,7 @@ def _instr(index):
vec_out_l = vec_c_l + vmull_l

# Combine higher and lower part in a single int32x8 vector to store
# (this will require two different STR instructions, since the
# (this will require two different store instructions, since the
# length of a NEON vector is fixed at 128 bits)
vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h)
ib.emit(outs[0].vstore(0, vec_out))
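For reference, this intrinsic is consumed by the depthwise schedule in the first file: when the int16/aarch64 conditions hold, the innermost channel loop is tensorized with it, otherwise that loop is simply vectorized.

# As in schedule_conv above
smlal = smlal_int16_int32()
s[conv].tensorize(ci, smlal)  # ci is the inner split of the channel axis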
