From dc557ae6cb30bf1120132ebc8f66c8a1271a388a Mon Sep 17 00:00:00 2001
From: Anastasia Stulova <anastasia.stulova@arm.com>
Date: Mon, 28 Jun 2021 16:28:05 +0100
Subject: [PATCH] [Relay] Fix index order in conv2d computation for Arm CPU.

When dilation is larger than 1 in conv2d with NHWC layout,
the order of the indices used to access the data array in
the convolution computation appears to be incorrect.

'data_vec' is defined as

lambda n, oho, owo, kh, kw, ic, ohi, owi:

but is accessed as

data_vec[n, oho, owo, kh, kw, ohi, owi, ic]

This patch fixes the order of indexes and modifies the test
so that it is suitable for running on an AArch64 CPU.
---
 .../tvm/topi/arm_cpu/conv2d_spatial_pack.py   |  2 +-
 .../topi/python/test_topi_conv2d_nhwc.py      | 19 +++++++------------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
index f4cd9d899b732..91bff512a0ab6 100644
--- a/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
+++ b/python/tvm/topi/arm_cpu/conv2d_spatial_pack.py
@@ -344,7 +344,7 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_
         conv = te.compute(
             ovshape,
             lambda n, oho, owo, oco, ohi, owi, oci: te.sum(
-                data_vec[n, oho, owo, kh, kw, ohi, owi, ic].astype(out_dtype)
+                data_vec[n, oho, owo, kh, kw, ic, ohi, owi].astype(out_dtype)
                 * kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype),
                 axis=[ic, kh, kw],
             ),
diff --git a/tests/python/topi/python/test_topi_conv2d_nhwc.py b/tests/python/topi/python/test_topi_conv2d_nhwc.py
index cdb7c0e8d4aa4..f2e4456032f7f 100644
--- a/tests/python/topi/python/test_topi_conv2d_nhwc.py
+++ b/tests/python/topi/python/test_topi_conv2d_nhwc.py
@@ -58,26 +58,21 @@ def get_ref_data():
 
     a_np, w_np, b_np = get_ref_data()
 
-    def check_device(device):
-        if not tvm.testing.device_enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        with tvm.target.Target(device):
-            fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_nhwc_implement)
+    def check_device(target, dev):
+        print("Running on target: %s" % target)
+        with tvm.target.Target(target):
+            fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv2d_nhwc_implement)
             B = fcompute(A, W, stride, padding, dilation, dtype)
             s = fschedule([B])
-        dev = tvm.device(device, 0)
         a = tvm.nd.array(a_np, dev)
         w = tvm.nd.array(w_np, dev)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
-        func = tvm.build(s, [A, W, B], device)
+        func = tvm.build(s, [A, W, B], target)
         func(a, w, b)
         tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
 
-    for device in ["llvm", "cuda"]:
-        check_device(device)
-
+    for target, dev in tvm.testing.enabled_targets():
+        check_device(target, dev)
 
 @tvm.testing.uses_gpu
 def test_conv2d_nhwc():