From efd53261c2a94886d9aa67fb0420bbd31cc13404 Mon Sep 17 00:00:00 2001 From: Wheest Date: Tue, 29 Dec 2020 23:09:47 +0000 Subject: [PATCH] Asymmetric padding and dilation in conv2d workload (#7142) * added asymmetric padding to conv2d workload * fixed depthwise conv2d padding * Added fix to include dilation in workload output width calculation * Added missing dilation to arm_cpu/conv2d_int8.py workload * Fixed dilation for x86 conv2d * Improved dilation workload integration in x86 * Fixed x86 conv2d_alter_op to add dilation * Local linting not always producing same output as CI, probably my fault * Fixed bug, tested locally * Abusing CI until I can figure out how to reproduce the same behaviour of running integration tests locally. * Ammeded conv2d_int8 test * Updated workload, improved unit tests * Added depthwise conv2d workload test --- python/tvm/topi/arm_cpu/conv2d_int8.py | 7 +-- python/tvm/topi/cuda/conv2d_int8.py | 7 +-- python/tvm/topi/generic/conv2d.py | 15 ++++--- python/tvm/topi/nn/conv2d.py | 43 +++++++++++++++---- python/tvm/topi/nn/depthwise_conv2d.py | 33 +++++++++----- .../topi/testing/depthwise_conv2d_python.py | 2 +- python/tvm/topi/x86/conv2d.py | 16 ++++--- python/tvm/topi/x86/conv2d_alter_op.py | 30 +++++++++++-- python/tvm/topi/x86/conv2d_avx_1x1.py | 11 +++-- python/tvm/topi/x86/conv2d_avx_common.py | 14 +++--- python/tvm/topi/x86/conv2d_int8.py | 14 +++--- python/tvm/topi/x86/depthwise_conv2d.py | 9 ++-- .../topi/python/test_topi_conv2d_int8.py | 23 +++++++++- .../topi/python/test_topi_conv2d_nchw.py | 17 ++++++++ .../topi/python/test_topi_depthwise_conv2d.py | 23 +++++++++- 15 files changed, 201 insertions(+), 63 deletions(-) diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 445b9ec0c113..fc7e4036341a 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -32,12 +32,12 @@ from .arm_utils import get_tiling_B_interleaved_t -def 
_get_default_config(cfg, data, kernel, strides, padding, out_dtype): +def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Get default int8 schedule config for the workload """ - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=2, num_int8_elements=4) else: @@ -65,6 +65,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out te.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype), strides, padding, + dilation, out_dtype, ) return nn.conv2d_NCHWc_int8_compute( diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index 50a0e8b71661..001411d6e4c9 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -142,9 +142,10 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") # compute the output shape - out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 - out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 - + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1 oshape = (batch, oc_chunk, out_height, out_width, oc_block) icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 7dd9aed7545d..4daa84c29528 100644 --- a/python/tvm/topi/generic/conv2d.py 
+++ b/python/tvm/topi/generic/conv2d.py @@ -38,9 +38,10 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, @@ -85,10 +86,10 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 886470bb3b9d..80f87f86736c 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -38,12 +38,16 @@ "in_filter", "groups", "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", + "kernel_h", + "kernel_w", + "padt", + "padl", + "padb", + "padr", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) @@ -154,7 +158,7 @@ def conv2d_infer_layout(workload, cfg): raise ValueError("missing register for 
topi.nn.conv2d_infer_layout") -def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layout="NCHW"): """ Get the workload structure. """ if data_layout == "NCHW": _, CI, IH, IW = get_const_tuple(data.shape) @@ -170,7 +174,10 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) - HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -182,7 +189,25 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): '{} vs. {}".format( data.dtype, kernel.dtype ) - return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + return Workload( + data.dtype, + out_dtype, + IH, + IW, + CI, + GRPS, + CO, + KH, + KW, + pt, + pl, + pb, + pr, + dilation_h, + dilation_w, + HSTR, + WSTR, + ) def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 72356821770d..052ab8b88d1c 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -36,22 +36,28 @@ "width", "in_filter", "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", + "kernel_h", + "kernel_w", + "padt", + "padl", + "padb", + "padr", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) -def _get_workload(data, kernel, stride, padding, out_dtype): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype): """ Get the workload structure. 
""" _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -62,6 +68,9 @@ def _get_workload(data, kernel, stride, padding, out_dtype): '{} vs. {}".format( data.dtype, kernel.dtype ) + dilated_kernel_h = (kh - 1) * dilation_h + 1 + dilated_kernel_w = (kw - 1) * dilation_w + 1 + pt, pl, pb, pr = get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) return Workload( data.dtype, out_dtype, @@ -71,8 +80,12 @@ def _get_workload(data, kernel, stride, padding, out_dtype): out_channel, kh, kw, - HPAD, - WPAD, + pt, + pl, + pb, + pr, + dilation_h, + dilation_w, HSTR, WSTR, ) diff --git a/python/tvm/topi/testing/depthwise_conv2d_python.py b/python/tvm/topi/testing/depthwise_conv2d_python.py index 06f26ab3a2e4..2239c56134f5 100644 --- a/python/tvm/topi/testing/depthwise_conv2d_python.py +++ b/python/tvm/topi/testing/depthwise_conv2d_python.py @@ -65,7 +65,7 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): 0 : (in_height - filter_height + 1) : stride_h, 0 : (in_width - filter_width + 1) : stride_w, ] - if padding == "SAME": + elif padding == "SAME": out_channel = in_channel * channel_multiplier out_height = np.int(np.ceil(float(in_height) / float(stride_h))) out_width = np.int(np.ceil(float(in_width) / float(stride_w))) diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index a3b7e473415e..182454acf3a6 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -35,7 +35,7 @@ def _get_default_config( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, 
is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the workload @@ -48,13 +48,13 @@ def _get_default_config( static_data_shape.append(dim) data = te.placeholder(static_data_shape, dtype=data.dtype) if is_depthwise: - wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype) + wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) from .depthwise_conv2d import _fallback_schedule _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_avx_1x1._fallback_schedule(cfg, wkl) else: @@ -69,8 +69,11 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 - out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (k_height - 1) * hdilation + 1 + dilated_kernel_w = (k_width - 1) * wdilation + 1 + out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 + out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) in_layout = "NCHW%dc" % tile_ic @@ -208,6 +211,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index 979dc5ab5702..f05bac82ff0c 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ 
b/python/tvm/topi/x86/conv2d_alter_op.py @@ -97,7 +97,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) @@ -142,7 +150,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: _get_default_config_int8( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) @@ -198,7 +214,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + True, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 3e5a12bc43b2..afee03a9f6a0 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -31,10 +31,13 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, 
wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1 + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 + + out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 8d707445be05..5e63de329bba 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -27,9 +27,11 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 + + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): @@ -56,9 +58,9 @@ def _fallback_schedule(cfg, wkl): def _fallback_schedule_int8(cfg, wkl): - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 905ada68f277..ca0d0b8b223c 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -33,7 +33,7 @@ def _get_default_config_int8( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the 
workload @@ -45,8 +45,8 @@ def _get_default_config_int8( _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8( cfg, wkl, int32_lanes=16, num_int8_elements=4 @@ -138,8 +138,11 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out is_kernel_1x1 = kernel_height == 1 and kernel_width == 1 pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (ih - kernel_height + pt + pb) // sh + 1 - ow = (iw - kernel_width + pl + pr) // sw + 1 + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (kernel_height - 1) * dh + 1 + dilated_kernel_w = (kernel_width - 1) * dw + 1 + oh = (ih - dilated_kernel_h + pt + pb) // sh + 1 + ow = (iw - dilated_kernel_w + pl + pr) // sw + 1 cfg.define_split("tile_ic", in_channel, num_outputs=2, filter=lambda y: y.size[-1] % 4 == 0) cfg.define_split("tile_oc", num_filter, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0) @@ -159,6 +162,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index badba1a248e9..a0225ef9e147 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -42,9 +42,11 @@ def _fallback_schedule(cfg, wkl): """ simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, 
wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 + + out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): @@ -165,6 +167,7 @@ def depthwise_conv2d_NCHWc( ), strides, (pad_top, pad_down), + dilation, out_dtype, ) if cfg.is_fallback: diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 1bf83eba53ac..a934e3ef2fd2 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -27,6 +27,8 @@ from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8 from common import Int8Fallback import tvm.testing @@ -112,7 +114,7 @@ def compile_conv2d_NHWC_gemm_int8_arm( s, [A, W, bias, C], device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) else: @@ -385,6 +387,22 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # for testing functionality, + # we choose arbitrary int32_lanes and num_int8_elements that can divide the channel, + # regardless of the performance. + int32_lanes, num_int8_elements = num_filter, in_channel + + # check if tile_ow candidates are the factors of the right output width. 
+ cfg = autotvm.get_config() + fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -436,6 +454,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + verify_workload_padding() + for device in ["cuda"]: check_device(device) @@ -547,6 +567,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw_int8(1, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(7, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) + verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) def test_conv2d_nhwc(): diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py index 1b7575211dac..07ad45c971df 100644 --- a/tests/python/topi/python/test_topi_conv2d_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_nchw.py @@ -25,6 +25,8 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.x86.conv2d_avx_common import _fallback_schedule import tvm.testing @@ -76,6 +78,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # check if tile_ow candidates are the factors of the right output width. 
+ cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -101,6 +114,9 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) + if "llvm" in device: + verify_workload_padding() + a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) @@ -242,6 +258,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 64, 8, 64, 5, 2, (1, 3), add_bias=True) verify_conv2d_nchw(1, 64, 8, 64, 3, 1, "VALID", add_bias=True, add_relu=True) verify_conv2d_nchw(1, 64, 8, 64, 24, 1, "SAME", add_bias=True, add_relu=True) + verify_conv2d_nchw(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index 55d2fe0c4e52..804c486d27d7 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -23,6 +23,8 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.nn.utils import get_pad_tuple from tvm.contrib.pickle_memoize import memoize +from tvm.topi.nn.depthwise_conv2d import _get_workload +from tvm.topi.x86.depthwise_conv2d import _fallback_schedule import tvm.testing @@ -116,8 +118,8 @@ def depthwise_conv2d_with_workload_nchw( if dilation == 1: # here we transform the padding argument from 'str' to 'tuple' , # because we need this to match the "workload" tuple to the records in TopHub - pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) - padding_args = (pad_h, pad_w) + padt, padl, padb, padr = get_pad_tuple(padding, (filter_height, filter_width)) + padding_args = (padt, padl, padb, padr) else: padding_args = padding @@ -205,6 +207,23 @@ def get_ref_data(): relu_scipy, ) = get_ref_data() + def verify_workload_padding(): + _, _, 
out_height, out_width = get_const_tuple(depthwise_conv2d_scipy.shape) + wkl = _get_workload( + Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype + ) + + # check if tile_ow candidates are the factors of the right output width. + with tvm.target.Target(device): + cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + + if "llvm" in device: + verify_workload_padding() + input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx)