From 94f5fd3969919213740e2b7e5f8e193333e5932a Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Sun, 20 Aug 2023 04:07:34 +0000 Subject: [PATCH 01/23] Squash commits together replace constant with appropriate block dimension WIP: add strides as a kernel parameter for non-packed tensors WIP: created non-packed variants for remaining 3D conv kernels fix: treat input and output as 6D NDHWGC tensor use std::call_once for initializing random seed revamp naive kernels to use strides in the non-packed tensors case (controlled by a bool flag) WIP: fixed kernel compilation issues but unable to load kernel code object WIP: fixed issue with hip rtc split channel strides into group, channels_per_group in solver fix indexing to left-to-right order fix bug with too much padding between kernel args num channels should be a multiple of num groups re-enable naive ref kernels with strides array 2D forward tests are all working now WIP: debugging bwd tests WIP: tests up till 3D bwd conv passing fix bug in bwd ndhwc kernel fix formatting disable prints fix readability-inconsistent-declaration-parameter-name fix clang-format fix hip tidy issue reverting the change to static init of random seed address comments and tidy issues. Remove extra print removed blank line change remove unneeded include env var for choosing packed vs non-packed reference kernel fix warnings from hip-tidy address comment about array initialization clear a tiny hip tidy issue --- src/include/miopen/hipoc_kernel.hpp | 15 +- .../miopen/solver/conv_direct_naive_conv.hpp | 159 +- .../gpu_reference_kernel/naive_conv.cpp | 1927 ++++++++++++----- src/solver/conv_direct_naive_conv.cpp | 58 +- src/solver/conv_direct_naive_conv_bwd.cpp | 47 +- src/solver/conv_direct_naive_conv_fwd.cpp | 32 + src/solver/conv_direct_naive_conv_wrw.cpp | 36 + test/gpu_reference_kernel.cpp | 3 +- 8 files changed, 1696 insertions(+), 581 deletions(-) diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index ba9992bab3..b18955b5c2 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -47,7 +47,7 @@ inline HipEventPtr make_hip_event() #if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 -#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 +#if 0 // Keep around other storage techinques -- @pfultz2 27.03.2017 template struct KernelArgsPair { @@ -65,9 +65,16 @@ struct KernelArgsPair template struct KernelArgsPair { - KernelArgsPair(T x, U y) : first(x), second(y) {} - T first; - U second; + static const int alignment = alignof(U); + static const int padding = (alignment - (sizeof(T) % alignment)) % alignment; + static_assert(padding >= 0, "padding cannot be negative"); + static const int second_index = sizeof(T) + padding; + KernelArgsPair(T x, U y) + { + new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) + new(buffer + second_index) U(y); + } + alignas(U) char buffer[second_index + sizeof(U)] = {}; }; #endif diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index f05bbdf712..927a64feff 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -25,9 +25,14 @@ *******************************************************************************/ #pragma once -#include #include +#include +#include +#include +#include +#include + namespace miopen { namespace solver { @@ -53,5 +58,157 @@ bool IsOutputBfp16(const 
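// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The new KernelArgsPair above packs argument U directly after T, inserting
// only as many pad bytes as U's alignment requires. Below is a self-contained
// restatement of that padding rule with hypothetical 1-byte/4-byte argument
// types; the trailing "% alignment" is what avoids inserting a full alignment
// quantum when sizeof(T) is already aligned ("fix bug with too much padding
// between kernel args" in the commit message above).
#include <cstdint>
template <typename T, typename U>
struct PairLayout
{
    static constexpr int alignment    = alignof(U);
    static constexpr int padding      = (alignment - (sizeof(T) % alignment)) % alignment;
    static constexpr int second_index = sizeof(T) + padding; // byte offset of U
};
static_assert(PairLayout<std::uint8_t, std::uint32_t>::padding == 3, "1B arg needs 3 pad bytes");
static_assert(PairLayout<std::uint32_t, std::uint32_t>::padding == 0, "aligned arg needs none");
// ------------------------------------------------------------------------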
ProblemDescription&); bool IsOutputInt8(const ProblemDescription&); bool IsOutputInt32(const ProblemDescription&); +int GetGroupStrideIndex(const ProblemDescription& problem); + +void printTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc); + +// TODO(Amber): Uncomment when hip RTC accepts std::array +// using StrideIndexType = int; +// using Strides3D = std::array; +// using Strides4D = std::array; +// using Strides5D = std::array; +// using Strides6D = std::array; +#if 1 +template +class MyArray +{ + T data_[N] = {}; + +public: + constexpr static const unsigned SIZE = N; + __host__ __device__ constexpr unsigned size() const { return N; } + + __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } + + __host__ T& operator[](unsigned i) { return data_[i]; } + + __host__ __device__ MyArray() = default; + __host__ __device__ MyArray(const MyArray&) = default; + __host__ __device__ MyArray(MyArray&&) noexcept = default; + __host__ __device__ MyArray& operator=(const MyArray&) = default; + __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; + __host__ __device__ ~MyArray() = default; +}; + +using StrideIndexType = int; +using Strides5D = MyArray; +using Strides6D = MyArray; + +#else + +extern "C" typedef int StrideIndexType; + +extern "C" typedef struct +{ + StrideIndexType v[5]; +} Strides5D; + +extern "C" typedef struct +{ + StrideIndexType v[6]; +} Strides6D; + +#endif + +namespace internal { +template +struct ChooseStride +{ +}; + +template <> +struct ChooseStride<5u> +{ + using type = Strides5D; +}; + +template <> +struct ChooseStride<6u> +{ + using type = Strides6D; +}; + +} // end namespace internal + +template +auto MakeStrideArray(V vec) +{ + typename internal::ChooseStride::type ret; + assert(vec.size() == N); + + // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. + // We sort the input stride vector so that smallest stride is at index 0. This + // (little-endian) order is what naive convolution kernel expects for strides + std::sort(vec.begin(), vec.end()); + + for(unsigned i = 0; i < N; ++i) + { + ret[i] = static_cast(vec[i]); + } + return ret; +} + +/** + * split the strides for C dimension in a tensor descriptor into (G, C_per_group). + * Normally, (in packed case) num channels is a multiplying factor in the stride of + * whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a + * factor. We output NGCHW for NCHW (and NHWGC for NHWC) + * where the stride[G] = stride[N] / num_groups + */ +template +V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx) +{ + assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size()); + // (G_stride_idx - 1) is the stride index of whatever lies to the left and + // contains C or K as a multiplying factor. 
We divide this value by num_groups + // to get G_stride_val + assert(orig_strides[G_stride_idx - 1] % num_groups == 0); + + V ret{orig_strides}; + auto G_stride_val = orig_strides[G_stride_idx - 1] / num_groups; + + ret.insert(ret.begin() + G_stride_idx, G_stride_val); + + return ret; +} + +/** + * Weight tensor has original dims: [K, C_per_group, Y, X] (2D case) + * We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X] + * Stride for G is computed as stride[C_per_group] * K_per_group and inserted at + * left most position + */ +template +V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) +{ + V ret{wei_strides}; + ret.insert(ret.begin(), wei_strides[0] * k_per_group); + return ret; +} + +template +void printStrideArray(const char* name, const StrideArray& sarr) +{ + printf("%s = [", name); + for(unsigned i = 0; i < StrideArray::SIZE; ++i) + { + printf("%d,", sarr[i]); + } + printf("]\n"); +} + +template +void printStrideArrays(const StrideArray& in_strides, + const StrideArray& wei_strides, + const StrideArray& out_strides) +{ + + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); +} + } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 24d7cd489e..6ffb0789c4 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -114,10 +114,120 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) return static_cast(val & 0xff); } -template +// TODO(Amber): this file is compiled via HIP RTC and includes don't work easily +// so currently duplicating content from miopen/common.hpp +// #include "miopen/common.hpp" +// #include +// TODO(Amber): HIP RTC redefines stuff from std library (I don't know why) +// #include +#if 1 +template +class MyArray +{ + T data_[N] = {}; + +public: + constexpr static const unsigned SIZE = N; + + __host__ __device__ constexpr unsigned size() const { return N; } + + __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } + + __host__ __device__ T& operator[](unsigned i) { return data_[i]; } + + __host__ __device__ MyArray() = default; + __host__ __device__ MyArray(const MyArray&) = default; + __host__ __device__ MyArray(MyArray&&) noexcept = default; + __host__ __device__ MyArray& operator=(const MyArray&) = default; + __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; + __host__ __device__ ~MyArray() = default; +}; + +using StrideIndexType = int; +using Strides5D = MyArray; +using Strides6D = MyArray; +#else + +extern "C" typedef int StrideIndexType; + +extern "C" typedef struct +{ + StrideIndexType v[5]; +} Strides5D; + +extern "C" typedef struct +{ + StrideIndexType v[6]; +} Strides6D; + +extern "C" __global__ void testKernel(void* ptr_a, + void* ptr_b, + void* ptr_c, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides) +{ + + if(blockIdx.x == 0 && threadIdx.x == 0) + { + printf("sizeof(Strides5D) = %lu\n", sizeof(Strides5D)); + printf("%p, %p, %p, %p\n", &ptr_a, &ptr_b, &ptr_c, &in_strides); + printf("in_strides = ["); + for(int i = 0; i < 5; ++i) + { + // printf("%d,", in_strides.v[i]); + printf("%d,", in_strides[i]); + } + printf("]\n"); + printf("wei_strides = ["); + for(int i = 0; i < 5; ++i) + { + // printf("%d,", wei_strides.v[i]); + printf("%d,", wei_strides[i]); + } + printf("]\n"); + 
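// ---- [editorial aside: worked example, not part of the patch] ----
// How the host-side stride helpers in conv_direct_naive_conv.hpp cooperate,
// on hypothetical sizes. A packed NCHW input with N=2, C=8, H=W=4, group=2
// has descriptor (big-endian) strides [C*H*W, H*W, W, 1] = [128, 16, 4, 1]:
#include <cassert>
#include <vector>
inline void strideHelperExample()
{
    std::vector<int> nchw{128, 16, 4, 1};
    // N's stride contains C as a factor, so G is split off at index 1 with
    // stride 128 / num_groups = 64, yielding NGCHW:
    auto ngchw = miopen::solver::SplitStrideCtoGC(/*num_groups=*/2, nchw, /*G_stride_idx=*/1);
    assert((ngchw == std::vector<int>{128, 64, 16, 4, 1}));
    // MakeStrideArray sorts ascending into the little-endian order the
    // kernels expect: index 0 is the fastest-moving dimension.
    auto arr = miopen::solver::MakeStrideArray<5>(ngchw); // [1, 4, 16, 64, 128]
    assert(arr[0] == 1 && arr[4] == 128);
    // Weights [K=4, C_per_group=4, 3, 3] with k_per_group=2: the G stride is
    // prepended as wei_strides[0] * k_per_group = 36 * 2 = 72.
    std::vector<int> wei{36, 9, 3, 1};
    auto gwei = miopen::solver::SplitWeiStrideKtoGK(/*k_per_group=*/2, wei);
    assert((gwei == std::vector<int>{72, 36, 9, 3, 1}));
}
// ------------------------------------------------------------------------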
printf("out_strides = ["); + for(int i = 0; i < 5; ++i) + { + // printf("%d,", out_strides.v[i]); + printf("%d,", out_strides[i]); + } + printf("]\n"); + } +} +#endif + +template +__device__ void printStrideArray(const char* name, const StrideArray& sarr) +{ + printf("%s = [", name); + for(int i = 0; i < StrideArray::SIZE; ++i) + { + printf("%d,", sarr[i]); + } + printf("]\n"); +} + +// TODO(Amber): remove template parameter 'bool ASSUME_PACKED' in a follow up PR +// Notes (Amber): +// * The following code used to assume that group (G) is an implicit +// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not +// true for non-packed case because group (G) dimension needs to have its stride +// explicitly specified for address math to make sense. This is also how +// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, +// and nhwc should be nhwgc. Same follows for the 3D case. +// * strides here are in the little-endian order, i.e., for NHWC, stride for N is +// at index 3 while stride for C is at index 0. This is reverse of how strides are +// stored in tensor descriptors, which are big-endian. + +// TODO(Amber): Rename nchw to ngchw +template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -135,6 +245,43 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int fx, int group) { + + // TODO(Amber): Remove this code + /* + if (blockIdx.x == 0 && threadIdx.x == 0) { + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); + + printf("modified strides\n"); + Strides5D in_strd; + Strides5D wei_strd; + Strides5D out_strd; + + in_strd[0] = 1; + in_strd[1] = wi; + in_strd[2] = hi * wi; + in_strd[3] = c_per_group * hi * wi; + in_strd[4] = group * c_per_group * hi * wi; + + wei_strd[0] = 1; + wei_strd[1] = fx; + wei_strd[2] = fy * fx; + wei_strd[3] = c_per_group * fy * fx; + wei_strd[4] = k_per_group * c_per_group * fy * fx; + + out_strd[0] = 1; + out_strd[1] = wo; + out_strd[2] = ho * wo; + out_strd[3] = k_per_group * ho * wo; + out_strd[4] = group * k_per_group * ho * wo; + + printStrideArray("in_strd", in_strd); + printStrideArray("wei_strd", wei_strd); + printStrideArray("out_strd", out_strd); + } + */ + /* * need to compute total output pixel: `group * n * k_per_group * ho * wo`. 
* to distribute this workload, let one workgroup compute `ho * wo` pixel, @@ -148,18 +295,37 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(in) * k * ho * wo + - static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += + static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * ho * wo + + static_cast(ig) * k_per_group * ho * wo + + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(ig) * out_strides[3] + + static_cast(ik) * out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iho = tid / wo; int iwo = tid % wo; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -178,25 +344,58 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t f_idx = static_cast(ic) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iho) * wo + static_cast(iwo); - p_out[o_idx] = cast_to(value); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -227,19 +426,36 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * hi * wi + - static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ic) * fy * fx; - p_out += - static_cast(in) * k * ho * wo + static_cast(ig) * 
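// ---- [editorial aside: sanity check, not part of the patch] ----
// For a packed NGCHW tensor the little-endian strides the solver passes in
// are {1, wi, hi*wi, c_per_group*hi*wi, group*c_per_group*hi*wi}, so the
// strided branch of the address math in these kernels reduces exactly to the
// packed branch. Spot-checking the base-pointer math with hypothetical sizes:
namespace naive_conv_stride_check {
constexpr long hi = 7, wi = 5, group = 2, c_per_group = 3;
constexpr long c  = group * c_per_group;
constexpr long in_strides[5] = {
    1, wi, hi * wi, c_per_group * hi * wi, group * c_per_group * hi * wi};
constexpr long in = 4, ig = 1; // example batch / group indices
static_assert(in * in_strides[4] + ig * in_strides[3] ==
                  in * c * hi * wi + ig * c_per_group * hi * wi,
              "strided math equals packed math on packed layouts");
} // namespace naive_conv_stride_check
// ------------------------------------------------------------------------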
k_per_group * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * hi * wi + + static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ic) * fy * fx; + + p_out += + static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3] + + static_cast(ic) * in_strides[2]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ic) * wei_strides[2]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ihi = tid / wi; int iwi = tid % wi; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) { @@ -264,26 +480,59 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(ik) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = static_cast(ik) * c_per_group * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(ihi) * in_strides[1] + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -315,18 +564,35 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += + static_cast(ig) * out_strides[3] + static_cast(ik) * 
out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int ic = tid / (fx * fy); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -345,28 +611,64 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * hi * wi + - static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t o_idx = static_cast(in) * k * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * hi * wi + + static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * ho * wo + + static_cast(iho) * wo + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + - static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -405,21 +707,38 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += static_cast(in) * out_strides[5] + + static_cast(ig) * out_strides[4] + + static_cast(ik) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid % wo; int iho = (tid / wo) % ho; int ido = tid / (ho * wo); - double 
value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -444,30 +763,67 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(ic) * di * hi * wi + - static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t f_idx = static_cast(ic) * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + - static_cast(iwo); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -506,21 +862,38 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi + - static_cast(ic) * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ic) * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi + + static_cast(ic) * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ic) * fz * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4] + + static_cast(ic) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ic) * wei_strides[3]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[4]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid % wi; int ihi = (tid / wi) % hi; int idi = tid / (hi * wi); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; 
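// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The "tid += blockDim.x" change in these loops (replacing the hard-coded
// 256) makes the work loops correct for any launch-time block size: with a
// constant stride, launching fewer than 256 threads would leave elements
// unprocessed, and launching more would process some elements redundantly.
// The block-stride idiom in isolation, as a minimal HIP kernel:
__global__ void block_stride_zero(float* __restrict__ dst, int tile_len)
{
    // each block owns one contiguous tile; its threads sweep the tile in
    // strides of the actual block size
    for(int i = threadIdx.x; i < tile_len; i += blockDim.x)
        dst[static_cast<size_t>(blockIdx.x) * tile_len + i] = 0.0f;
}
// ------------------------------------------------------------------------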
ik++) { @@ -554,30 +927,67 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(ik) * do_ * ho * wo + - static_cast(cur_do) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * do_ * ho * wo + + static_cast(cur_do) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = + static_cast(ik) * c_per_group * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[3] + + static_cast(cur_do) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + - static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(idi) * in_strides[2] + + static_cast(ihi) * in_strides[1] + + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -615,20 +1025,36 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; + + p_out += static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[4] + static_cast(ik) * out_strides[3]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int iz = (tid / (fx * fy)) % fz; int ic = tid / (fx * fy * fz); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -653,33 +1079,73 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * di * hi * wi + - static_cast(ic) * di * hi * wi + - 
static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t o_idx = static_cast(in) * k * do_ * ho * wo + - static_cast(ido) * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * di * hi * wi + + static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * do_ * ho * wo + + static_cast(ido) * ho * wo + + static_cast(iho) * wo + + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(ic) * fz * fy * fx + static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } /***************************** nhwc *****************************/ // design block_size 256 -template +template inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -711,17 +1177,33 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, int in = (bid / ho) % n; int ig = bid / (n * ho); - p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group + - static_cast(iho) * wo * k; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * ho * wo * k + static_cast(iho) * wo * k + + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid / k_per_group; int ik = tid % k_per_group; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -740,27 +1222,61 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fy * fx * 
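// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// In the nhwc kernels the sorted little-endian stride array maps indices to
// dimensions as [0]=C, [1]=G, [2]=W, [3]=H, [4]=N, which is why the strided
// branches read e.g. in_strides[0] for the channel step. Packed NHWGC
// strides for hypothetical sizes:
namespace nhwgc_stride_check {
constexpr int hi = 4, wi = 4, group = 2, c_per_group = 3;
constexpr int strides[5] = {1,                              // C
                            c_per_group,                    // G
                            group * c_per_group,            // W
                            group * c_per_group * wi,       // H
                            group * c_per_group * wi * hi}; // N
static_assert(strides[2] == 6 && strides[3] == 24 && strides[4] == 96,
              "packed NHWGC layout");
} // namespace nhwgc_stride_check
// ------------------------------------------------------------------------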
c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t f_idx = static_cast(ik) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iwo) * k + static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iwo) * k + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -778,6 +1294,41 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int fx, int group) { + /* + if (blockIdx.x == 0 && threadIdx.x == 0) { + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); + + printf("modified strides\n"); + Strides5D in_strd; + Strides5D wei_strd; + Strides5D out_strd; + + in_strd[0] = 1; + in_strd[1] = k_per_group; + in_strd[2] = group * k_per_group; + in_strd[3] = group * k_per_group * wo; + in_strd[4] = group * k_per_group * wo * ho; + + wei_strd[0] = 1; + wei_strd[1] = c_per_group; + wei_strd[2] = c_per_group * fx; + wei_strd[3] = c_per_group * fx * fy; + wei_strd[4] = c_per_group * fx * fy * k_per_group; + + out_strd[0] = 1; + out_strd[1] = c_per_group; + out_strd[2] = group * c_per_group; + out_strd[3] = group * c_per_group * wi; + out_strd[4] = group * c_per_group * wi * hi; + + printStrideArray("in_strd", in_strd); + printStrideArray("wei_strd", wei_strd); + printStrideArray("out_strd", out_strd); + } + */ + /* * need to compute total input pixel: `group * n * hi * wi * c_per_group`. 
* to distribute this workload, let one workgroup compute `wi * @@ -792,17 +1343,33 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int in = (bid / hi) % n; int ig = bid / (n * hi); - p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ihi) * in_strides[3] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; + + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid / c_per_group; int ic = tid % c_per_group; - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -827,27 +1394,61 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + static_cast(ik); - size_t f_idx = static_cast(ik) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = static_cast(ik) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(iwi) * c + static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(iwi) * c + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(iwi) * in_strides[2] + static_cast(ic) * in_strides[0]; + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -879,18 +1480,34 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group + - static_cast(ik) * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * 
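// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The backward kernels invert the forward index map
// h_in = h_out * sy - py + iy * dy: a filter tap iy contributes to input row
// ihi only if (ihi + py - iy * dy) is non-negative, divisible by the stride,
// and lands inside the output extent. A hypothetical host-side restatement
// of the validity predicate these kernels evaluate per tap:
inline bool output_tap_contributes(int ihi, int iy, int py, int sy, int dy, int ho)
{
    const int cur_h = ihi + py - iy * dy; // position before stride division
    if(cur_h < 0 || cur_h % sy != 0)      // must sit on an output grid point
        return false;
    return (cur_h / sy) < ho;             // and inside the output
}
// ------------------------------------------------------------------------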
k_per_group * fy * fx * c_per_group + + static_cast(ik) * fy * fx * c_per_group; + + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = tid / (c_per_group * fx); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -909,29 +1526,65 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t o_idx = static_cast(in) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + static_cast(ic); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -970,18 +1623,37 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, int in = (bid / do_) % n; int ig = bid / (n * do_); - p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ido) * ho * wo * k + - static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; + + p_out += static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + // dim order NDHWGC + // replace C and K with G * C_per_G and G * K_per_G + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[1]; + + // Assumes that group G is the highest 
dimension in the layout + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(ig) * out_strides[1]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ik = tid % k_per_group; int iwo = (tid / k_per_group) % wo; int iho = tid / (k_per_group * wo); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1005,30 +1677,69 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, { if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + - static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template + +template inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1052,6 +1763,44 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int fx, int group) { + /* + if (blockIdx.x == 0 && threadIdx.x == 0) { + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); + + printf("modified strides\n"); + Strides6D in_strd; + Strides6D wei_strd; + Strides6D out_strd; + + in_strd[0] = 1; + in_strd[1] = k_per_group; + in_strd[2] = group * k_per_group; + in_strd[3] = group * k_per_group * wo; + in_strd[4] = group * k_per_group * wo * ho; + in_strd[5] = group * k_per_group * wo * ho * do_; + + wei_strd[0] = 1; + wei_strd[1] = c_per_group; + wei_strd[2] = c_per_group * fx; + wei_strd[3] = c_per_group * fx * fy; + wei_strd[4] = c_per_group * fx * fy * fz; + wei_strd[5] = k_per_group * c_per_group * fx * fy * fz; + + out_strd[0] = 
1; + out_strd[1] = c_per_group; + out_strd[2] = group * c_per_group; + out_strd[3] = group * c_per_group * wi; + out_strd[4] = group * c_per_group * wi * hi; + out_strd[5] = group * c_per_group * wi * hi * di; + + printStrideArray("in_strd", in_strd); + printStrideArray("wei_strd", wei_strd); + printStrideArray("out_strd", out_strd); + } + */ + /* * need to compute total input pixel: `group * n * di * hi * wi * * c_per_group`. @@ -1067,18 +1816,35 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int in = (bid / di) % n; int ig = bid / (n * di); - p_in += static_cast(in) * di * hi * wi * c + static_cast(idi) * hi * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + + static_cast(idi) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(idi) * in_strides[4] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int iwi = (tid / c_per_group) % wi; int ihi = (tid / (c_per_group * wi)); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1111,32 +1877,69 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, { if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(cur_do) * ho * wo * k + - static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + - static_cast(ik); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_do) * ho * wo * k + + static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_do) * out_strides[4] + + static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + - static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(ihi) * in_strides[3] + + static_cast(iwi) * in_strides[2] 
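// ---- [editorial aside: illustrative sketch, not part of the patch] ----
// The 3D (ndhwc) kernels extend the same little-endian convention to six
// dimensions: [0]=C, [1]=G, [2]=W, [3]=H, [4]=D, [5]=N, matching the NDHWGC
// ordering mentioned in the comments above. Packed example with hypothetical
// sizes:
namespace ndhwgc_stride_check {
constexpr int di = 2, hi = 4, wi = 4, group = 2, c_per_group = 3;
constexpr int gc = group * c_per_group;
constexpr int strides[6] = {1, c_per_group, gc, gc * wi, gc * wi * hi, gc * wi * hi * di};
static_assert(strides[5] == 192, "packed NDHWGC layout: N stride = G*C*W*H*D");
} // namespace ndhwgc_stride_check
// ------------------------------------------------------------------------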
+ + static_cast(ic) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1174,19 +1977,35 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + - static_cast(ik) * fz * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + + static_cast(ik) * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = (tid / (c_per_group * fx)) % fy; int iz = (tid / (c_per_group * fx * fy)); - double value = .0f; + // double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -1211,374 +2030,336 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * di * hi * wi * c + - static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * do_ * ho * wo * k + - static_cast(ido) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * di * hi * wi * c + + static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t o_idx = static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + static_cast(ic); - p_wei[f_idx] = cast_to(value); - } -} - -#define DEFINE_2D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int 
c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } -#define DEFINE_2D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); -#define DEFINE_2D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; -#define DEFINE_3D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ + p_wei[f_idx] = cast_to(value); + } } +} -#define DEFINE_3D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int 
c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define DEFINE_2D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ } -#define DEFINE_3D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define DEFINE_3D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int 
k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ } -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, float) - -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, int8_t, int32_t, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, float) - -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, float, 
double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, float) + +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, ushort, double, ushort) + +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, ushort, double, ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, float) + +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, ushort, double, ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort) diff --git a/src/solver/conv_direct_naive_conv.cpp 
b/src/solver/conv_direct_naive_conv.cpp index 64c95257e6..425b65c6de 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "miopen/env.hpp" #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include namespace miopen { @@ -108,7 +110,15 @@ bool IsOutputInt32(const ProblemDescription& problem) std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; - kernel_name << "naive_conv_"; + if(miopen::IsEnvvarValueEnabled("MIOPEN_USE_PACKED_CONV_REF_KERNEL")) + { + kernel_name << "naive_conv_packed_"; + } + else + { + kernel_name << "naive_conv_nonpacked_"; + } + if(problem.direction.IsForward()) kernel_name << "fwd_"; else if(problem.direction.IsBackwardData()) @@ -173,6 +183,8 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) else MIOPEN_THROW("unsupported data type:"); + // TODO(Amber): Left for debugging. Will remove in the future. + // std::cout << "############ kernel_name = " << kernel_name.str() << std::endl; return kernel_name.str(); } @@ -244,5 +256,49 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx, return true; } +// figure out the index of C (channel) stride so we can expand it into +// (G, C_per_group). Return value G_stride_idx is the position of G stride +// in the stride vector, such that the (G_stride_idx - 1) is the index that +// contains C's stride as a multiplying factor +int GetGroupStrideIndex(const ProblemDescription& problem) +{ + int G_stride_idx = -1; + if(problem.IsLayoutDefault()) + { + G_stride_idx = 1; + } + else + { + assert(problem.IsLayoutNHWC()); + assert(problem.Is2d() || problem.Is3d()); + // + // G_stride_idx = problem.Is2d() ? 3 : 4; + // For NHWC, MIOpen stores strides in NCHW order, so we are interested in 1 + W's + // stride as that will be the value of G_stride_idx; + G_stride_idx = problem.Is2d() ? 
4 : 5; + } + assert(G_stride_idx != -1); + return G_stride_idx; +} + +void printTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc) +{ + + auto printOneStrideVec = [](const char* name, const auto& vec) { + printf("%s = [", name); + for(const size_t v : vec) + { + printf("%zu,", v); + } + printf("]\n"); + }; + + printOneStrideVec("inDesc = ", inDesc.GetStrides()); + printOneStrideVec("wDesc = ", wDesc.GetStrides()); + printOneStrideVec("outDesc = ", outDesc.GetStrides()); +} + } // namespace solver } // namespace miopen diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index c5d793860c..d0687b64ba 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -142,17 +142,35 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, }(); kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - if(is_f8) + auto in_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<5>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + // TODO(Amber): Someone made the silly decision of swapping in and + // out pointers in ConvTensors for backward pass, so now I have to + // pass out in place of in, out_strides in place of in_strides and + // vice-versa + if(is_f8) + { handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, hi, wi, n, @@ -172,10 +190,15 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, problem.GetConv().attribute.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic, problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } else + { handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, hi, wi, n, @@ -192,6 +215,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, fy, fx, group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); @@ -202,7 +226,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -210,9 +236,27 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<6>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + + // 
printTensorStrides(tensors.inDesc, tensors.wDesc, tensors.outDesc); + // printStrideArrays(in_strides, wei_strides, out_strides); + + // TODO(Amber): Someone made the silly decision of swapping in and + // out pointers in ConvTensors for backward pass, so now I have to + // pass out in place of in, out_strides in place of in_strides and + // vice-versa handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, di, hi, wi, @@ -245,6 +289,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index fc8d8e77fd..6f06a2251b 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -28,6 +28,7 @@ #include #include #include +#include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD) @@ -142,18 +143,33 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<5>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + if(is_f8) { handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -179,6 +195,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -206,7 +225,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -214,9 +235,19 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<6>( + SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -249,6 +280,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 2c85949ad4..60bbe90411 100644 --- 
a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -129,17 +129,33 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, return false; }(); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<5>( + SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = MakeStrideArray<5>( + SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + if(is_f8) + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -159,10 +175,15 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, problem.GetConv().attribute.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic, problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } else + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -179,6 +200,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, fy, fx, group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); @@ -189,7 +211,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -197,9 +221,20 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = MakeStrideArray<6>( + SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = MakeStrideArray<6>( + SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -232,6 +267,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index c3b26a80a9..aa3dda788d 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -95,7 +95,8 @@ struct gpu_reference_kernel_base static std::vector get_image_size() { return {9, 14}; } - static std::vector get_channel_size() { return {3, 8}; } + // Warning: Channel size must be multiple of group size + static std::vector get_channel_size() { return {4, 8}; } static std::vector get_filter_depth() { return {1, 3}; } From 5ab82f31807caf2c3710677732be0f34fb16b145 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Sat, 23 Sep 2023 11:51:34 +0000 Subject: [PATCH 02/23] fix formatting. 
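Each DEFINE_2D_NAIVE_CONV_KERNEL / DEFINE_3D_NAIVE_CONV_KERNEL instantiation above emits two extern "C" entry points: a naive_conv_packed_* variant that keeps the direct-index arithmetic, and a naive_conv_nonpacked_* variant that walks the Strides5D / Strides6D arrays. ConvDirectNaiveConvKernelName() then selects between the two at run time via the MIOPEN_USE_PACKED_CONV_REF_KERNEL environment variable, while the fp8 reference kernels (renamed below) keep only the packed variant for now. A minimal sketch of how one final name is assembled, for a forward fp32 NCHW problem; RefKernelName is a hypothetical helper, and the real function also covers bwd/wrw, the other layouts, and the remaining data types:

#include <sstream>
#include <string>

// Sketch only: mirrors the naming scheme of the instantiations above,
// e.g. naive_conv_packed_fwd_nchw_float_double_float.
std::string RefKernelName(bool use_packed_variant)
{
    std::ostringstream name;
    name << (use_packed_variant ? "naive_conv_packed_" : "naive_conv_nonpacked_");
    name << "fwd_"                 // direction: fwd_, bwd_ or wrw_
         << "nchw_"                // tensor layout
         << "float_double_float";  // src, accumulator and dst data types
    return name.str();
}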
disable strides for fp8 kernel for now --- src/kernels/gpu_reference_kernel/fp8_kern_types.h | 6 +++--- src/solver/conv_direct_naive_conv_bwd.cpp | 7 ++----- src/solver/conv_direct_naive_conv_fwd.cpp | 3 --- src/solver/conv_direct_naive_conv_wrw.cpp | 9 +++------ 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h index 3bac0a31f7..b14302e0c2 100644 --- a/src/kernels/gpu_reference_kernel/fp8_kern_types.h +++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h @@ -58,6 +58,6 @@ #define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE)) -#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX) -#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX) -#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX) +#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX) +#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX) +#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX) diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index d0687b64ba..3fe005e994 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -152,7 +152,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( + auto in_strides = MakeStrideArray<5>( SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 auto wei_strides = MakeStrideArray<5>( @@ -163,14 +163,11 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, // out pointers in ConvTensors for backward pass, so now I have to // pass out in place of in, out_strides in place of in_strides and // vice-versa - if(is_f8) + if(is_f8) { handle.Run(kern)(tensors.out, tensors.w, tensors.in, - out_strides, - wei_strides, - in_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 6f06a2251b..3d953de173 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -167,9 +167,6 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, - in_strides, - wei_strides, - out_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 60bbe90411..6d4f5e3126 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -153,9 +153,6 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, - in_strides, - wei_strides, - out_strides, hi, wi, n, @@ -181,9 +178,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, - in_strides, - wei_strides, - out_strides, + in_strides, + wei_strides, + out_strides, hi, wi, n, From be93522f0f111622a7b879bb7fd6d75c2ae51344 Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Sat, 23 Sep 2023 19:14:08 +0000 Subject: [PATCH 03/23] fix the lengths of weight tensor --- test/gtest/conv3d_test_case.hpp | 112 ++++++++++++++++++++++++++++++++ test/gtest/group_conv3d_bwd.cpp | 2 +- test/gtest/group_conv3d_bwd.hpp | 88 ++----------------------- test/gtest/group_conv3d_fwd.cpp | 2 +- test/gtest/group_conv3d_fwd.hpp | 88 ++----------------------- test/gtest/group_conv3d_wrw.cpp | 2 +- test/gtest/group_conv3d_wrw.hpp | 88 ++----------------------- test/gtest/group_solver.hpp | 6 +- 8 files changed, 132 insertions(+), 256 deletions(-) create mode 100644 test/gtest/conv3d_test_case.hpp diff --git a/test/gtest/conv3d_test_case.hpp b/test/gtest/conv3d_test_case.hpp new file mode 100644 index 0000000000..242615077f --- /dev/null +++ b/test/gtest/conv3d_test_case.hpp @@ -0,0 +1,112 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include + +#include "get_handle.hpp" +#include + +#include "../driver/tensor_driver.hpp" +#include "conv_common.hpp" + +template +miopenDataType_t GetDataType(); + +template <> +miopenDataType_t GetDataType() +{ + return miopenFloat; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenHalf; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenInt8; +} + +struct Conv3DTestCase +{ + size_t G; + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + size_t k; + size_t z; + size_t y; + size_t x; + size_t pad_x; + size_t pad_y; + size_t pad_z; + size_t stride_x; + size_t stride_y; + size_t stride_z; + size_t dilation_x; + size_t dilation_y; + size_t dilation_z; + miopenConvolutionMode_t conv_mode; + friend std::ostream& operator<<(std::ostream& os, const Conv3DTestCase& tc) + { + return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D + << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z + << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z + << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z + << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x + << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y + << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; + } + + std::vector GetInput() { return {N, C, D, H, W}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {k, C / G, z, y, x}; + } + + miopen::ConvolutionDescriptor GetConv() + { + return miopen::ConvolutionDescriptor{ + 3, + miopenConvolution, + miopenPaddingDefault, + {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, + {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, + {static_cast(dilation_z), + static_cast(dilation_y), + static_cast(dilation_x)}, + {0, 0, 0}, + static_cast(G), + 1.0}; + } +}; diff --git a/test/gtest/group_conv3d_bwd.cpp b/test/gtest/group_conv3d_bwd.cpp index 8e794749dd..6d0c3c678b 100644 --- a/test/gtest/group_conv3d_bwd.cpp +++ b/test/gtest/group_conv3d_bwd.cpp @@ -44,7 +44,7 @@ void SolverBwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_bwd.hpp b/test/gtest/group_conv3d_bwd.hpp index 410d71e6d0..71702c5808 100644 --- a/test/gtest/group_conv3d_bwd.hpp +++ b/test/gtest/group_conv3d_bwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t 
conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -133,7 +53,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvBwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -188,7 +108,7 @@ struct ConvBwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_fwd.cpp b/test/gtest/group_conv3d_fwd.cpp index 876f513258..d21bf6f67b 100644 --- a/test/gtest/group_conv3d_fwd.cpp +++ b/test/gtest/group_conv3d_fwd.cpp @@ -44,7 +44,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, Data_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_fwd.hpp b/test/gtest/group_conv3d_fwd.hpp index 983f897d78..c8767399a7 100644 --- a/test/gtest/group_conv3d_fwd.hpp +++ b/test/gtest/group_conv3d_fwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, 
const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -139,7 +59,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvFwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -195,7 +115,7 @@ struct ConvFwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_wrw.cpp b/test/gtest/group_conv3d_wrw.cpp index 0fae3402d8..398bc3becd 100644 --- a/test/gtest/group_conv3d_wrw.cpp +++ b/test/gtest/group_conv3d_wrw.cpp @@ -44,7 +44,7 @@ void SolverWrw(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, // dy const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { diff --git a/test/gtest/group_conv3d_wrw.hpp b/test/gtest/group_conv3d_wrw.hpp index 76d8ae5d90..bf5824b4fa 100644 --- a/test/gtest/group_conv3d_wrw.hpp +++ b/test/gtest/group_conv3d_wrw.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << 
tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -135,7 +55,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvWrwSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -191,7 +111,7 @@ struct ConvWrwSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_solver.hpp b/test/gtest/group_solver.hpp index 6fe02e00da..3d9ebddca3 100644 --- a/test/gtest/group_solver.hpp +++ b/test/gtest/group_solver.hpp @@ -80,7 +80,11 @@ struct ConvTestCase } std::vector GetInput() { return {N, C, H, W}; } - std::vector GetWeights() { return {k, C, y, x}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {k, C / G, y, x}; + } miopen::ConvolutionDescriptor GetConv() { From d85785b525435bbcc6f9e5d41dec5e3f0fabc521 Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Mon, 25 Sep 2023 15:09:57 +0000 Subject: [PATCH 04/23] use 64-bit integers for stride value --- src/include/miopen/solver/conv_direct_naive_conv.hpp | 2 +- src/kernels/gpu_reference_kernel/naive_conv.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 927a64feff..36ba8d6bc3 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -92,7 +92,7 @@ class MyArray __host__ __device__ ~MyArray() = default; }; -using StrideIndexType = int; +using StrideIndexType = size_t; using Strides5D = MyArray; using Strides6D = MyArray; diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 6ffb0789c4..8f2ab87cf6 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -143,7 +143,7 @@ class MyArray __host__ __device__ ~MyArray() = default; }; -using StrideIndexType = int; +using StrideIndexType = size_t; using Strides5D = MyArray; using Strides6D = MyArray; #else From ee6abb3d0e177bff55695fc7ad2ab9cf8f904aca Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Sun, 20 Aug 2023 04:07:34 +0000 Subject: [PATCH 05/23] Squash commits together replace constant with appropriate block dimension WIP: add strides as a kernel parameter for non-packed tensors WIP: created non-packed variants for remaining 3D conv kernels fix: treat input and output as 6D NDHWGC tensor use std::call_once for initializing random seed revamp naive kernels to use strides in the non-packed tensors case (controlled by a bool flag) WIP: fixed kernel compilation issues but unable to load kernel code object WIP: fixed issue with hip rtc split channel strides into group, channels_per_group in solver fix indexing to left-to-right order fix bug with too much padding between kernel args num channels should be a multiple of num groups re-enable naive ref kernels with strides array 2D forward tests are all working now WIP: debugging bwd tests WIP: tests up till 3D bwd conv passing fix bug in bwd ndhwc kernel fix formatting disable prints fix readability-inconsistent-declaration-parameter-name fix clang-format fix hip tidy issue reverting the change to static init of random seed address comments and tidy issues. 
Remove extra print removed blank line change remove unneeded include env var for choosing packed vs non-packed reference kernel fix warnings from hip-tidy address comment about array initialization clear a tiny hip tidy issue --- src/solver/conv_direct_naive_conv_bwd.cpp | 2 ++ src/solver/conv_direct_naive_conv_fwd.cpp | 2 ++ src/solver/conv_direct_naive_conv_wrw.cpp | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index 3fe005e994..cc30de3dff 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -144,6 +144,8 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 3d953de173..89f9e293dc 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -145,6 +145,8 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 6d4f5e3126..dc01ed93d4 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -131,6 +131,8 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = GetGroupStrideIndex(problem); + if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { From 4fbcd7716d589ff06976675946588bc74e0babbf Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Mon, 18 Sep 2023 13:23:11 +0000 Subject: [PATCH 06/23] fix test for non-packed strides --- driver/random.hpp | 1 + test/gpu_reference_kernel.cpp | 105 ++++++++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/driver/random.hpp b/driver/random.hpp index 6398048dde..45b46ce571 100644 --- a/driver/random.hpp +++ b/driver/random.hpp @@ -91,6 +91,7 @@ inline T gen_0_to_B(T B) template inline T gen_A_to_B(T A, T B) { + assert(B >= A); return gen_0_to_B(B - A) + A; } diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index aa3dda788d..6de633f776 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -24,6 +24,8 @@ * *******************************************************************************/ +#include +#include #include #include #include @@ -310,6 +312,26 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "n/a"; } +// input: a vector of lengths of dims in a tensor +// multiply each element with a random constant integer +void pad_tensor_strides(std::vector& strides) { + auto pvec = [] (const char* name, const auto& vec) { + std::cout << name << ": ["; + for (const auto& v: vec) { + std::cout << v << ", "; + } + std::cout << "]\n"; + }; + + pvec("orig strides", strides); + auto c = prng::gen_A_to_B(1, 3); + // int c = 2; + for (auto& v: strides) { + v = v * c; + } + pvec("new strides", strides); +} + template in_len({n, c, hi, wi}); std::vector wei_len({k, c_per_group, fy, fx}); @@ -362,9 +384,18 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); @@ -447,6 +478,13 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); #endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, @@ -518,9 +556,16 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(in_dev, + in.data.data(), + sizeof(TRef) * in_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), @@ -600,9 +645,16 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be 
packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); #endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), @@ -719,11 +771,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base int wo = conv_out_size(wi, px, dx, fx, sx); int do_ = conv_out_size(di, pz, dz, fz, sz); int c_per_group = c / g; - int k_per_group = k / g; + // int k_per_group = k / g; - int in_sz = g * n * c_per_group * di * hi * wi; - int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; - int out_sz = g * n * k_per_group * do_ * ho * wo; + // int in_sz = g * n * c_per_group * di * hi * wi; + // int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; + // int out_sz = g * n * k_per_group * do_ * ho * wo; std::vector in_len({n, c, di, hi, wi}); std::vector wei_len({k, c_per_group, fz, fy, fx}); @@ -740,9 +792,18 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + #if MIOPEN_BACKEND_OPENCL cl_context ctx; clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); @@ -821,6 +882,13 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, @@ -898,9 +966,15 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), @@ -980,9 +1054,16 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + // TODO(Amber): copy output before computation because output may + // be not be packed, and convolution may update only a subset of + // indices + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); #endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), From 82e0ccfda5f1de7a873ba7e6f60450cddb373174 Mon Sep 17 
00:00:00 2001 From: "M. Amber Hassaan" Date: Mon, 18 Sep 2023 13:24:43 +0000 Subject: [PATCH 07/23] fix format --- test/gpu_reference_kernel.cpp | 44 ++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 6de633f776..9dca52669c 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -314,22 +314,25 @@ static std::string miopen_type_to_string(miopenDataType_t type) // input: a vector of lengths of dims in a tensor // multiply each element with a random constant integer -void pad_tensor_strides(std::vector& strides) { - auto pvec = [] (const char* name, const auto& vec) { - std::cout << name << ": ["; - for (const auto& v: vec) { - std::cout << v << ", "; +void pad_tensor_strides(std::vector& strides) +{ + auto pvec = [](const char* name, const auto& vec) { + std::cout << name << ": ["; + for(const auto& v : vec) + { + std::cout << v << ", "; + } + std::cout << "]\n"; + }; + + pvec("orig strides", strides); + auto c = prng::gen_A_to_B(1, 3); + // int c = 2; + for(auto& v : strides) + { + v = v * c; } - std::cout << "]\n"; - }; - - pvec("orig strides", strides); - auto c = prng::gen_A_to_B(1, 3); - // int c = 2; - for (auto& v: strides) { - v = v * c; - } - pvec("new strides", strides); + pvec("new strides", strides); } template wei(wei_len, wei_strides); tensor out(out_len, out_strides); - auto in_sz = in.data.size(); + auto in_sz = in.data.size(); auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); @@ -559,10 +562,9 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base // TODO(Amber): copy output before computation because output may // be not be packed, and convolution may update only a subset of // indices - EXPECT(hipMemcpy(in_dev, - in.data.data(), - sizeof(TRef) * in_sz, - hipMemcpyHostToDevice) == hipSuccess); + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), sizeof(Tout) * out_sz, @@ -800,7 +802,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); - auto in_sz = in.data.size(); + auto in_sz = in.data.size(); auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); From 7e8a258abd57e9873d4cfa401e7913e3900683d1 Mon Sep 17 00:00:00 2001 From: amberhassaan Date: Mon, 18 Sep 2023 11:18:39 -0400 Subject: [PATCH 08/23] Fix assertion check. 
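The assertion corrected here guards the call made by pad_tensor_strides in the test changes above: prng::gen_A_to_B(1, 3) draws the stride-padding constant, and gen_0_to_B(B - A) needs a positive span, hence B > A rather than B >= A. The helper itself scales every stride by one shared constant, so dimension order is preserved while gaps open up in memory. A sketch with the constant pinned to 2 (pad_strides is a hypothetical stand-in; the real constant is random):

#include <cstddef>
#include <vector>

// Packed NCHW lengths {2, 8, 4, 4} have strides {128, 16, 4, 1};
// scaling by c = 2 yields {256, 32, 8, 2}: same dimension ordering,
// with the tensor now occupying roughly c times the memory.
std::vector<std::size_t> pad_strides(std::vector<std::size_t> strides, std::size_t c)
{
    for(auto& v : strides)
        v *= c;
    return strides;
}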
Co-authored-by: Alex Eremin --- driver/random.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/random.hpp b/driver/random.hpp index 45b46ce571..b3be81f56e 100644 --- a/driver/random.hpp +++ b/driver/random.hpp @@ -91,7 +91,7 @@ inline T gen_0_to_B(T B) template inline T gen_A_to_B(T A, T B) { - assert(B >= A); + assert(B > A); return gen_0_to_B(B - A) + A; } From cadfb9583040ce36ed5c7b8960bacffa053bcb8a Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Tue, 19 Sep 2023 18:13:40 -0700 Subject: [PATCH 09/23] suppress cppcheck warning to test CI --- test/gpu_reference_kernel.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 9dca52669c..31061c346c 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -330,6 +330,7 @@ void pad_tensor_strides(std::vector& strides) // int c = 2; for(auto& v : strides) { + // cppcheck-suppress useStlAlgorithm v = v * c; } pvec("new strides", strides); From cde6e22cfef5e611a90925461ecec0698e7e53c6 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Mon, 25 Sep 2023 15:52:56 +0000 Subject: [PATCH 10/23] fix build and remove a check that prevents non-strided inputs --- src/ocl/convolutionocl.cpp | 3 ++- src/solver/conv_direct_naive_conv_bwd.cpp | 10 ++-------- src/solver/conv_direct_naive_conv_fwd.cpp | 10 ++-------- src/solver/conv_direct_naive_conv_wrw.cpp | 10 ++-------- 4 files changed, 8 insertions(+), 25 deletions(-) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 07e6f28b19..d012125499 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -417,11 +417,12 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, size_t workSpaceSize) const { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); - +#if 0 if(!(xDesc.IsPacked() && wDesc.IsPacked() && yDesc.IsPacked())) { MIOPEN_THROW(miopenStatusNotImplemented, "Only fully packed tensors are supported"); } +#endif const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; ValidateConvTensors(tensors); diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index cc30de3dff..5bb765c56a 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -134,15 +134,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); - int G_stride_idx = GetGroupStrideIndex(problem); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 89f9e293dc..12ba295934 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -124,12 +124,6 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); 
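// Context for the `#if 0` in ConvolutionForward above: with the stride-aware
// reference kernels in place, non-packed descriptors become legal inputs, so
// the "fully packed tensors only" guard is retired. The property that guard
// relied on can be sketched as follows. This is an illustration, not
// MIOpen's actual IsPacked() implementation, and it assumes dimensions are
// listed slowest-varying first: a descriptor is packed when each stride
// equals the product of all faster-varying lengths.
#include <cstddef>
#include <vector>

inline bool is_packed_sketch(const std::vector<std::size_t>& lens,
                             const std::vector<std::size_t>& strides)
{
    std::size_t expected = 1;
    for(int i = static_cast<int>(lens.size()) - 1; i >= 0; --i)
    {
        if(strides[i] != expected)
            return false; // found a gap, e.g. strides padded by the tests
        expected *= lens[i];
    }
    return true;
}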
kernel.g_wk.clear(); @@ -141,9 +135,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); - int G_stride_idx = GetGroupStrideIndex(problem); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index dc01ed93d4..e9fb0dee57 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -121,15 +121,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); - int G_stride_idx = GetGroupStrideIndex(problem); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = GetGroupStrideIndex(problem); From e06c523aaccd7a460ef6b78678df9b348917c91b Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Tue, 26 Sep 2023 22:21:43 +0000 Subject: [PATCH 11/23] addressed comments. Moved common code into an include file --- src/CMakeLists.txt | 1 + src/include/miopen/hipoc_kernel.hpp | 31 +-- .../miopen/solver/conv_direct_naive_conv.hpp | 73 +----- .../gpu_reference_kernel/naive_conv.cpp | 219 ++---------------- src/kernels/stride_array.hpp | 86 +++++++ src/solver/conv_direct_naive_conv.cpp | 21 +- src/solver/conv_direct_naive_conv_bwd.cpp | 17 +- src/solver/conv_direct_naive_conv_fwd.cpp | 2 - 8 files changed, 133 insertions(+), 317 deletions(-) create mode 100644 src/kernels/stride_array.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffed2b4c8..f64feac4e5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -388,6 +388,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/workaround_issue_1431.hpp kernels/hip_f8_impl.hpp kernels/hip_float8.hpp + kernels/stride_array.hpp ) set(MIOPEN_KERNELS diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index b18955b5c2..938d20d9e6 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -26,14 +26,15 @@ #ifndef GUARD_MIOPEN_HIPOC_KERNEL_HPP #define GUARD_MIOPEN_HIPOC_KERNEL_HPP -#include -#include #include #include #include #include + +#include +#include +#include #include -#include namespace miopen { @@ -47,36 +48,20 @@ inline HipEventPtr make_hip_event() #if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 -#if 0 // Keep around other storage techinques -- @pfultz2 27.03.2017 -template -struct KernelArgsPair -{ - static const int alignment = sizeof(U); - static const int padding = (alignment - sizeof(T) % alignment) % alignment; - static const int second_index = sizeof(T) + padding; - KernelArgsPair(T x, U y) - { - new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) - new(buffer + second_index) U(y); - } - alignas(U) char buffer[second_index + sizeof(U)] = {}; -}; -#else template struct KernelArgsPair { - static const int alignment = alignof(U); - static const int padding = (alignment - (sizeof(T) % 
alignment)) % alignment; - static_assert(padding >= 0, "padding cannot be negative"); - static const int second_index = sizeof(T) + padding; + constexpr static const auto alignU = alignof(U); + constexpr static const auto padding = (alignU - (sizeof(T) % alignU)) % alignU; + constexpr static const auto second_index = sizeof(T) + padding; KernelArgsPair(T x, U y) { new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) new(buffer + second_index) U(y); } + alignas(U) char buffer[second_index + sizeof(U)] = {}; }; -#endif template struct KernelArgsPack; diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 31b399c409..0d0029d814 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -27,12 +27,13 @@ #include #include +#include "miopen/../../kernels/stride_array.hpp" #include #include +#include #include #include -#include namespace miopen { @@ -65,54 +66,6 @@ void printTensorStrides(const TensorDescriptor& inDesc, const TensorDescriptor& wDesc, const TensorDescriptor& outDesc); -// TODO(Amber): Uncomment when hip RTC accepts std::array -// using StrideIndexType = int; -// using Strides3D = std::array; -// using Strides4D = std::array; -// using Strides5D = std::array; -// using Strides6D = std::array; -#if 1 -template -class MyArray -{ - T data_[N] = {}; - -public: - constexpr static const unsigned SIZE = N; - __host__ __device__ constexpr unsigned size() const { return N; } - - __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } - - __host__ T& operator[](unsigned i) { return data_[i]; } - - __host__ __device__ MyArray() = default; - __host__ __device__ MyArray(const MyArray&) = default; - __host__ __device__ MyArray(MyArray&&) noexcept = default; - __host__ __device__ MyArray& operator=(const MyArray&) = default; - __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; - __host__ __device__ ~MyArray() = default; -}; - -using StrideIndexType = size_t; -using Strides5D = MyArray; -using Strides6D = MyArray; - -#else - -extern "C" typedef int StrideIndexType; - -extern "C" typedef struct -{ - StrideIndexType v[5]; -} Strides5D; - -extern "C" typedef struct -{ - StrideIndexType v[6]; -} Strides6D; - -#endif - namespace internal { template struct ChooseStride @@ -189,27 +142,5 @@ V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) return ret; } -template -void printStrideArray(const char* name, const StrideArray& sarr) -{ - printf("%s = [", name); - for(unsigned i = 0; i < StrideArray::SIZE; ++i) - { - printf("%d,", sarr[i]); - } - printf("]\n"); -} - -template -void printStrideArrays(const StrideArray& in_strides, - const StrideArray& wei_strides, - const StrideArray& out_strides) -{ - - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); -} - } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 8f2ab87cf6..60c3583e77 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -46,6 +46,8 @@ typedef float float_t; #endif #endif // __HIPCC_RTC__ +#include "stride_array.hpp" + // hcc seems need __device__ __host__ together to compile, and no extern "C" typedef union value_bf16_fp32_t { @@ -114,13 +116,8 @@ inline __device__ __host__ int8_t 
cast_to(const int32_t& val) return static_cast(val & 0xff); } +#if 0 // TODO(Amber): this file is compiled via HIP RTC and includes don't work easily -// so currently duplicating content from miopen/common.hpp -// #include "miopen/common.hpp" -// #include -// TODO(Amber): HIP RTC redefines stuff from std library (I don't know why) -// #include -#if 1 template class MyArray { @@ -146,81 +143,22 @@ class MyArray using StrideIndexType = size_t; using Strides5D = MyArray; using Strides6D = MyArray; -#else - -extern "C" typedef int StrideIndexType; - -extern "C" typedef struct -{ - StrideIndexType v[5]; -} Strides5D; - -extern "C" typedef struct -{ - StrideIndexType v[6]; -} Strides6D; - -extern "C" __global__ void testKernel(void* ptr_a, - void* ptr_b, - void* ptr_c, - Strides5D in_strides, - Strides5D wei_strides, - Strides5D out_strides) -{ - - if(blockIdx.x == 0 && threadIdx.x == 0) - { - printf("sizeof(Strides5D) = %lu\n", sizeof(Strides5D)); - printf("%p, %p, %p, %p\n", &ptr_a, &ptr_b, &ptr_c, &in_strides); - printf("in_strides = ["); - for(int i = 0; i < 5; ++i) - { - // printf("%d,", in_strides.v[i]); - printf("%d,", in_strides[i]); - } - printf("]\n"); - printf("wei_strides = ["); - for(int i = 0; i < 5; ++i) - { - // printf("%d,", wei_strides.v[i]); - printf("%d,", wei_strides[i]); - } - printf("]\n"); - printf("out_strides = ["); - for(int i = 0; i < 5; ++i) - { - // printf("%d,", out_strides.v[i]); - printf("%d,", out_strides[i]); - } - printf("]\n"); - } -} #endif -template -__device__ void printStrideArray(const char* name, const StrideArray& sarr) -{ - printf("%s = [", name); - for(int i = 0; i < StrideArray::SIZE; ++i) - { - printf("%d,", sarr[i]); - } - printf("]\n"); -} +/// \todo remove template parameter 'bool ASSUME_PACKED' in a follow up PR +/// --amberhassaan +/// Notes (Amber): +/// - The following code used to assume that group (G) is an implicit +/// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not +/// true for non-packed case because group (G) dimension needs to have its stride +/// explicitly specified for address math to make sense. This is also how +/// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, +/// and nhwc should be nhwgc. Same follows for the 3D case. +/// +/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is reverse of how strides are +/// stored in tensor descriptors, which are big-endian. -// TODO(Amber): remove template parameter 'bool ASSUME_PACKED' in a follow up PR -// Notes (Amber): -// * The following code used to assume that group (G) is an implicit -// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not -// true for non-packed case because group (G) dimension needs to have its stride -// explicitly specified for address math to make sense. This is also how -// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, -// and nhwc should be nhwgc. Same follows for the 3D case. -// * strides here are in the little-endian order, i.e., for NHWC, stride for N is -// at index 3 while stride for C is at index 0. This is reverse of how strides are -// stored in tensor descriptors, which are big-endian. 
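To make the stride convention above concrete, here is a minimal editor's sketch (not part of the patch; the function name and tensor sizes are hypothetical) of how a kernel computes an element offset from a little-endian strides array for an NHWGC tensor:

#include <cstddef>

// Little-endian strides put the fastest-varying dimension first, so for
// NHWGC the order is C, G, W, H, N. For a packed tensor with n=2, h=3,
// w=4, g=2, c_per_group=8 the strides would be {1, 8, 16, 64, 192}.
inline size_t nhwgc_offset(
    const size_t strides[5], size_t in, size_t ih, size_t iw, size_t ig, size_t ic)
{
    return ic * strides[0] + ig * strides[1] + iw * strides[2] + ih * strides[3] +
           in * strides[4];
}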
- -// TODO(Amber): Rename nchw to ngchw template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, @@ -245,43 +183,6 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int fx, int group) { - - // TODO(Amber): Remove this code - /* - if (blockIdx.x == 0 && threadIdx.x == 0) { - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); - - printf("modified strides\n"); - Strides5D in_strd; - Strides5D wei_strd; - Strides5D out_strd; - - in_strd[0] = 1; - in_strd[1] = wi; - in_strd[2] = hi * wi; - in_strd[3] = c_per_group * hi * wi; - in_strd[4] = group * c_per_group * hi * wi; - - wei_strd[0] = 1; - wei_strd[1] = fx; - wei_strd[2] = fy * fx; - wei_strd[3] = c_per_group * fy * fx; - wei_strd[4] = k_per_group * c_per_group * fy * fx; - - out_strd[0] = 1; - out_strd[1] = wo; - out_strd[2] = ho * wo; - out_strd[3] = k_per_group * ho * wo; - out_strd[4] = group * k_per_group * ho * wo; - - printStrideArray("in_strd", in_strd); - printStrideArray("wei_strd", wei_strd); - printStrideArray("out_strd", out_strd); - } - */ - /* * need to compute total output pixel: `group * n * k_per_group * ho * wo`. * to distribute this workload, let one workgroup compute `ho * wo` pixel, @@ -324,7 +225,6 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int iho = tid / wo; int iwo = tid % wo; - // double value = .0f; acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) @@ -454,7 +354,6 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, int ihi = tid / wi; int iwi = tid % wi; - // double value = .0f; acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) @@ -591,7 +490,6 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, int iy = (tid / fx) % fy; int ic = tid / (fx * fy); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -737,7 +635,6 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, int iho = (tid / wo) % ho; int ido = tid / (ho * wo); - // double value = .0f; acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) @@ -892,7 +789,6 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, int ihi = (tid / wi) % hi; int idi = tid / (hi * wi); - // double value = .0f; acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) @@ -1053,7 +949,6 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, int iz = (tid / (fx * fy)) % fz; int ic = tid / (fx * fy * fz); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -1202,7 +1097,6 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, int iwo = tid / k_per_group; int ik = tid % k_per_group; - // double value = .0f; acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) @@ -1294,41 +1188,6 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int fx, int group) { - /* - if (blockIdx.x == 0 && threadIdx.x == 0) { - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); - - printf("modified strides\n"); - Strides5D in_strd; - Strides5D wei_strd; - Strides5D out_strd; - - in_strd[0] = 1; - in_strd[1] = k_per_group; - in_strd[2] = group * k_per_group; - in_strd[3] = group * k_per_group * wo; - in_strd[4] = group * 
k_per_group * wo * ho; - - wei_strd[0] = 1; - wei_strd[1] = c_per_group; - wei_strd[2] = c_per_group * fx; - wei_strd[3] = c_per_group * fx * fy; - wei_strd[4] = c_per_group * fx * fy * k_per_group; - - out_strd[0] = 1; - out_strd[1] = c_per_group; - out_strd[2] = group * c_per_group; - out_strd[3] = group * c_per_group * wi; - out_strd[4] = group * c_per_group * wi * hi; - - printStrideArray("in_strd", in_strd); - printStrideArray("wei_strd", wei_strd); - printStrideArray("out_strd", out_strd); - } - */ - /* * need to compute total input pixel: `group * n * hi * wi * c_per_group`. * to distribute this workload, let one workgroup compute `wi * @@ -1368,7 +1227,6 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int iwi = tid / c_per_group; int ic = tid % c_per_group; - // double value = .0f; acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) @@ -1506,7 +1364,6 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, int ix = (tid / c_per_group) % fx; int iy = tid / (c_per_group * fx); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -1652,7 +1509,6 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, int iwo = (tid / k_per_group) % wo; int iho = tid / (k_per_group * wo); - // double value = .0f; acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) @@ -1763,43 +1619,6 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int fx, int group) { - /* - if (blockIdx.x == 0 && threadIdx.x == 0) { - printStrideArray("in_strides", in_strides); - printStrideArray("wei_strides", wei_strides); - printStrideArray("out_strides", out_strides); - - printf("modified strides\n"); - Strides6D in_strd; - Strides6D wei_strd; - Strides6D out_strd; - - in_strd[0] = 1; - in_strd[1] = k_per_group; - in_strd[2] = group * k_per_group; - in_strd[3] = group * k_per_group * wo; - in_strd[4] = group * k_per_group * wo * ho; - in_strd[5] = group * k_per_group * wo * ho * do_; - - wei_strd[0] = 1; - wei_strd[1] = c_per_group; - wei_strd[2] = c_per_group * fx; - wei_strd[3] = c_per_group * fx * fy; - wei_strd[4] = c_per_group * fx * fy * fz; - wei_strd[5] = k_per_group * c_per_group * fx * fy * fz; - - out_strd[0] = 1; - out_strd[1] = c_per_group; - out_strd[2] = group * c_per_group; - out_strd[3] = group * c_per_group * wi; - out_strd[4] = group * c_per_group * wi * hi; - out_strd[5] = group * c_per_group * wi * hi * di; - - printStrideArray("in_strd", in_strd); - printStrideArray("wei_strd", wei_strd); - printStrideArray("out_strd", out_strd); - } - */ /* * need to compute total input pixel: `group * n * di * hi * wi * @@ -1843,7 +1662,6 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int iwi = (tid / c_per_group) % wi; int ihi = (tid / (c_per_group * wi)); - // double value = .0f; acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) @@ -2004,7 +1822,6 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, int iy = (tid / (c_per_group * fx)) % fy; int iz = (tid / (c_per_group * fx * fy)); - // double value = .0f; acc_data_t value = 0; for(int in = 0; in < n; in++) @@ -2363,3 +2180,7 @@ DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort) DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float) DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half) DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort) + +/// \todo discuss whether we should split the kernels into separate files, or +/// figure 
out a mechanism to compile each kernel separately to reduce hipRTC +/// compilation times. --amberhassaan diff --git a/src/kernels/stride_array.hpp b/src/kernels/stride_array.hpp new file mode 100644 index 0000000000..32cb1f85b6 --- /dev/null +++ b/src/kernels/stride_array.hpp @@ -0,0 +1,86 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#ifdef __HIPCC_RTC__ +#ifndef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE +#include +#endif +#endif // __HIPCC_RTC__ + +/// \todo Uncomment when hip RTC accepts std::array -- amberhassaan +// #include +// using StrideIndexType = int; +// using Strides3D = std::array; +// using Strides4D = std::array; +// using Strides5D = std::array; +// using Strides6D = std::array; +template +class MyArray +{ + T data_[N] = {}; + +public: + constexpr static const unsigned SIZE = N; + __host__ __device__ constexpr unsigned size() const { return N; } + + __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } + + __host__ T& operator[](unsigned i) { return data_[i]; } + + __host__ __device__ MyArray() = default; + __host__ __device__ MyArray(const MyArray&) = default; + __host__ __device__ MyArray(MyArray&&) noexcept = default; + __host__ __device__ MyArray& operator=(const MyArray&) = default; + __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; + __host__ __device__ ~MyArray() = default; +}; + +using StrideIndexType = size_t; +using Strides5D = MyArray; +using Strides6D = MyArray; + +template +__host__ __device__ void printStrideArray(const char* name, const StrideArray& sarr) +{ + printf("%s = [", name); + for(int i = 0; i < StrideArray::SIZE; ++i) + { + printf("%zu,", sarr[i]); + } + printf("]\n"); +} + +template +__host__ __device__ void printStrideArrays(const StrideArray& in_strides, + const StrideArray& wei_strides, + const StrideArray& out_strides) +{ + + printStrideArray("in_strides", in_strides); + printStrideArray("wei_strides", wei_strides); + printStrideArray("out_strides", out_strides); +} diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index a0d6637978..e39ee06a66 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -33,7 +33,6 @@ #include #include 
#include -#include namespace miopen { @@ -107,10 +106,12 @@ bool IsOutputInt32(const ProblemDescription& problem) problem.GetOutDataType() == miopenInt32; } +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); + std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; - if(miopen::IsEnvvarValueEnabled("MIOPEN_USE_PACKED_CONV_REF_KERNEL")) + if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } @@ -183,8 +184,6 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) else MIOPEN_THROW("unsupported data type:"); - // TODO(Amber): Left for debugging. Will remove in the future. - // std::cout << "############ kernel_name = " << kernel_name.str() << std::endl; return kernel_name.str(); } @@ -256,10 +255,10 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx, return true; } -// figure out the index of C (channel) stride so we can expand it into -// (G, C_per_group). Return value G_stride_idx is the position of G stride -// in the stride vector, such that the (G_stride_idx - 1) is the index that -// contains C's stride as a multiplying factor +/// Figure out the index of C (channel) stride so we can expand it into +/// (G, C_per_group). Return value G_stride_idx is the position of G stride +/// in the stride vector, such that the (G_stride_idx - 1) is the index that +/// contains C's stride as a multiplying factor int GetGroupStrideIndex(const ProblemDescription& problem) { int G_stride_idx = -1; @@ -287,12 +286,12 @@ void printTensorStrides(const TensorDescriptor& inDesc, { auto printOneStrideVec = [](const char* name, const auto& vec) { - printf("%s = [", name); + MIOPEN_LOG_I(name << " = ["); for(const size_t v : vec) { - printf("%zu,", v); + MIOPEN_LOG_I(v << ","); } - printf("]\n"); + MIOPEN_LOG_I("]\n"); }; printOneStrideVec("inDesc = ", inDesc.GetStrides()); diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index d0d5f579a1..c0aa1385f8 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -159,10 +159,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); auto out_strides = MakeStrideArray<5>( SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); - // TODO(Amber): Someone made the silly decision of swapping in and - // out pointers in ConvTensors for backward pass, so now I have to - // pass out in place of in, out_strides in place of in_strides and - // vice-versa + /// \ref backward_tensors_reversed_why if(is_f8) { handle.Run(kern)(tensors.out, @@ -241,13 +238,11 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, auto out_strides = MakeStrideArray<6>( SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); - // printTensorStrides(tensors.inDesc, tensors.wDesc, tensors.outDesc); - // printStrideArrays(in_strides, wei_strides, out_strides); - - // TODO(Amber): Someone made the silly decision of swapping in and - // out pointers in ConvTensors for backward pass, so now I have to - // pass out in place of in, out_strides in place of in_strides and - // vice-versa + /// \anchor backward_tensors_reversed_why + /// \todo Someone made the silly decision of swapping in and + /// out pointers in ConvTensors for backward pass, so now I have to + /// pass out in 
place of in, out_strides in place of in_strides and + /// vice-versa --amberhassaan handle.Run(kern)(tensors.out, tensors.w, tensors.in, diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 49cbe49a1c..11ac4dd191 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -27,8 +27,6 @@ #include #include #include -#include -#include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD) From 67d9a77f0810e4672ecec65e1ebc591c06ff36f6 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Tue, 26 Sep 2023 22:44:33 +0000 Subject: [PATCH 12/23] address comments --- src/ocl/convolutionocl.cpp | 6 ------ test/gpu_reference_kernel.cpp | 37 +++++++++-------------------------- 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 741f499703..48e14e3f30 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -416,12 +416,6 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, size_t workSpaceSize) const { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); -#if 0 - if(!(xDesc.IsPacked() && wDesc.IsPacked() && yDesc.IsPacked())) - { - MIOPEN_THROW(miopenStatusNotImplemented, "Only fully packed tensors are supported"); - } -#endif const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; ValidateConvTensors(tensors); diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 31061c346c..13eae6f01e 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -367,11 +367,6 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base int ho = conv_out_size(hi, py, dy, fy, sy); int wo = conv_out_size(wi, px, dx, fx, sx); int c_per_group = c / g; - // int k_per_group = k / g; - - // int in_sz = g * n * c_per_group * hi * wi; - // int wei_sz = g * k_per_group * c_per_group * fy * fx; - // int out_sz = g * n * k_per_group * ho * wo; std::vector in_len({n, c, hi, wi}); std::vector wei_len({k, c_per_group, fy, fx}); @@ -482,9 +477,10 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \anchor copy_non_packed_output_before_convolution + /// \note copy output before computation because output may + /// be not be packed, and convolution may update only a subset of + /// indices EXPECT(hipMemcpy(out_dev, out.data.data(), sizeof(Tout) * out_sz, @@ -560,9 +556,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -648,9 +642,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy(wei_dev, wei.data.data(), 
sizeof(TRef) * wei_sz, @@ -774,11 +766,6 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base int wo = conv_out_size(wi, px, dx, fx, sx); int do_ = conv_out_size(di, pz, dz, fz, sz); int c_per_group = c / g; - // int k_per_group = k / g; - - // int in_sz = g * n * c_per_group * di * hi * wi; - // int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; - // int out_sz = g * n * k_per_group * do_ * ho * wo; std::vector in_len({n, c, di, hi, wi}); std::vector wei_len({k, c_per_group, fz, fy, fx}); @@ -885,9 +872,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy(out_dev, out.data.data(), sizeof(Tout) * out_sz, @@ -969,9 +954,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base nullptr); EXPECT(status == CL_SUCCESS); #elif MIOPEN_BACKEND_HIP - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -1057,9 +1040,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); - // TODO(Amber): copy output before computation because output may - // be not be packed, and convolution may update only a subset of - // indices + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, From 0f16c625af2debfb9e757661a22199a029c11b3a Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Thu, 28 Sep 2023 17:30:39 +0000 Subject: [PATCH 13/23] address review comments --- src/include/miopen/hipoc_kernel.hpp | 6 +- .../miopen/solver/conv_direct_naive_conv.hpp | 87 ++++++++++--------- .../gpu_reference_kernel/naive_conv.cpp | 29 ------- src/solver/conv_direct_naive_conv.cpp | 8 +- src/solver/conv_direct_naive_conv_bwd.cpp | 28 +++--- src/solver/conv_direct_naive_conv_fwd.cpp | 28 +++--- src/solver/conv_direct_naive_conv_wrw.cpp | 28 +++--- 7 files changed, 98 insertions(+), 116 deletions(-) diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index 938d20d9e6..73ac77f160 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -51,9 +51,9 @@ inline HipEventPtr make_hip_event() template struct KernelArgsPair { - constexpr static const auto alignU = alignof(U); - constexpr static const auto padding = (alignU - (sizeof(T) % alignU)) % alignU; - constexpr static const auto second_index = sizeof(T) + padding; + constexpr static auto alignU = alignof(U); + constexpr static auto padding = (alignU - (sizeof(T) % alignU)) % alignU; + constexpr static auto second_index = sizeof(T) + padding; KernelArgsPair(T x, U y) { new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 0d0029d814..6d935b249d 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -60,49 +60,18 @@ bool IsOutputBfp16(const ProblemDescription&); bool IsOutputInt8(const ProblemDescription&); bool IsOutputInt32(const ProblemDescription&); -int GetGroupStrideIndex(const ProblemDescription& problem); - -void printTensorStrides(const TensorDescriptor& inDesc, - const TensorDescriptor& wDesc, - const TensorDescriptor& outDesc); - -namespace internal { -template -struct ChooseStride -{ -}; - -template <> -struct ChooseStride<5u> -{ - using type = Strides5D; -}; - -template <> -struct ChooseStride<6u> -{ - using type = Strides6D; -}; - -} // end namespace internal +namespace conv_internal { -template -auto MakeStrideArray(V vec) -{ - typename internal::ChooseStride::type ret; - assert(vec.size() == N); - - // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. - // We sort the input stride vector so that smallest stride is at index 0. This - // (little-endian) order is what naive convolution kernel expects for strides - std::sort(vec.begin(), vec.end()); +void DebugPrintTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc); - for(unsigned i = 0; i < N; ++i) - { - ret[i] = static_cast(vec[i]); - } - return ret; -} +/** + * Get the index where group (G) stride should go. For NCHW, we want to convert + * its strides to NGCHW, and for NHWC, we want to convert its strides to NHWGC. + * Same applies for the 3D case. + */ +int GetGroupStrideIndex(const ProblemDescription& problem); /** * split the strides for C dimension in a tensor descriptor into (G, C_per_group). 
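As a worked example of the (G, C_per_group) split described above (an editor's sketch under the header's stated semantics, not the patch's exact code -- the real implementation is SplitStrideCtoGC below): a packed NCHW input with N=2, C=8, H=3, W=3 and num_groups=2 has descriptor strides {72, 9, 3, 1}; with G_stride_idx = 1 these become NGCHW strides {72, 36, 9, 3, 1}, where 36 = 72 / 2 is the new G stride.

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<size_t> split_stride_c_to_gc(int num_groups,
                                         const std::vector<size_t>& strides,
                                         int g_stride_idx)
{
    assert(g_stride_idx > 0 && static_cast<size_t>(g_stride_idx) <= strides.size());
    // keep everything left of C, ...
    std::vector<size_t> ret(strides.begin(), strides.begin() + g_stride_idx);
    // ... insert G, whose stride is the stride left of C divided by group count, ...
    ret.push_back(strides[g_stride_idx - 1] / num_groups);
    // ... then keep C (now C_per_group) and everything to its right unchanged.
    ret.insert(ret.end(), strides.begin() + g_stride_idx, strides.end());
    return ret;
}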
@@ -142,5 +111,41 @@ V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) return ret; } +template +struct ChooseStride +{ +}; + +template <> +struct ChooseStride<5u> +{ + using type = Strides5D; +}; + +template <> +struct ChooseStride<6u> +{ + using type = Strides6D; +}; + +template +auto MakeStrideArray(V vec) +{ + typename ChooseStride::type ret; + assert(vec.size() == N); + + // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. + // We sort the input stride vector so that smallest stride is at index 0. This + // (little-endian) order is what naive convolution kernel expects for strides + std::sort(vec.begin(), vec.end()); + + for(unsigned i = 0; i < N; ++i) + { + ret[i] = static_cast(vec[i]); + } + return ret; +} +} // end namespace conv_internal + } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 60c3583e77..b243b1234a 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -116,35 +116,6 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) return static_cast(val & 0xff); } -#if 0 -// TODO(Amber): this file is compiled via HIP RTC and includes don't work easily -template -class MyArray -{ - T data_[N] = {}; - -public: - constexpr static const unsigned SIZE = N; - - __host__ __device__ constexpr unsigned size() const { return N; } - - __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; } - - __host__ __device__ T& operator[](unsigned i) { return data_[i]; } - - __host__ __device__ MyArray() = default; - __host__ __device__ MyArray(const MyArray&) = default; - __host__ __device__ MyArray(MyArray&&) noexcept = default; - __host__ __device__ MyArray& operator=(const MyArray&) = default; - __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default; - __host__ __device__ ~MyArray() = default; -}; - -using StrideIndexType = size_t; -using Strides5D = MyArray; -using Strides6D = MyArray; -#endif - /// \todo remove template parameter 'bool ASSUME_PACKED' in a follow up PR /// --amberhassaan /// Notes (Amber): diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index e39ee06a66..63c308c0fb 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -259,7 +259,7 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx, /// (G, C_per_group). 
Return value G_stride_idx is the position of G stride /// in the stride vector, such that the (G_stride_idx - 1) is the index that /// contains C's stride as a multiplying factor -int GetGroupStrideIndex(const ProblemDescription& problem) +int conv_internal::GetGroupStrideIndex(const ProblemDescription& problem) { int G_stride_idx = -1; if(problem.IsLayoutDefault()) @@ -280,9 +280,9 @@ int GetGroupStrideIndex(const ProblemDescription& problem) return G_stride_idx; } -void printTensorStrides(const TensorDescriptor& inDesc, - const TensorDescriptor& wDesc, - const TensorDescriptor& outDesc) +void conv_internal::DebugPrintTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc) { auto printOneStrideVec = [](const char* name, const auto& vec) { diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index c0aa1385f8..1a28f8aae6 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -142,7 +142,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, }(); kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); if(problem.Is2d()) { @@ -152,13 +152,14 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<5>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); /// \ref backward_tensors_reversed_why if(is_f8) { @@ -230,13 +231,14 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<6>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); /// \anchor backward_tensors_reversed_why /// \todo Someone made the silly decision of swapping in and diff --git 
a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 11ac4dd191..a4656d929a 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -141,7 +141,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); if(problem.Is2d()) { @@ -152,13 +152,14 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<5>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); if(is_f8) { @@ -230,13 +231,14 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.inDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<6>( - SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); - auto out_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.outDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.in, tensors.w, tensors.out, diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index d02e1fbdb5..dfe1c342b0 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -129,7 +129,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, return false; }(); - int G_stride_idx = GetGroupStrideIndex(problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); if(problem.Is2d()) { @@ -140,13 +140,14 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = 
MakeStrideArray<5>( - SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); - auto out_strides = MakeStrideArray<5>( - SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); if(is_f8) { @@ -218,13 +219,14 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; - auto in_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.xDesc.GetStrides(), G_stride_idx)); + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); // For weights, we split K to (G, K_per_group), which is always index 0 - auto wei_strides = MakeStrideArray<6>( - SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); - auto out_strides = MakeStrideArray<6>( - SplitStrideCtoGC(group, tensors.dyDesc.GetStrides(), G_stride_idx)); + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.x, tensors.dw, From dec88a7ca57fa71f645c5ba81a601e0a2a273c4c Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Fri, 29 Sep 2023 08:28:03 +0000 Subject: [PATCH 14/23] add more checks for strides --- src/include/miopen/convolution.hpp | 4 ++ src/ocl/convolutionocl.cpp | 103 ++++++++++++++++++++++------- test/gpu_reference_kernel.cpp | 41 +++++++----- 3 files changed, 106 insertions(+), 42 deletions(-) diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp index bac0133106..d94211ed2e 100644 --- a/src/include/miopen/convolution.hpp +++ b/src/include/miopen/convolution.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -404,6 +405,9 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor friend void to_json(nlohmann::json& json, const ConvolutionDescriptor& conv); friend void from_json(const nlohmann::json& json, ConvolutionDescriptor& conv); + +private: + void ValidateConvTensors(const ConvTensors& conv_tensors) const; }; void ConvolutionBackwardBias(const Handle& handle, diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 48e14e3f30..b9d221f8a9 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -287,31 +287,6 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle, namespace { -void ValidateConvTensors(const ConvTensors& tensors) -{ - const auto invalid_buffers = - tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr; - - const auto tensor_sizes_not_matched = tensors.xDesc.GetSize() != tensors.yDesc.GetSize() || - tensors.xDesc.GetSize() != tensors.wDesc.GetSize(); - - const auto trivial_tensor_types_not_matched = - tensors.xDesc.GetType() != tensors.yDesc.GetType() && - tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4; - - // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { - // MIOPEN_THROW(miopenStatusBadParm); - //} - - const auto x_tensor_invalid = tensors.xDesc.GetSize() < 3; - - const auto bad_parameters = invalid_buffers || 
tensor_sizes_not_matched ||
-                                trivial_tensor_types_not_matched || x_tensor_invalid;
-
-    if(bad_parameters)
-        MIOPEN_THROW(miopenStatusBadParm);
-}
-
 void ValidateAlphaBeta(const void* alpha, const void* beta)
 {
     if(!float_equal(*(static_cast<const float*>(alpha)), 1.0) ||
@@ -402,6 +377,84 @@ static void ConvForwardCheckNumerics(const Handle& handle,
     }
 }

+void ConvolutionDescriptor::ValidateConvTensors(const ConvTensors& tensors) const
+{
+
+    // Group stride in current TensorDescriptor is implicit. When invoking kernels,
+    // we need to add the group dimension G and compute its stride. We want the stride
+    // left of C to be a multiple of group count G. e.g. for NCHW, the stride for N
+    // should be a multiple of G so that we can compute the strides for NGCHW
+    auto bad_group_stride = [this](const TensorDescriptor& td) {
+        auto l             = td.GetLayout_t();
+        int g_stride_index = -1;
+        if(l == miopenTensorNCHW || l == miopenTensorNCDHW)
+        {
+            g_stride_index = 0; // stride index for N;
+        }
+        else if(l == miopenTensorNHWC || l == miopenTensorNDHWC)
+        {
+            // stride index for W. Normally this would be 2nd-last stride but we store
+            // strides in NCHW order for some weird reason.
+            g_stride_index = td.GetStrides().size() - 1;
+        }
+
+        if(g_stride_index != -1)
+        {
+            return (td.GetStrides()[g_stride_index] % this->group_count) != 0;
+        }
+
+        return false;
+    };
+
+    // invalid_buffers
+    if(tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "One of the convolution tensors is null");
+    }
+
+    // x_tensor_invalid =
+    if(tensors.xDesc.GetSize() < 3)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "input tensor's number of dimensions is wrong");
+    }
+
+    // tensor_sizes_not_matched =
+    if(tensors.xDesc.GetSize() != tensors.yDesc.GetSize() ||
+       tensors.xDesc.GetSize() != tensors.wDesc.GetSize())
+    {
+        MIOPEN_THROW(miopenStatusBadParm,
+                     "number of dimensions mismatch between input, output and weights tensors");
+    }
+
+    // trivial_tensor_types_not_matched =
+    if(tensors.xDesc.GetType() != tensors.yDesc.GetType() &&
+       tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "input/output tensor data types do not match");
+    }
+
+    // check for bad_group_stride. This applies for input and output only. There
+    // is no check for weight tensor currently.
+    // no need to check for group_count == 1
+
+    if((this->group_count > 1) && bad_group_stride(tensors.xDesc))
+    {
+        MIOPEN_THROW(
+            miopenStatusBadParm,
+            "Invalid input tensor strides. Channel stride must be a multiple of group count");
+    }
+    if((this->group_count > 1) && bad_group_stride(tensors.yDesc))
+    {
+        MIOPEN_THROW(
+            miopenStatusBadParm,
+            "Invalid output tensor strides.
Channel stride must be a multiple of group count"); + } + + // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { + // MIOPEN_THROW(miopenStatusBadParm); + //} +} + void ConvolutionDescriptor::ConvolutionForward(Handle& handle, const void* alpha, const TensorDescriptor& xDesc, diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index 13eae6f01e..f8bbaa8e86 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -312,28 +312,19 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "n/a"; } -// input: a vector of lengths of dims in a tensor -// multiply each element with a random constant integer +/// input: a vector of lengths of dims in a tensor +/// multiply each element with a random constant integer void pad_tensor_strides(std::vector& strides) { - auto pvec = [](const char* name, const auto& vec) { - std::cout << name << ": ["; - for(const auto& v : vec) - { - std::cout << v << ", "; - } - std::cout << "]\n"; - }; + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 5; - pvec("orig strides", strides); - auto c = prng::gen_A_to_B(1, 3); - // int c = 2; + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); for(auto& v : strides) { // cppcheck-suppress useStlAlgorithm v = v * c; } - pvec("new strides", strides); } template Date: Tue, 3 Oct 2023 14:30:51 +0000 Subject: [PATCH 15/23] fix test now that strides are supported --- test/gtest/conv_api_strided_tensors.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 2a59dcd696..9a2876b3f0 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,7 +139,6 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; -// This test should be replaced when strided tensors are fully implemented TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -178,9 +177,8 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) const float alpha = 1.f; const float beta = 0.f; - // miopenConvolutionForward() must return error if the format is not supported ASSERT_TRUE(device.Synchronize()); - ASSERT_NE(miopenConvolutionForward(handle, + ASSERT_EQ(miopenConvolutionForward(handle, &alpha, input_descr, d_input.Data(), From 39eee971062bdcaea82d48e00c6fe01e26842258 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 4 Oct 2023 15:30:15 +0000 Subject: [PATCH 16/23] use C++17 to compile HIP Kernels --- src/hip/hip_build_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip/hip_build_utils.cpp b/src/hip/hip_build_utils.cpp index 8f6f9f0c50..86cf3a7272 100644 --- a/src/hip/hip_build_utils.cpp +++ b/src/hip/hip_build_utils.cpp @@ -73,7 +73,7 @@ static boost::filesystem::path HipBuildImpl(boost::optional& tmp_dir, auto env = std::string(""); if(params.find("-std=") == std::string::npos) - params += " --std=c++11"; + params += " --std=c++17"; #if HIP_PACKAGE_VERSION_FLAT < 4001000000ULL params += " --cuda-gpu-arch=" + lots.device; From 57fdf6e40b351492633cf89bfa5d9c555273c6e9 Mon Sep 17 00:00:00 2001 From: "M. 
Amber Hassaan" Date: Wed, 4 Oct 2023 16:59:02 +0000 Subject: [PATCH 17/23] address comments --- src/kernels/gpu_reference_kernel/naive_conv.cpp | 6 +++--- src/solver/conv_direct_naive_conv.cpp | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index b243b1234a..531c7fd7bb 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -126,9 +126,9 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, /// and nhwc should be nhwgc. Same follows for the 3D case. /// -/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is -/// at index 3 while stride for C is at index 0. This is reverse of how strides are -/// stored in tensor descriptors, which are big-endian. +/// - strides here are stored right to left, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is different from how the +/// tensor descriptors store strides, which is always NCHW order, left-to-right. template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 86a8a4161e..f4f2027293 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -111,11 +111,15 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; + + /// \todo remove packed reference convolution kernels --amberhassaan +#ifndef NDEBUG// enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } else +#endif { kernel_name << "naive_conv_nonpacked_"; } From b5d8b0f51193a1c93ee7ec9dd5c65f265ec607c7 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 4 Oct 2023 16:59:02 +0000 Subject: [PATCH 18/23] address comments --- src/kernels/gpu_reference_kernel/naive_conv.cpp | 6 +++--- src/solver/conv_direct_naive_conv.cpp | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index b243b1234a..531c7fd7bb 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -126,9 +126,9 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, /// and nhwc should be nhwgc. Same follows for the 3D case. /// -/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is -/// at index 3 while stride for C is at index 0. This is reverse of how strides are -/// stored in tensor descriptors, which are big-endian. +/// - strides here are stored right to left, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is different from how the +/// tensor descriptors store strides, which is always NCHW order, left-to-right. 
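For illustration of the two orders (editor's sketch; the sizes are hypothetical and the sort mirrors what MakeStrideArray does): a packed NHWC tensor with N=2, H=3, W=4, C=8 has descriptor strides {96, 1, 32, 8} in NCHW order, i.e. C's stride of 1 sits at index 1 even though C varies fastest; sorting ascending yields {1, 8, 32, 96} (C, W, H, N), the right-to-left order these kernels expect.

#include <algorithm>
#include <vector>

int main()
{
    std::vector<int> desc_strides = {96, 1, 32, 8};      // descriptor (NCHW) order
    std::sort(desc_strides.begin(), desc_strides.end()); // -> {1, 8, 32, 96}, kernel order
    return 0;
}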
template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 86a8a4161e..f4f2027293 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -111,11 +111,15 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; + + /// \todo remove packed reference convolution kernels --amberhassaan +#ifndef NDEBUG// enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } else +#endif { kernel_name << "naive_conv_nonpacked_"; } From 0fe1a55fb6c979d3b11ef69007f225cfe16577dd Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 4 Oct 2023 21:15:03 +0000 Subject: [PATCH 19/23] remove OpenCL code due to deprecation --- test/gpu_reference_kernel.cpp | 281 +++------------------------------- 1 file changed, 23 insertions(+), 258 deletions(-) diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index f8bbaa8e86..b3677487fe 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -75,17 +75,9 @@ std::string tensor_layout_to_string(tensor_layout_t layout) struct gpu_reference_kernel_base { miopenHandle_t handle{}; -#if MIOPEN_BACKEND_OPENCL - cl_command_queue q{}; -#endif - gpu_reference_kernel_base() - { - miopenCreate(&handle); -#if MIOPEN_BACKEND_OPENCL - miopenGetStream(handle, &q); -#endif - } + gpu_reference_kernel_base() { miopenCreate(&handle); } + ~gpu_reference_kernel_base() { miopenDestroy(handle); } static int conv_out_size(int in_size, int pad, int dilation, int ksize, int stride) @@ -386,25 +378,13 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 2, @@ -442,27 +422,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(out); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -482,7 +442,7 @@ struct 
gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -507,23 +467,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif + // we expect an exact match, since we use integers valid_result = verify_tensor(out_host, out); } @@ -534,28 +482,8 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -567,7 +495,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -592,23 +520,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(in_host, in); @@ -619,27 +535,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(out); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status |= clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -652,7 +548,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -677,23 +573,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + 
EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(wei_host, wei); @@ -715,15 +599,10 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(inDesc); miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP + hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_2d(run_conv_2d); @@ -795,25 +674,14 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; + EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 3, @@ -851,27 +719,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(out); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -884,7 +732,6 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, @@ -910,23 +757,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(out_host, out); @@ -938,27 +773,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - 
sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -971,7 +786,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -996,23 +811,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(in_host, in); @@ -1023,27 +826,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(out, 3, -2); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status |= clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -1056,7 +839,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -1081,23 +864,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect an exact match, since we use integers valid_result = verify_tensor(wei_host, wei, 8.0); // max possible int @@ -1123,15 +894,9 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_3d(run_conv_3d);
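With the OpenCL branches removed, every test case above follows a single HIP-only buffer lifecycle. The sketch below condenses that flow (a minimal sketch; types, sizes, and the function name are assumed for illustration, and the real tests run this per input, weight, and output tensor with the GPU reference kernel launched in the middle):

#include <hip/hip_runtime.h>
#include <cassert>
#include <vector>

void hip_buffer_roundtrip(const std::vector<float>& host_in, std::vector<float>& host_out)
{
    void* dev = nullptr;
    // allocate device memory
    hipError_t rc = hipMalloc(&dev, sizeof(float) * host_in.size());
    assert(rc == hipSuccess);
    // host -> device before the GPU reference kernel runs
    rc = hipMemcpy(dev, host_in.data(), sizeof(float) * host_in.size(), hipMemcpyHostToDevice);
    assert(rc == hipSuccess);
    // ... launch the convolution kernel here ...
    // device -> host so the result can be compared against the CPU reference
    rc = hipMemcpy(host_out.data(), dev, sizeof(float) * host_out.size(), hipMemcpyDeviceToHost);
    assert(rc == hipSuccess);
    rc = hipFree(dev);
    assert(rc == hipSuccess);
    (void)rc; // silences the unused-variable warning in release builds
}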
Amber Hassaan" Date: Mon, 9 Oct 2023 09:26:53 +0000 Subject: [PATCH 20/23] fix build --- src/solver/conv_direct_naive_conv_wrw.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 4bb7f9c657..a8c4d40e0b 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -127,8 +127,6 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); - int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); - if(problem.Is2d()) { result.invoker_factory = [=](const std::vector& kernels) { From c34c72a2ecf2fffcfb00fba176e7801e9dccbd35 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Mon, 9 Oct 2023 09:29:41 -0700 Subject: [PATCH 21/23] fix Clang Format issue --- src/kernels/gpu_reference_kernel/naive_conv.cpp | 2 +- src/solver/conv_direct_naive_conv.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 531c7fd7bb..125eff94f3 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -128,7 +128,7 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// /// - strides here are stored right to left, i.e., for NHWC, stride for N is /// at index 3 while stride for C is at index 0. This is different from how the -/// tensor descriptors store strides, which is always NCHW order, left-to-right. +/// tensor descriptors store strides, which is always NCHW order, left-to-right. template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index f4f2027293..fb3e42bfb1 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -113,7 +113,7 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) std::ostringstream kernel_name; /// \todo remove packed reference convolution kernels --amberhassaan -#ifndef NDEBUG// enable in debug mode only +#ifndef NDEBUG // enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; From 220e42323756426fdbddb4eb7ad76e340c69e355 Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 11 Oct 2023 20:09:37 +0000 Subject: [PATCH 22/23] resolve conflicts and handle feedback --- src/ocl/convolutionocl.cpp | 6 +++++- test/gtest/conv_api_strided_tensors.cpp | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index bc60fc17f4..d66186577c 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -397,8 +397,12 @@ void ConvolutionDescriptor::ValidateTensors(const ConvTensors& tensors) const // strides in NCHW order for some weird reason. 
g_stride_index = td.GetStrides().size() - 1; } + else + { + MIOPEN_THROW(miopenStatusInternalError, "Layout not supported for grouped convolution"); + } - if(g_stride_index != 1) + if(g_stride_index != -1) { return (td.GetStrides()[g_stride_index] % this->group_count) != 0; } diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 9a2876b3f0..04d56ec908 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,6 +139,9 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; +/// \todo re-enable this test after NCDHW grouped convolution lands (PR 2429) +/// \todo add cpu reference convolution for verification --amberhassaan +#if 0 TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -194,3 +197,4 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) miopenStatusSuccess); ASSERT_TRUE(device.Synchronize()); } +#endif From baa2da45878f151db1ad1e517973aab109b5642f Mon Sep 17 00:00:00 2001 From: "M. Amber Hassaan" Date: Wed, 11 Oct 2023 21:47:44 +0000 Subject: [PATCH 23/23] advance fin --- fin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fin b/fin index b2f3f4db3c..afc1a8d87e 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit b2f3f4db3c3d7dd757e6d9e68719a780d8114dfa +Subproject commit afc1a8d87e6d00c82903942007bb370ee1f6c760
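For reference, the validation rule that patch 22 tightens can be written as a standalone predicate (a minimal sketch with assumed names; in the library this logic lives inside ConvolutionDescriptor::ValidateTensors and reads the strides from a TensorDescriptor):

#include <cstddef>
#include <vector>

// Sketch of the patched check: g_stride_index == -1 is assumed to mean that no
// group-carrying stride was identified. Once an index is found, the stride
// holding the channel dimension must be divisible by the group count so the
// tensor can be viewed as (G, C_per_group).
bool bad_group_stride(const std::vector<std::size_t>& strides, int g_stride_index, int group_count)
{
    if(g_stride_index != -1)
        return (strides[g_stride_index] % group_count) != 0;
    return false; // unsupported layouts throw before reaching this check
}

The earlier revision compared the index against 1 instead of -1, which skipped the divisibility check exactly when the index was 1; the new code applies the check whenever a group stride index exists and throws miopenStatusInternalError for layouts where none can be identified.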