
Reference kernel for 3D convolution for non-packed tensors #2334

Merged: 13 commits, merged Oct 5, 2023.
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -388,6 +388,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernels/workaround_issue_1431.hpp
kernels/hip_f8_impl.hpp
kernels/hip_float8.hpp
kernels/stride_array.hpp
)

set(MIOPEN_KERNELS
2 changes: 1 addition & 1 deletion src/hip/hip_build_utils.cpp
@@ -73,7 +73,7 @@ static boost::filesystem::path HipBuildImpl(boost::optional<TmpDir>& tmp_dir,
auto env = std::string("");

if(params.find("-std=") == std::string::npos)
-        params += " --std=c++11";
+        params += " --std=c++17";
Contributor:
This affects all the HIP kernels included in the library (those that do not explicitly specify a C++ standard) and therefore requires extensive testing (and should be done in a separate PR at the very least). Please revert this and provide the necessary option in the naive solvers instead.


#if HIP_PACKAGE_VERSION_FLAT < 4001000000ULL
params += " --cuda-gpu-arch=" + lots.device;
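As a hedged illustration of the guard being discussed, the snippet below reproduces the append-a-default-standard logic in isolation. The helper name `WithDefaultStd` is invented for this sketch and is not MIOpen API:

```cpp
#include <cassert>
#include <string>

// Minimal reproduction of the guard above: the default standard is appended
// only when the caller did not already pass an -std= option.
inline std::string WithDefaultStd(std::string params)
{
    if(params.find("-std=") == std::string::npos)
        params += " --std=c++17";
    return params;
}
```

Note the asymmetry the guard tolerates: it searches for `-std=` but appends `--std=`; both spellings are accepted by the compiler driver, and the search matches either.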
24 changes: 8 additions & 16 deletions src/include/miopen/hipoc_kernel.hpp
@@ -26,14 +26,15 @@
#ifndef GUARD_MIOPEN_HIPOC_KERNEL_HPP
#define GUARD_MIOPEN_HIPOC_KERNEL_HPP

#include <array>
#include <cassert>
#include <miopen/errors.hpp>
#include <miopen/hipoc_program.hpp>
#include <miopen/stringutils.hpp>
#include <miopen/op_kernel_args.hpp>

#include <array>
#include <cassert>
#include <cstring>
#include <vector>
#include <memory.h>

namespace miopen {

@@ -47,29 +48,20 @@ inline HipEventPtr make_hip_event()

#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017

Contributor:
[R] I am not sure that we want to remove this. In my previous PR I decided to keep this as an illustration of the original idea.

Contributor (author):
The original had a bug on line 54. The new code is the same as the original, except that line 54 is fixed. I don't think there's any benefit in keeping buggy code; it may even cause harm if someone assumes that it works.

@atamazov (Contributor), Sep 28, 2023:
Contributor (author):

I don't see any value in keeping this code. It's just a simple pair class. We have always needed proper alignment for kernel arguments, so keeping around code that doesn't address alignment issues is just misleading and confusing.

Collaborator:
Let's clean up the code; if we ever need it, I'm sure it's not too difficult to write it again.

Contributor:
@amberhassaan This comment is just a recommendation. You are free to either follow recommendations or decline them; no explanations needed.

> so keeping around code that doesn't address alignment issues is just misleading and confusing.

Thanks for the positive feedback.

template <class T, class U>
struct KernelArgsPair
{
-    static const int alignment = sizeof(U);
-    static const int padding = (alignment - sizeof(T) % alignment) % alignment;
-    static const int second_index = sizeof(T) + padding;
+    constexpr static auto alignU = alignof(U);
+    constexpr static auto padding = (alignU - (sizeof(T) % alignU)) % alignU;
+    constexpr static auto second_index = sizeof(T) + padding;
KernelArgsPair(T x, U y)
{
new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew)
new(buffer + second_index) U(y);
}

alignas(U) char buffer[second_index + sizeof(U)] = {};
};
#else
averinevg marked this conversation as resolved.
template <class T, class U>
struct KernelArgsPair
{
KernelArgsPair(T x, U y) : first(x), second(y) {}
T first;
U second;
};
#endif

template <class... Ts>
struct KernelArgsPack;
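To illustrate the fix discussed in the thread above -- padding must be based on `alignof(U)`, not `sizeof(U)` -- here is a small standalone sketch of the same arithmetic. The names `PairLayout` and `Vec3` are invented for this example, and the checks assume typical alignments (e.g. `alignof(double) == 8`):

```cpp
#include <cstddef>
#include <cstdint>

// Re-derivation of the padding arithmetic from KernelArgsPair, to show why
// alignof(U) (not sizeof(U)) is the correct alignment base.
template <class T, class U>
struct PairLayout
{
    static constexpr std::size_t alignU       = alignof(U);
    static constexpr std::size_t padding      = (alignU - (sizeof(T) % alignU)) % alignU;
    static constexpr std::size_t second_index = sizeof(T) + padding;
};

// T = char (size 1), U = double (align 8): U must start at offset 8.
static_assert(PairLayout<char, double>::second_index == 8, "U starts at its alignment");

// A 12-byte struct with 4-byte alignment: sizeof(U) == 12 but alignof(U) == 4.
struct Vec3
{
    std::int32_t x, y, z;
};

// With the old sizeof(U)-based formula, the pad after an 8-byte T would be
// (12 - 8 % 12) % 12 == 4, placing U at offset 12 -- over-padded for no reason.
// With alignof(U) == 4 no padding is needed, so U starts at offset 8.
static_assert(PairLayout<std::int64_t, Vec3>::second_index == 8, "no padding needed");
```

The `Vec3` case is exactly the situation where the two formulas disagree: whenever `sizeof(U)` is a multiple of `alignof(U)` but larger than it.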
95 changes: 94 additions & 1 deletion src/include/miopen/solver/conv_direct_naive_conv.hpp
@@ -25,9 +25,15 @@
*******************************************************************************/
#pragma once

-#include <string>
 #include <miopen/execution_context.hpp>
 #include <miopen/problem_description.hpp>
+#include "miopen/../../kernels/stride_array.hpp"
+
+#include <array>
+#include <algorithm>
+#include <cassert>
+#include <string>
+#include <vector>

namespace miopen {

@@ -54,5 +60,92 @@ bool IsOutputBfp16(const ProblemDescription&);
bool IsOutputInt8(const ProblemDescription&);
bool IsOutputInt32(const ProblemDescription&);

namespace conv_internal {
Contributor:
[Recommendation] conv_naive

This is continuation of #2334 (comment)


void DebugPrintTensorStrides(const TensorDescriptor& inDesc,
const TensorDescriptor& wDesc,
const TensorDescriptor& outDesc);

/**
* Get the index where group (G) stride should go. For NCHW, we want to convert
* its strides to NGCHW, and for NHWC, we want to convert its strides to NHWGC.
* Same applies for the 3D case.
*/
int GetGroupStrideIndex(const ProblemDescription& problem);
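A hypothetical sketch of the mapping the comment above describes. The real implementation consults the `ProblemDescription`; the helper below is invented purely for illustration:

```cpp
#include <cassert>

// For NCHW-family layouts, G is inserted right after N (index 1): NCHW -> NGCHW.
// For NHWC-family layouts, G goes just before the trailing C (last index):
// NHWC -> NHWGC. The same rule covers the 3D layouts NCDHW and NDHWC.
inline int GetGroupStrideIndexSketch(bool is_channels_last, int ndims)
{
    assert(ndims >= 3);
    return is_channels_last ? ndims - 1 : 1;
}
// NCHW  (ndims = 4) -> NGCHW : G at index 1
// NHWC  (ndims = 4) -> NHWGC : G at index 3
// NDHWC (ndims = 5) -> NDHWGC: G at index 4
```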

/**
* Split the strides for the C dimension in a tensor descriptor into (G, C_per_group).
* Normally, (in packed case) num channels is a multiplying factor in the stride of
* whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a
* factor. We output NGCHW for NCHW (and NHWGC for NHWC)
* where the stride[G] = stride[N] / num_groups
*/
template <typename V>
V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx)
{
assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size());
// (G_stride_idx - 1) is the stride index of whatever lies to the left and
// contains C or K as a multiplying factor. We divide this value by num_groups
// to get G_stride_val
assert(orig_strides[G_stride_idx - 1] % num_groups == 0);

V ret{orig_strides};
auto G_stride_val = orig_strides[G_stride_idx - 1] / num_groups;

ret.insert(ret.begin() + G_stride_idx, G_stride_val);

return ret;
}
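A usage sketch for the template above. The function body is repeated locally so the example compiles on its own, and the tensor sizes are illustrative:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Self-contained copy of SplitStrideCtoGC, as defined above.
template <typename V>
V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx)
{
    assert(orig_strides[static_cast<std::size_t>(G_stride_idx) - 1] % num_groups == 0);
    V ret{orig_strides};
    ret.insert(ret.begin() + G_stride_idx,
               orig_strides[static_cast<std::size_t>(G_stride_idx) - 1] / num_groups);
    return ret;
}
// A packed NCHW tensor with N=2, C=8, H=4, W=4 has strides {128, 16, 4, 1}.
// With num_groups = 2 and G inserted at index 1 (NGCHW), stride[G] = 128 / 2 = 64,
// i.e. one group spans C_per_group * H * W = 4 * 16 = 64 elements.
```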

/**
* Weight tensor has original dims: [K, C_per_group, Y, X] (2D case)
* We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X]
* Stride for G is computed as stride[C_per_group] * K_per_group and inserted at
* the leftmost position.
*/
template <typename V>
V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides)
{
V ret{wei_strides};
ret.insert(ret.begin(), wei_strides[0] * k_per_group);
return ret;
}
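The weight-stride counterpart can be exercised the same way; again the function is repeated locally so the example is self-contained, with illustrative sizes:

```cpp
#include <vector>

// Self-contained copy of SplitWeiStrideKtoGK, as defined above.
template <typename V>
V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides)
{
    V ret{wei_strides};
    ret.insert(ret.begin(), wei_strides[0] * k_per_group);
    return ret;
}
// Packed 2D weights [K=8, C_per_group=4, Y=3, X=3] have strides {36, 9, 3, 1}.
// With 2 groups (k_per_group = 4), stride[G] = 36 * 4 = 144, giving the
// [G, K_per_group, C_per_group, Y, X] strides {144, 36, 9, 3, 1}.
```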

template <unsigned N>
struct ChooseStride
{
};

template <>
struct ChooseStride<5u>
{
using type = Strides5D;
};

template <>
struct ChooseStride<6u>
{
using type = Strides6D;
};

template <unsigned N, typename V>
auto MakeStrideArray(V vec)
{
typename ChooseStride<N>::type ret;
assert(vec.size() == N);

// MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left.
// We sort the input stride vector so that smallest stride is at index 0. This
// (little-endian) order is what naive convolution kernel expects for strides
std::sort(vec.begin(), vec.end());

for(unsigned i = 0; i < N; ++i)
{
ret[i] = static_cast<StrideIndexType>(vec[i]);
}
return ret;
}
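Since `Strides5D`, `Strides6D`, and `StrideIndexType` live in `kernels/stride_array.hpp` (not shown in this diff), the sketch below substitutes `std::array<std::int64_t, N>` for them to demonstrate the sort-to-little-endian step:

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of MakeStrideArray with std::array<std::int64_t, N> standing in for
// the Strides5D/Strides6D types from kernels/stride_array.hpp.
template <unsigned N, typename V>
std::array<std::int64_t, N> MakeStrideArraySketch(V vec)
{
    assert(vec.size() == N);
    // Smallest stride first: the little-endian order the naive kernel expects.
    std::sort(vec.begin(), vec.end());
    std::array<std::int64_t, N> ret{};
    for(unsigned i = 0; i < N; ++i)
        ret[i] = static_cast<std::int64_t>(vec[i]);
    return ret;
}
// NGCHW strides {128, 64, 16, 4, 1} become {1, 4, 16, 64, 128}: index 0 now
// holds the fastest-moving (W) stride.
```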
} // end namespace conv_internal

} // namespace solver
} // namespace miopen
6 changes: 3 additions & 3 deletions src/kernels/gpu_reference_kernel/fp8_kern_types.h
@@ -58,6 +58,6 @@

#define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE))

-#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX)
-#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX)
-#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX)
+#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX)
+#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX)
+#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX)
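To see what the renamed macros expand to, here is a hedged sketch: `CAT` is assumed to be the usual two-level token-pasting macro from this header family, and the `fp8`/`fp32` type tokens are illustrative:

```cpp
#include <cassert>
#include <cstring>

// Two-level paste so macro arguments are expanded before ## is applied.
#define CAT_I(a, b) a##b
#define CAT(a, b) CAT_I(a, b)

// Illustrative type tokens (normally supplied by the build system).
#define INPUT_TYPE fp8
#define WEIGHTS_TYPE fp8
#define OUTPUT_TYPE fp32

#define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE))
#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX)

// Stringize to observe the generated kernel symbol name.
#define STR_I(x) #x
#define STR(x) STR_I(x)
// STR(FWD_KERNEL_NAME) expands to "naive_conv_packed_fwd_nchw_fp8_fp8_fp32"
```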