Test non-packed inputs with naive reference convolution kernels #2394

Merged · 38 commits · Oct 12, 2023

Commits
94f5fd3
Squash commits together
amberhassaan Aug 20, 2023
5ab82f3
fix formatting. disable strides for fp8 kernel for now
amberhassaan Sep 23, 2023
be93522
fix the lengths of weight tensor
amberhassaan Sep 23, 2023
d85785b
use 64-bit integers for stride value
amberhassaan Sep 25, 2023
ee6abb3
Squash commits together
amberhassaan Aug 20, 2023
4fbcd77
fix test for non-packed strides
amberhassaan Sep 18, 2023
82e0ccf
fix format
amberhassaan Sep 18, 2023
7e8a258
Fix assertion check.
amberhassaan Sep 18, 2023
cadfb95
suppress cppcheck warning to test CI
junliume Sep 20, 2023
cde6e22
fix build and remove a check that prevents non-strided inputs
amberhassaan Sep 25, 2023
f7b606b
Merge remote-tracking branch 'origin/develop' into amber/non-packed-c…
amberhassaan Sep 25, 2023
0ad674b
Merge branch 'amber/non-packed-conv-ref-kern' into amber/tests-non-pa…
amberhassaan Sep 25, 2023
4db6cf8
Merge remote-tracking branch 'origin/develop' into amber/non-packed-c…
amberhassaan Sep 25, 2023
8af6d47
Merge branch 'amber/non-packed-conv-ref-kern' into amber/tests-non-pa…
amberhassaan Sep 25, 2023
e06c523
addressed comments. Moved common code into an include file
amberhassaan Sep 26, 2023
23d0066
Merge remote-tracking branch 'origin/develop' into amber/non-packed-c…
amberhassaan Sep 26, 2023
35c9072
Merge branch 'amber/non-packed-conv-ref-kern' into amber/tests-non-pa…
amberhassaan Sep 26, 2023
67d9a77
address comments
amberhassaan Sep 26, 2023
0f16c62
address review comments
amberhassaan Sep 28, 2023
c66da71
Merge remote-tracking branch 'origin/develop' into amber/non-packed-c…
amberhassaan Sep 28, 2023
8bb3a7f
Merge branch 'amber/non-packed-conv-ref-kern' into amber/tests-non-pa…
amberhassaan Sep 28, 2023
dec88a7
add more checks for strides
amberhassaan Sep 29, 2023
700b623
Merge remote-tracking branch 'origin/develop' into amber/non-packed-c…
amberhassaan Oct 2, 2023
1253aed
Merge branch 'amber/non-packed-conv-ref-kern' into amber/tests-non-pa…
amberhassaan Oct 2, 2023
85e8a62
fix test now that strides are supported
amberhassaan Oct 3, 2023
39eee97
use C++17 to compile HIP Kernels
amberhassaan Oct 4, 2023
f60f182
Merge remote-tracking branch 'origin/develop' into amber/non-packed-c…
amberhassaan Oct 4, 2023
e7c9d03
Merge branch 'amber/non-packed-conv-ref-kern' into amber/tests-non-pa…
amberhassaan Oct 4, 2023
57fdf6e
address comments
amberhassaan Oct 4, 2023
e8dbc49
Merge branch 'develop' into amber/tests-non-packed-conv
junliume Oct 8, 2023
b5d8b0f
address comments
amberhassaan Oct 4, 2023
0fe1a55
remove OpenCL code due to deprecation
amberhassaan Oct 4, 2023
1cab2c3
Merge branch 'amber/non-packed-conv-ref-kern' into amber/tests-non-pa…
amberhassaan Oct 9, 2023
505f6ab
fix build
amberhassaan Oct 9, 2023
c34c72a
fix Clang Format issue
junliume Oct 9, 2023
e146046
Merge remote-tracking branch 'origin/develop' into amber/tests-non-pa…
amberhassaan Oct 11, 2023
220e423
resolve conflicts and handle feedback
amberhassaan Oct 11, 2023
baa2da4
advance fin
amberhassaan Oct 11, 2023
1 change: 1 addition & 0 deletions driver/random.hpp
@@ -91,6 +91,7 @@ inline T gen_0_to_B(T B)
template <typename T>
inline T gen_A_to_B(T A, T B)
{
assert(B > A);
return gen_0_to_B(B - A) + A;
}

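The new assert guards the precondition that the range is non-empty. A minimal sketch of the surrounding generator, assuming gen_0_to_B(B) yields a value in [0, B) (the stand-in below is illustrative and integral-only, not the driver's real implementation):

#include <cassert>
#include <cstdlib>

// Stand-in for the driver's generator; assumed to return a value in [0, B).
template <typename T>
inline T gen_0_to_B(T B)
{
    return static_cast<T>(std::rand() % B);
}

template <typename T>
inline T gen_A_to_B(T A, T B)
{
    assert(B > A); // with B <= A, B - A wraps for unsigned T or yields an empty range
    return gen_0_to_B(B - A) + A; // shifts [0, B - A) into [A, B)
}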
2 changes: 1 addition & 1 deletion fin
Submodule fin updated from 26b5c3 to afc1a8
4 changes: 4 additions & 0 deletions src/include/miopen/convolution.hpp
@@ -36,6 +36,7 @@
#include <miopen/names.hpp>
#include <miopen/invoke_params.hpp>
#include <miopen/invoker.hpp>
#include <miopen/conv/tensors.hpp>

#include <nlohmann/json_fwd.hpp>

@@ -404,6 +405,9 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor

friend void to_json(nlohmann::json& json, const ConvolutionDescriptor& conv);
friend void from_json(const nlohmann::json& json, ConvolutionDescriptor& conv);

private:
void ValidateTensors(const ConvTensors& conv_tensors) const;
};

void ConvolutionBackwardBias(const Handle& handle,
6 changes: 3 additions & 3 deletions src/kernels/gpu_reference_kernel/naive_conv.cpp
@@ -126,9 +126,9 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val)
/// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw,
/// and nhwc should be nhwgc. Same follows for the 3D case.
///
-/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is
-///   at index 3 while stride for C is at index 0. This is reverse of how strides are
-///   stored in tensor descriptors, which are big-endian.
+/// - strides here are stored right to left, i.e., for NHWC, stride for N is
+///   at index 3 while stride for C is at index 0. This is different from how the
+///   tensor descriptors store strides, which is always NCHW order, left-to-right.

template <bool ASSUME_PACKED, typename src_data_t, typename acc_data_t, typename dst_data_t>
inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in,
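To make the right-to-left convention concrete, a small hedged sketch (illustrative values only, not MIOpen code) of packed NHWC strides in the kernel's order versus the descriptor's NCHW-order layout:

#include <array>
#include <cstdint>

int main()
{
    // NHWC lengths {N, H, W, C}
    const std::array<int64_t, 4> lens = {2, 8, 8, 16};

    // Kernel-side order is right to left: index 0 is C (fastest), index 3 is N.
    std::array<int64_t, 4> kernel_strides{};
    kernel_strides[0] = 1;                           // C
    kernel_strides[1] = kernel_strides[0] * lens[3]; // W = 16
    kernel_strides[2] = kernel_strides[1] * lens[2]; // H = 128
    kernel_strides[3] = kernel_strides[2] * lens[1]; // N = 1024

    // A tensor descriptor stores the same strides left to right in NCHW
    // order instead: {N, C, H, W} = {1024, 1, 128, 16}.
    return 0;
}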
123 changes: 88 additions & 35 deletions src/ocl/convolutionocl.cpp
@@ -287,30 +287,6 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle,

namespace {

-void ValidateConvTensors(const ConvTensors& tensors)
-{
-    const auto invalid_buffers =
-        tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr;
-
-    const auto tensor_sizes_not_matched = tensors.xDesc.GetSize() != tensors.yDesc.GetSize() ||
-                                          tensors.xDesc.GetSize() != tensors.wDesc.GetSize();
-
-    const auto trivial_tensor_types_not_matched =
-        tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8;
-
-    // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) {
-    //    MIOPEN_THROW(miopenStatusBadParm);
-    //}
-
-    const auto x_tensor_invalid = tensors.xDesc.GetSize() < 3;
-
-    const auto bad_parameters = invalid_buffers || tensor_sizes_not_matched ||
-                                trivial_tensor_types_not_matched || x_tensor_invalid;
-
-    if(bad_parameters)
-        MIOPEN_THROW(miopenStatusBadParm);
-}

void ValidateAlphaBeta(const void* alpha, const void* beta)
{
if(!float_equal(*(static_cast<const float*>(alpha)), 1.0) ||
@@ -401,6 +377,88 @@ static void ConvForwardCheckNumerics(const Handle& handle,
}
}

void ConvolutionDescriptor::ValidateTensors(const ConvTensors& tensors) const
{

// Group stride in current TensorDescriptor is implicit. When invoking kernels,
// we need to add the group dimension G and compute its stride. We want the stride
// left of C to be a multiple of group count G. e.g. for NCHW, the stride for N
// should be a multiple of G so that we can compute the strides for NGCHW
auto bad_group_stride = [this](const TensorDescriptor& td) {
auto l = td.GetLayout_t();
int g_stride_index = -1;
if(l == miopenTensorNCHW || l == miopenTensorNCDHW)
{
g_stride_index = 0; // stride index for N;
}
else if(l == miopenTensorNHWC || l == miopenTensorNDHWC)
{
// stride index for W. Normally this would be 2nd-last stride but we store
// strides in NCHW order for some weird reason.
g_stride_index = td.GetStrides().size() - 1;
}
else
{
MIOPEN_THROW(miopenStatusInternalError, "Layout not supported for grouped convolution");
}

if(g_stride_index != -1)
{
return (td.GetStrides()[g_stride_index] % this->group_count) != 0;
}

return false;
};

// invalid_buffers
if(tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr)
{
MIOPEN_THROW(miopenStatusBadParm, "One of the convolution tensors is null");
}

// x_tensor_invalid =
if(tensors.xDesc.GetSize() < 3)
{
MIOPEN_THROW(miopenStatusBadParm, "input tensor's number of dimensions is wrong");
}

// tensor_sizes_not_matched =
if(tensors.xDesc.GetSize() != tensors.yDesc.GetSize() ||
tensors.xDesc.GetSize() != tensors.wDesc.GetSize())
{
MIOPEN_THROW(miopenStatusBadParm,
"number of dimensions mismatch between input, output and weights tensors");
}

// trivial_tensor_types_not_matched =
if(tensors.xDesc.GetType() != tensors.yDesc.GetType() &&
tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4)
{
MIOPEN_THROW(miopenStatusBadParm, "input/output tensor data types do not match");
}

// check for bad_group_stride. This applies for input and output only. There
// is no check for weight tensor currently.
// no need to check for group_count == 1

if((this->group_count > 1) && bad_group_stride(tensors.xDesc))
{
MIOPEN_THROW(
miopenStatusBadParm,
"Invalid input tensor strides. Channel stride must be a multiple of group count");
}
if((this->group_count > 1) && bad_group_stride(tensors.yDesc))
{
MIOPEN_THROW(
miopenStatusBadParm,
"Invalid output tensor strides. Channel stride must be a multiple of group count");
}

// if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) {
// MIOPEN_THROW(miopenStatusBadParm);
//}
}
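As a hedged illustration of the divisibility rule (the derivation of the G stride below is inferred from the comment in the lambda above, not lifted from MIOpen internals):

#include <cstdint>
#include <stdexcept>
#include <vector>

// Split an NCHW stride vector {sN, sC, sH, sW} into an NGCHW view for
// group_count groups. The G dimension is carved out of each N slice, so
// sN must divide evenly by the group count.
std::vector<int64_t> SplitNchwIntoNgchw(const std::vector<int64_t>& s, int64_t group_count)
{
    if(s[0] % group_count != 0)
        throw std::runtime_error("N stride must be a multiple of group count");
    return {s[0], s[0] / group_count, s[1], s[2], s[3]}; // {N, G, C/G, H, W} strides
}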

void ConvolutionDescriptor::ConvolutionForward(Handle& handle,
const void* alpha,
const TensorDescriptor& xDesc,
@@ -416,13 +474,8 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle,
{
MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize);

-    if(!(xDesc.IsPacked() && wDesc.IsPacked() && yDesc.IsPacked()))
-    {
-        MIOPEN_THROW(miopenStatusNotImplemented, "Only fully packed tensors are supported");
-    }

const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y};
-    ValidateConvTensors(tensors);
+    ValidateTensors(tensors);
ValidateAlphaBeta(alpha, beta);

ConvForwardCheckNumerics(handle, tensors, [&]() {
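With the packed-only guard removed above, non-packed tensors can now reach this path. A hypothetical usage sketch via the public descriptor API (the padding values are illustrative, not from the PR):

#include <miopen/miopen.h>

// Describe a 4-D float tensor whose W dimension is padded from 28 to 30
// elements, i.e. a non-packed layout that the old check used to reject.
void MakeStridedDesc(miopenTensorDescriptor_t desc)
{
    int lens[4]    = {16, 32, 28, 28};               // N, C, H, W
    int strides[4] = {32 * 28 * 30, 28 * 30, 30, 1}; // NCHW order, W padded
    miopenSetTensorDescriptor(desc, miopenFloat, 4, lens, strides);
}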
@@ -735,7 +788,7 @@ void ConvolutionDescriptor::ConvolutionForwardImmediate(Handle& handle,
MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize);
const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y};

-    ValidateConvTensors(tensors);
+    ValidateTensors(tensors);
if(!solver_id.IsValid())
MIOPEN_THROW(miopenStatusBadParm);

Expand Down Expand Up @@ -871,7 +924,7 @@ void ConvolutionDescriptor::ConvolutionBackwardData(Handle& handle,

auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx};

-    ValidateConvTensors(tensors);
+    ValidateTensors(tensors);
ValidateAlphaBeta(alpha, beta);

ConvBwdCheckNumerics(handle, tensors, beta, [&]() {
Expand Down Expand Up @@ -937,7 +990,7 @@ void ConvolutionDescriptor::ConvolutionBackwardImmediate(Handle& handle,
MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize);
auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx};

-    ValidateConvTensors(tensors);
+    ValidateTensors(tensors);

static const float beta = 0.0f;
ConvBwdCheckNumerics(handle, tensors, &beta, [&]() {
Expand Down Expand Up @@ -1071,7 +1124,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(const Handle& handle,
{
MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize);
decltype(auto) tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw};
-    ValidateConvTensors(tensors);
+    ValidateTensors(tensors);
ValidateAlphaBeta(alpha, beta);

if(xDesc.GetType() == miopenInt8)
Expand Down Expand Up @@ -1134,7 +1187,7 @@ void ConvolutionDescriptor::ConvolutionWrwImmediate(Handle& handle,
{
MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize);
auto tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw};
-    ValidateConvTensors(tensors);
+    ValidateTensors(tensors);

if(xDesc.GetType() == miopenInt8)
MIOPEN_THROW(miopenStatusBadParm);
Expand Down
4 changes: 4 additions & 0 deletions src/solver/conv_direct_naive_conv.cpp
@@ -111,11 +111,15 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS);
std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem)
{
std::ostringstream kernel_name;

/// \todo remove packed reference convolution kernels --amberhassaan
#ifndef NDEBUG // enable in debug mode only
if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS()))
{
kernel_name << "naive_conv_packed_";
}
else
#endif
{
kernel_name << "naive_conv_nonpacked_";
}
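Usage note (inferred from the code above): only debug builds, where NDEBUG is undefined, honor the MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS environment variable and emit the naive_conv_packed_* kernel names; release builds always select the naive_conv_nonpacked_* variants.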
8 changes: 2 additions & 6 deletions src/solver/conv_direct_naive_conv_bwd.cpp
@@ -134,12 +134,8 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
kernel.l_wk.push_back(1);
kernel.l_wk.push_back(1);

-    const auto is_f8 = [&]() {
-        if(kernel.kernel_file == "fp8_naive_conv.cpp")
-            return true;
-        else
-            return false;
-    }();
+    const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp");

kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem);

int G_stride_idx = conv_internal::GetGroupStrideIndex(problem);
8 changes: 2 additions & 6 deletions src/solver/conv_direct_naive_conv_fwd.cpp
@@ -122,12 +122,6 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx,
KernelInfo kernel;

kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem);
-    const auto is_f8 = [&]() {
-        if(kernel.kernel_file == "fp8_naive_conv.cpp")
-            return true;
-        else
-            return false;
-    }();
kernel.kernel_name = ConvDirectNaiveConvKernelName(problem);
kernel.g_wk.clear();

@@ -139,6 +133,8 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx,
kernel.l_wk.push_back(1);
kernel.l_wk.push_back(1);

+    const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp");

kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem);

int G_stride_idx = conv_internal::GetGroupStrideIndex(problem);
8 changes: 2 additions & 6 deletions src/solver/conv_direct_naive_conv_wrw.cpp
@@ -121,13 +121,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx,
kernel.l_wk.push_back(1);
kernel.l_wk.push_back(1);

+    const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp");

kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem);
-    const auto is_f8 = [&]() {
-        if(kernel.kernel_file == "fp8_naive_conv.cpp")
-            return true;
-        else
-            return false;
-    }();

int G_stride_idx = conv_internal::GetGroupStrideIndex(problem);
