From 26d801a0c2d7a83c253c1a47d849ffcd0085b504 Mon Sep 17 00:00:00 2001 From: JD Date: Tue, 19 Sep 2023 12:53:55 -0500 Subject: [PATCH 01/36] MIOpen FP8 and BFP8 enablement (#2251) --------- Co-authored-by: xinlipn Co-authored-by: Sujatha Phatak Co-authored-by: Evgenii Averin <86725875+averinevg@users.noreply.github.com> Co-authored-by: Jun Liu --- driver/conv_driver.hpp | 160 ++++- driver/driver.hpp | 34 +- driver/main.cpp | 8 + include/miopen/config.h.in | 2 + include/miopen/miopen.h | 29 +- src/CMakeLists.txt | 9 +- src/check_numerics.cpp | 43 +- src/conv/problem_description.cpp | 2 +- src/convolution.cpp | 13 + src/gemm_v2.cpp | 589 +++++++++++----- src/hipoc/hipoc_program.cpp | 9 +- .../miopen/conv/problem_description.hpp | 19 + src/include/miopen/convolution.hpp | 38 + src/include/miopen/datatype.hpp | 21 +- src/include/miopen/gemm_v2.hpp | 76 +- src/include/miopen/handle.hpp | 4 + src/include/miopen/hip_f8_impl.hpp | 1 + src/include/miopen/hip_float8.hpp | 1 + src/include/miopen/problem_description.hpp | 1 + .../miopen/solver/conv_direct_naive_conv.hpp | 6 +- .../problem_description_interpreter.hpp | 16 + src/include/miopen/tensor.hpp | 10 +- src/include/miopen/visit_float.hpp | 2 + src/kernels/MIOpenCheckNumerics.cpp | 205 ++++++ src/kernels/MIOpenIm2d2Col.cl | 10 +- src/kernels/MIOpenUtilKernels4.cl | 10 +- src/kernels/bfloat16_dev.hpp | 179 +++++ src/kernels/float_types.h | 45 ++ .../gpu_reference_kernel/fp8_kern_types.h | 63 ++ .../gpu_reference_kernel/fp8_naive_conv.cpp | 571 +++++++++++++++ src/kernels/hip_f8_impl.hpp | 361 ++++++++++ src/kernels/hip_float8.hpp | 651 ++++++++++++++++++ src/ocl/tensorocl.cpp | 14 +- src/pooling.cpp | 4 +- src/pooling_api.cpp | 2 + src/reducetensor.cpp | 11 +- src/solver/batchnorm/forward_inference_ck.cpp | 4 + src/solver/conv_MP_bidirectional_winograd.cpp | 3 + src/solver/conv_asm_1x1u.cpp | 6 + src/solver/conv_asm_1x1u_bias_activ_fused.cpp | 7 +- src/solver/conv_asm_1x1u_stride2.cpp | 6 + src/solver/conv_asm_3x3u.cpp | 6 + src/solver/conv_asm_5x10u2v2b1.cpp | 2 + src/solver/conv_asm_5x10u2v2f1.cpp | 3 + .../conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp | 3 + src/solver/conv_asm_dir_BwdWrW1x1.cpp | 3 + src/solver/conv_asm_dir_BwdWrW3x3.cpp | 3 + ...onv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp | 3 + src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp | 3 + .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 3 + src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp | 3 + .../conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp | 3 + .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 3 + .../conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp | 3 + .../conv_asm_implicit_gemm_v4r1_dynamic.cpp | 3 + ...m_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp | 3 + ...onv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp | 3 + src/solver/conv_bin_wino3x3U.cpp | 3 + src/solver/conv_bin_winoRxS.cpp | 3 + src/solver/conv_bin_winoRxS_fused.cpp | 3 + .../conv_ck_igemm_fwd_bias_activ_fused.cpp | 11 + .../conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp | 3 + src/solver/conv_direct_naive_conv.cpp | 57 +- src/solver/conv_direct_naive_conv_bwd.cpp | 93 ++- src/solver/conv_direct_naive_conv_fwd.cpp | 97 ++- src/solver/conv_direct_naive_conv_wrw.cpp | 93 ++- ...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 10 + ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 8 + ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 8 + ...conv_hip_implicit_gemm_bwd_data_xdlops.cpp | 11 + .../conv_hip_implicit_gemm_bwd_v1r1.cpp | 3 + ...conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp | 3 + .../conv_hip_implicit_gemm_bwd_v4r1.cpp | 3 + ...conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp | 
3 + .../conv_hip_implicit_gemm_fwd_v4r1.cpp | 7 + .../conv_hip_implicit_gemm_fwd_v4r4.cpp | 3 + ...conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp | 3 + ...licit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp | 3 + ...conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp | 3 + .../conv_hip_implicit_gemm_fwd_xdlops.cpp | 10 + ...v_hip_implicit_gemm_grouped_fwd_xdlops.cpp | 10 + .../conv_hip_implicit_gemm_wrw_v4r4.cpp | 3 + ...conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp | 3 + ...licit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp | 3 + src/solver/conv_mlir_igemm_bwd.cpp | 2 + src/solver/conv_mlir_igemm_bwd_xdlops.cpp | 2 + src/solver/conv_mlir_igemm_fwd.cpp | 2 + src/solver/conv_mlir_igemm_fwd_xdlops.cpp | 2 + src/solver/conv_mlir_igemm_wrw.cpp | 2 + src/solver/conv_mlir_igemm_wrw_xdlops.cpp | 2 + src/solver/conv_multipass_wino3x3WrW.cpp | 3 + src/solver/conv_ocl_dir2D11x11.cpp | 3 + src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp | 3 + src/solver/conv_ocl_dir2D_bwdWrW_2.cpp | 3 + src/solver/conv_ocl_dir2D_bwdWrW_53.cpp | 3 + src/solver/conv_ocl_dir2Dfwd.cpp | 3 + src/solver/conv_ocl_dir2Dfwd1x1.cpp | 3 + src/solver/conv_ocl_dir2Dfwdgen.cpp | 3 + src/solver/conv_winoRxS.cpp | 3 + src/solver/conv_winoRxS_fused.cpp | 3 + src/solver/gemm.cpp | 171 +++-- src/solver/gemm_bwd.cpp | 126 +++- src/solver/gemm_wrw.cpp | 101 ++- src/solver/mlir_common.cpp | 2 + src/tensor.cpp | 21 + src/tensor_api.cpp | 31 + test/conv_common.hpp | 99 ++- test/cpu_conv.hpp | 203 +++--- test/driver.hpp | 19 + test/gtest/api_convbiasactiv.cpp | 7 +- test/gtest/cba.hpp | 1 + test/gtest/conv_embed_db.cpp | 6 +- test/gtest/conv_hip_igemm_xdlops.cpp | 2 + test/gtest/conv_igemm_mlir.cpp | 7 +- test/gtest/conv_igemm_mlir_xdlops.cpp | 2 + test/gtest/conv_tensor_gen.hpp | 125 ++++ test/gtest/conv_test_base.hpp | 40 +- test/gtest/conv_trans.cpp | 2 + test/gtest/{solver.hpp => get_solver.hpp} | 51 +- test/gtest/solver_bwd.hpp | 185 +++++ test/gtest/solver_bwd_f8.cpp | 68 ++ test/gtest/solver_convasm3x3u.cpp | 55 +- test/gtest/solver_f8.hpp | 263 +++++++ test/gtest/solver_fwd.hpp | 125 ++++ test/gtest/solver_fwd_f8.cpp | 70 ++ test/gtest/solver_wrw.hpp | 186 +++++ test/gtest/solver_wrw_f8.cpp | 41 ++ test/gtest/tensor_api.cpp | 2 +- test/perf_models/resnet50_v1.5.sh | 2 +- test/tensor_holder.hpp | 24 + test/verify.hpp | 34 +- 131 files changed, 5222 insertions(+), 646 deletions(-) create mode 120000 src/include/miopen/hip_f8_impl.hpp create mode 120000 src/include/miopen/hip_float8.hpp create mode 100644 src/kernels/MIOpenCheckNumerics.cpp create mode 100644 src/kernels/gpu_reference_kernel/fp8_kern_types.h create mode 100644 src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp create mode 100644 src/kernels/hip_f8_impl.hpp create mode 100644 src/kernels/hip_float8.hpp create mode 100644 test/gtest/conv_tensor_gen.hpp rename test/gtest/{solver.hpp => get_solver.hpp} (59%) create mode 100644 test/gtest/solver_bwd.hpp create mode 100644 test/gtest/solver_bwd_f8.cpp create mode 100644 test/gtest/solver_f8.hpp create mode 100644 test/gtest/solver_fwd.hpp create mode 100644 test/gtest/solver_fwd_f8.cpp create mode 100644 test/gtest/solver_wrw.hpp create mode 100644 test/gtest/solver_wrw_f8.cpp mode change 100755 => 100644 test/perf_models/resnet50_v1.5.sh diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 868ebcdccc..5f67b83588 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -155,6 +155,26 @@ void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) } } +static inline miopenDataType_t DataTypeFromShortString(const std::string& 
type)
+{
+    static const std::unordered_map<std::string, miopenDataType_t> conv_map = {
+        {"fp32", miopenFloat},
+        {"fp16", miopenHalf},
+        {"bf16", miopenBFloat16},
+        {"fp8", miopenFloat8},
+        {"bf8", miopenBFloat8}};
+
+    const auto res = conv_map.find(type);
+    if(res != conv_map.end())
+    {
+        return res->second;
+    }
+    else
+    {
+        MIOPEN_THROW("Invalid compute/cast type shorthand supplied");
+    }
+}
+
 template <typename T>
 bool readBufferFromFile(T* data, size_t dataNumItems, const char* fileName)
 {
@@ -225,6 +245,7 @@ class ConvDriver : public Driver
     int ChkLayout_ShortName();
     int GetandSetData() override;
+    bool TensorsCasted() const;
     std::vector<int> GetInputTensorLengthsFromCmdLine();
     std::vector<int> GetWeightTensorLengthsFromCmdLine();
     std::vector<int> GetBiasTensorLengthsFromCmdLine();
@@ -381,9 +402,14 @@ class ConvDriver : public Driver
         // Computation error of fp16 is ~2^13 (=8192) bigger than
         // the one of fp32 because mantissa is shorter by 13 bits.
         auto tolerance = (sizeof(Tgpu) == 4 || sizeof(Tgpu) == 1) ? 1.5e-6 : 8.2e-3;
+        // bf16 mantissa has 7 bits, 3 bits shorter than fp16's.
         if(std::is_same<Tgpu, bfloat16>::value)
             tolerance *= 8.0;
+        constexpr bool is_fp8  = std::is_same<Tgpu, float8>::value;
+        constexpr bool is_bfp8 = std::is_same<Tgpu, bfloat8>::value;
+        if(is_bfp8 || is_fp8 || TensorsCasted())
+            tolerance *= 37.0;
         return tolerance;
     }
@@ -557,6 +583,34 @@ int ConvDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
     if(solution_value >= 0)
         immediate_solution = solution_value;
+    const std::set<std::string> valid_cast_types = {"fp32", "fp16", "bf16", "fp8", "bf8"};
+    if(inflags.GetValueStr("in_cast_type") != "-1")
+    {
+        const auto in_cast_type = inflags.GetValueStr("in_cast_type");
+        if(valid_cast_types.find(in_cast_type) == valid_cast_types.end())
+        {
+            std::cout << "Invalid value for in_cast_type argument: " << in_cast_type << std::endl;
+            return 1;
+        }
+    }
+    if(inflags.GetValueStr("wei_cast_type") != "-1")
+    {
+        const auto wei_cast_type = inflags.GetValueStr("wei_cast_type");
+        if(valid_cast_types.find(wei_cast_type) == valid_cast_types.end())
+        {
+            std::cout << "Invalid value for wei_cast_type argument: " << wei_cast_type << std::endl;
+            return 1;
+        }
+    }
+    if(inflags.GetValueStr("out_cast_type") != "-1")
+    {
+        const auto out_cast_type = inflags.GetValueStr("out_cast_type");
+        if(valid_cast_types.find(out_cast_type) == valid_cast_types.end())
+        {
+            std::cout << "Invalid value for out_cast_type argument: " << out_cast_type << std::endl;
+            return 1;
+        }
+    }
     return 0;
 }
@@ -619,6 +673,14 @@ int ConvDriver<Tgpu, Tref>::ChkLayout_ShortName()
     }
 }
+template <typename Tgpu, typename Tref>
+bool ConvDriver<Tgpu, Tref>::TensorsCasted() const
+{
+    return inflags.GetValueStr("in_cast_type") != "-1" ||
+           inflags.GetValueStr("wei_cast_type") != "-1" ||
+           inflags.GetValueStr("out_cast_type") != "-1";
+}
+
 template <typename Tgpu, typename Tref>
 int ConvDriver<Tgpu, Tref>::GetandSetData()
 {
     std::vector<int> in_len  = GetInputTensorLengthsFromCmdLine();
     std::vector<int> wei_len = GetWeightTensorLengthsFromCmdLine();
     SetTensorNd(inputTensor, in_len, inflags.GetValueStr("in_layout"), data_type);
+    if(inflags.GetValueStr("in_cast_type") != "-1")
+    {
+        const auto in_cast_type = DataTypeFromShortString(inflags.GetValueStr("in_cast_type"));
+        miopenSetTensorCastType(inputTensor, in_cast_type);
+    }
     SetTensorNd(weightTensor, wei_len, inflags.GetValueStr("fil_layout"), data_type);
+    if(inflags.GetValueStr("wei_cast_type") != "-1")
+    {
+        const auto wei_cast_type = DataTypeFromShortString(inflags.GetValueStr("wei_cast_type"));
+        miopenSetTensorCastType(weightTensor, wei_cast_type);
+    }
     if(inflags.GetValueInt("tensor_vect") == 1 && data_type == miopenInt8)
     {
@@ -658,6 +730,11 @@ int ConvDriver<Tgpu, Tref>::GetandSetData()
     miopenDataType_t y_type
= (data_type == miopenInt8 || data_type == miopenInt8x4) ? miopenInt32 : data_type; SetTensorNd(outputTensor, out_len, inflags.GetValueStr("out_layout"), y_type); + if(inflags.GetValueStr("out_cast_type") != "-1") + { + const auto out_cast_type = DataTypeFromShortString(inflags.GetValueStr("out_cast_type")); + miopenSetTensorCastType(outputTensor, out_cast_type); + } if(inflags.GetValueInt("bias") != 0) { @@ -821,6 +898,12 @@ int ConvDriver::AddCmdLineArgs() "\n Immediate mode, build and run specified solution" "\n Use Find() API", "string"); + inflags.AddInputFlag( + "in_cast_type", 'U', "-1", "Cast type for input tensor, default to not set", "string"); + inflags.AddInputFlag( + "out_cast_type", 'T', "-1", "Cast type for output tensor, default to not set", "string"); + inflags.AddInputFlag( + "wei_cast_type", 'R', "-1", "Cast type for weight tensor, default to not set", "string"); return 0; } @@ -1049,7 +1132,6 @@ int ConvDriver::SetConvDescriptorFromCmdLineArgs() convDesc, spatial_dim, pads.data(), conv_strides.data(), conv_dilations.data(), mode); miopenSetConvolutionGroupCount(convDesc, group_count); - if(mode == miopenTranspose) { miopenSetTransposeConvNdOutputPadding(convDesc, spatial_dim, trans_output_pads.data()); @@ -1109,6 +1191,32 @@ void RanGenSubnormBuffer(T* buf, size_t size, int percentage) }); } +template <> +float8 RanGenWeights() +{ + const auto tmp = + prng::gen_0_to_B(1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (prng::gen_0_to_B(1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + +template <> +bfloat8 RanGenWeights() +{ + const auto tmp = + prng::gen_0_to_B(1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (prng::gen_0_to_B(1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + } // namespace detail template @@ -1135,11 +1243,14 @@ int ConvDriver::AllocateBuffersAndCopy() } } - bool is_transform = IsInputTensorTransform(); - bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; - size_t in_sz = GetTensorSize(inputTensor); - size_t wei_sz = GetTensorSize(weightTensor); - size_t out_sz = GetTensorSize(outputTensor); + bool is_transform = IsInputTensorTransform(); + bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; + // Data generated for very low precision types follows the same constraints whether its fp8, + // bfp8 or even if the interim tensors are being casted + bool is_fp8 = data_type == miopenFloat8 || data_type == miopenBFloat8 || TensorsCasted(); + size_t in_sz = GetTensorSize(inputTensor); + size_t wei_sz = GetTensorSize(weightTensor); + size_t out_sz = GetTensorSize(outputTensor); auto subnorm_percentage = miopen::Value(MIOPEN_DRIVER_SUBNORM_PERCENTAGE{}); // Workaround: Pad buffers allocations to be a multiple of 2M @@ -1338,7 +1449,10 @@ int ConvDriver::AllocateBuffersAndCopy() if(!weiFileName.empty()) weiRead = readBufferFromFile(wei.data.data(), wei_sz, weiFileName.c_str()); - const Tgpu Data_scale = is_int8 ? static_cast(127) : static_cast(0.01); + const Tgpu Data_scale = is_int8 ? static_cast(127) + : (is_fp8 ? static_cast(1.0) : static_cast(0.01)); + const Tgpu Data_min = (is_fp8 ? 
static_cast(-1.0) : static_cast(0.0)); + const Tgpu Data_max = (is_fp8 ? static_cast(1.0) : static_cast(1.0)); if(is_int8) { if(inflags.GetValueInt("bias") != 0) @@ -1361,6 +1475,7 @@ int ConvDriver::AllocateBuffersAndCopy() } else { + bool doutRead = false; if(is_bwd || is_wrw) if(!doutFileName.empty()) @@ -1375,7 +1490,8 @@ int ConvDriver::AllocateBuffersAndCopy() /// initialization of input buffers regardless of which kinds of /// convolutions are currently selectedfor testing (see the "-F" option). /// Verification cache would be broken otherwise. - auto val = prng::gen_0_to_B(Data_scale); + auto val = + is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) : prng::gen_0_to_B(Data_scale); if(is_bwd || is_wrw) dout.data[i] = val; } @@ -1394,8 +1510,11 @@ int ConvDriver::AllocateBuffersAndCopy() db_host = tensor(miopen::deref(biasTensor)); for(int i = 0; i < b_sz; i++) { - b.data[i] = static_cast(i % 8) + prng::gen_canonical(); - db[i] = static_cast(i % 8) + prng::gen_canonical(); + b.data[i] = + static_cast(i % 8) + + (is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) : prng::gen_canonical()); + db[i] = static_cast(i % 8) + (is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) + : prng::gen_canonical()); } if(!biasFileName.empty()) @@ -1413,7 +1532,7 @@ int ConvDriver::AllocateBuffersAndCopy() for(int i = 0; i < in_sz; i++) { /// \ref move_rand - auto val = prng::gen_0_to_B(Data_scale); + auto val = is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) : prng::gen_0_to_B(Data_scale); if(is_fwd || is_wrw) in.data[i] = val; } @@ -1475,7 +1594,12 @@ int ConvDriver::AllocateBuffersAndCopy() } if(is_fwd) { - out_dev = std::unique_ptr( + // TODO: For the temporary conversion to half, this is required, however, that would also + // need change elsewhere which has not yet been implemented out_dev = + // std::unique_ptr(new GPUMem( + // ctx, out_sz, is_int8 ? sizeof(float) : (is_fp8 ? sizeof(half) : sizeof(Tgpu)))); + std::ignore = is_fp8; + out_dev = std::unique_ptr( new GPUMem(ctx, out_sz, is_int8 ? sizeof(float) : sizeof(Tgpu))); status |= (is_int8 ? out_dev->ToGPU(q, out_int8.data()) : out_dev->ToGPU(q, out.data.data())); @@ -1496,7 +1620,8 @@ bool ConvDriver::UseGPUReference() { if((miopen_type{} == miopenFloat && (miopen_type{} == miopenFloat || miopen_type{} == miopenHalf || - miopen_type{} == miopenBFloat16)) || + miopen_type{} == miopenBFloat16 || miopen_type{} == miopenFloat8 || + miopen_type{} == miopenBFloat8)) || (miopen_type{} == miopenInt32 && miopen_type{} == miopenInt8)) return true; else @@ -3394,8 +3519,8 @@ int ConvDriver::VerifyBackward() else { std::cout << "Backward Convolution Data Verifies OK on " - << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error_data << ')' - << std::endl; + << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error_data + << " < " << tolerance << ')' << std::endl; } } @@ -3429,6 +3554,9 @@ int ConvDriver::VerifyBackward() else if(std::is_same::value) tolerance *= 5; } + // bfloat8 has very poor accuracy in wrw direction + if(std::is_same::value) + tolerance = tolerance * 2; auto error_weights = is_wrw_run_failed ? std::numeric_limits::max() : miopen::rms_range(dwei_host.data, dwei); @@ -3443,7 +3571,7 @@ int ConvDriver::VerifyBackward() { std::cout << "Backward Convolution Weights Verifies OK on " << (UseGPUReference() ? 
"GPU" : "CPU") << " reference (" << error_weights - << ')' << std::endl; + << " < " << tolerance << ')' << std::endl; } } diff --git a/driver/driver.hpp b/driver/driver.hpp index 0760a749e7..8e15894705 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -34,8 +34,6 @@ #include "random.hpp" -using float16 = half_float::half; - #include "InputFlags.hpp" #include #include @@ -44,6 +42,12 @@ using float16 = half_float::half; #include #include #include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +using float16 = half_float::half; +using float8 = miopen_f8::hip_f8; +using bfloat8 = miopen_f8::hip_f8; #include #include @@ -143,7 +147,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[gnu::noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16], CBAInfer[fp16], " + printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], " "tensorop[fp16], reduce[fp16,fp64]\n"); @@ -161,13 +165,13 @@ inline std::string ParseBaseArg(int argc, char* argv[]) std::string arg = argv[1]; if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && - arg != "CBAInfer" && arg != "CBAInferfp16" && arg != "pool" && arg != "poolfp16" && - arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && - arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && arg != "bnormfp16" && - arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && - arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" && arg != "dropout" && - arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && arg != "reduce" && - arg != "reducefp16" && arg != "reducefp64" && arg != "--version") + arg != "convfp8" && arg != "convbfp8" && arg != "CBAInfer" && arg != "CBAInferfp16" && + arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && + arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && + arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && + arg != "rnn_seqfp16" && arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" && + arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && + arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); @@ -249,6 +253,16 @@ inline void Driver::InitDataType() { data_type = miopenBFloat16; } +template <> +inline void Driver::InitDataType() +{ + data_type = miopenFloat8; +} +template <> +inline void Driver::InitDataType() +{ + data_type = miopenBFloat8; +} // "std::is_same{}" used to avoid "static_assert" compilation error, // which occurs when the condition does not depend in any way on the template parameters. 
template diff --git a/driver/main.cpp b/driver/main.cpp index 6db3952c12..abdefc34a3 100644 --- a/driver/main.cpp +++ b/driver/main.cpp @@ -81,6 +81,14 @@ int main(int argc, char* argv[]) { drv = new ConvDriver(); } + else if(base_arg == "convfp8") + { + drv = new ConvDriver(); + } + else if(base_arg == "convbfp8") + { + drv = new ConvDriver(); + } else if(base_arg == "CBAInfer") { drv = new CBAInferFusionDriver(); diff --git a/include/miopen/config.h.in b/include/miopen/config.h.in index 67edaeb284..74b0185fde 100644 --- a/include/miopen/config.h.in +++ b/include/miopen/config.h.in @@ -74,6 +74,8 @@ // remain in the future) perform final conversion (and rounding) of FP32 // to BF16 results. This affects the main functionality of the library. #cmakedefine01 MIOPEN_USE_RNE_BFLOAT16 +#cmakedefine01 MIOPEN_FP8_IEEE_EXPONENT_BIAS +#cmakedefine01 MIOPEN_FP8_CLIPPING // clang-format off #cmakedefine MIOPEN_DEFAULT_FIND_MODE @MIOPEN_DEFAULT_FIND_MODE@ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 7bc268620b..fcbc60a0b2 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -111,6 +111,12 @@ typedef enum miopenStatusVersionMismatch = 10, /*!< Version mismatch of the supplied binary data argment. */ } miopenStatus_t; +typedef enum +{ + miopenF8RoundingModeStandard = 0, + miopenF8RoundingModeStochastic = 1, +} miopenF8RoundingMode_t; + /*! @brief Get character string for an error code. * * A function which returns a NULL terminated character string of the error code. @@ -347,7 +353,9 @@ typedef enum 4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */ miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction) (Partially supported) */ - miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ + miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ + miopenFloat8 = 7, + miopenBFloat8 = 8 } miopenDataType_t; /*! @ingroup tensor @@ -581,6 +589,11 @@ typedef enum MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC = 1, /*!< Restrict MIOpen convolutions to kernels which produce numerically deterministic results. 0 - disabled (default), 1 - enabled >*/ + MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE = + 2, /*!*/ } miopenConvolutionAttrib_t; /** @addtogroup tensor @@ -698,7 +711,19 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t const int* dimsA, const int* stridesA); -/*! @brief Get size of N-dimensional tensor +/*! @brief Set the tensor cast type + * + * For tensors where the cast_type attribute is set, the tensor elements would be converted to the + * target type before the target operation is applied. Currently, only supported for convolution + * operations targeting the FP8 datatype + * + * @param tensorDesc Tensor descriptor type (input) + * @param cast_type MIOpen datatype (input) + */ +MIOPEN_EXPORT miopenStatus_t miopenSetTensorCastType(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t cast_type); + +/*! @brief Set shape of N-dimensional tensor * * Interface for querying tensor size. MIOpen has support for 1, 2, 3, 4, 5 dimensional tensor of * layout. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 95da2f345d..58ff101c33 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,10 +39,14 @@ endif() # remain in the future) perform final conversion (and rounding) of FP32 # to BF16 results. This affects the main functionality of the library. 
option( MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON )
+option( MIOPEN_FP8_IEEE_EXPONENT_BIAS "Sets the FP8 exponent bias to IEEE" ON)
+option( MIOPEN_FP8_CLIPPING "Sets the FP8 clipping" ON)
 set ( MIOPEN_DEFAULT_FIND_MODE "DynamicHybrid" CACHE STRING "Sets the default find mode")
 set_property(CACHE MIOPEN_DEFAULT_FIND_MODE PROPERTY STRINGS Normal Fast Hybrid FastHybrid DynamicHybrid)
 configure_file("${PROJECT_SOURCE_DIR}/include/miopen/config.h.in" "${PROJECT_BINARY_DIR}/include/miopen/config.h")
 # configure a header file to pass the CMake version settings to the source, and package the header files in the output archive
@@ -279,6 +283,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         ${COMPOSABLE_KERNEL_DYNAMIC_ASM_INCLUDE}
         ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_INCLUDE}
         include/miopen/implicitgemm_params.hpp
+        kernels/gpu_reference_kernel/fp8_kern_types.h
         kernels/Conv_Winograd_v13_3_12_fp16dot_stride1.inc
         kernels/Conv_Winograd_v13_3_12_fp16dot_stride2_dec.inc
         kernels/Conv_Winograd_v13_3_12_fp16dot_stride2_dil.inc
@@ -380,6 +385,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/bfloat16_dev.hpp
         kernels/float_types.h
         kernels/workaround_issue_1431.hpp
+        kernels/hip_f8_impl.hpp
+        kernels/hip_float8.hpp
         )
     set(MIOPEN_KERNELS
@@ -392,7 +399,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP}
         ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE}
         kernels/detect_llvm_amdgcn_buffer_atomic_fadd_f32_float.cpp
-        kernels/MIOpenCheckNumerics.cl
+        kernels/MIOpenCheckNumerics.cpp
         kernels/MIOpenBatchNormActivBwdPerAct.cl
         kernels/MIOpenBatchNormActivBwdSpatial.cl
         kernels/MIOpenBatchNormActivFwdTrainPerAct.cl
diff --git a/src/check_numerics.cpp b/src/check_numerics.cpp
index b7d6bc1eb8..b0882995e9 100644
--- a/src/check_numerics.cpp
+++ b/src/check_numerics.cpp
@@ -53,30 +53,41 @@ struct CheckNumericsResult
     int hasInf = 0;
 };
+std::string GetKernelName(miopenDataType_t data_type)
+{
+    switch(data_type)
+    {
+    case miopenFloat: return {"check_numerics_fp32"};
+    case miopenHalf: return {"check_numerics_fp16"};
+    case miopenBFloat16: return {"check_numerics_bf16"};
+    case miopenFloat8: return {"check_numerics_fp8"};
+    case miopenBFloat8: return {"check_numerics_bf8"};
+    case miopenInt32:
+    case miopenInt8:
+    case miopenInt8x4:
+    case miopenDouble:
+    default: return {""};
+    }
+}
+
 bool checkNumericsImpl(
     const Handle& handle, int mode, const TensorDescriptor& dDesc, ConstData_t data, bool isInput)
 {
     int numElements = dDesc.GetElementSize();
-
-    // TODO - some constants we should get from the device:
-    const int blockSize = 256;
-    const auto numBlocks = handle.GetMaxComputeUnits() * 6;
-    const size_t numGlobalWorkItems = blockSize * numBlocks;
-
-    const int computeStats = (mode & CheckNumerics::ComputeStats);
-
     CheckNumericsResult abnormal_h;
-
     auto abnormal_d = handle.Create(sizeof(CheckNumericsResult)); // TODO - someday avoid slow malloc/free here
     handle.WriteTo(&abnormal_h, abnormal_d, sizeof(CheckNumericsResult));
-
-    std::string params = GetDataTypeKernelParams(dDesc.GetType());
-    std::string program_name = "MIOpenCheckNumerics.cl";
-    std::string kernel_name = "MIOpenCheckNumerics";
-    const std::vector<size_t> vld = {size_t{blockSize}, size_t{1}, size_t{1}};
-    const std::vector<size_t> vgd = {numGlobalWorkItems, size_t{1}, size_t{1}};
-    handle.AddKernel("MIOpenCheckNumerics", "", program_name,
kernel_name, vld, vgd, params)( + const size_t threadsPerBlock = 256; + const size_t numBlocks = handle.GetMaxComputeUnits() * 6; + const int computeStats = (mode & CheckNumerics::ComputeStats); + // TODO - some constants we should get from the device: + std::string program_name = "MIOpenCheckNumerics.cpp"; + std::string kernel_name = GetKernelName(dDesc.GetType()); + const std::vector vld = {size_t{threadsPerBlock}, size_t{1}, size_t{1}}; + const std::vector vgd = {numBlocks, size_t{1}, size_t{1}}; + handle.AddKernel( + "MIOpenCheckNumerics", "MIOpenCheckNumerics", program_name, kernel_name, vld, vgd, "")( data, numElements, abnormal_d.get(), computeStats); handle.ReadTo(&abnormal_h, abnormal_d, sizeof(CheckNumericsResult)); diff --git a/src/conv/problem_description.cpp b/src/conv/problem_description.cpp index e44160b4d5..76c47cbcd9 100644 --- a/src/conv/problem_description.cpp +++ b/src/conv/problem_description.cpp @@ -224,7 +224,7 @@ bool ProblemDescription::IsNCHWc_CHWNc() const void ProblemDescription::SetupFloats(ExecutionContext& ctx) const { - if(IsFp32() || IsFp16() || IsBfp16() || IsInt8()) + if(IsFp32() || IsFp16() || IsBfp16() || IsInt8() || IsFp8() || IsBfp8()) { ctx.general_compile_options += GetDataTypeKernelParams(GetInDataType()); return; diff --git a/src/convolution.cpp b/src/convolution.cpp index cb6cde5eda..5f7539f70d 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -519,6 +519,17 @@ void ConvolutionAttribute::Set(miopenConvolutionAttrib_t attr, int value) std::to_string(value)); deterministic.value = value; } + else if(attr == MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE) + { + const auto rounding_mode = static_cast(value); + if(rounding_mode != miopenF8RoundingModeStochastic && + rounding_mode != miopenF8RoundingModeStandard) + MIOPEN_THROW(miopenStatusBadParm, + "[Set conv attribute] Error: Attempt to set invalid value for " + "MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE" + + std::to_string(value)); + fp8rounding_mode.rounding_mode = rounding_mode; + } else { MIOPEN_THROW(miopenStatusBadParm, @@ -531,6 +542,8 @@ int ConvolutionAttribute::Get(miopenConvolutionAttrib_t attr) const { if(attr == MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL) return gfx90aFp16alt.value; + else if(attr == MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE) + return static_cast(fp8rounding_mode.rounding_mode); else if(attr == MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC) return deterministic.value; MIOPEN_THROW(miopenStatusBadParm, diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index a75a58ee6b..804587aac0 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -35,6 +35,10 @@ #endif #if MIOPEN_USE_ROCBLAS +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-macros" +#define ROCBLAS_BETA_FEATURES_API 1 +#pragma clang diagnostic pop #if HIP_PACKAGE_VERSION_FLAT >= 5006000000ULL #include #else @@ -44,6 +48,7 @@ #include #else #include +#define USE_ROCBLAS_GEMM_EX3 ((MIOPEN_ROCBLAS_VERSION_FLAT >= 2047000) && ROCBLAS_BETA_FEATURES_API) #endif #include #endif @@ -68,14 +73,126 @@ /// Let's pass literal value as a workaround; there should be no harm. #define USE_GEMM_FLAGS_FP16_ALT_IMPL_242 (MIOPEN_ROCBLAS_VERSION_FLAT == 2042000) +static inline uint32_t +FlagsForRocblasFp32Fp16Call(const miopen::GemmDescriptor& desc) // bool gfx90aFp16Alt) +{ +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + return desc.gfx90a_alt_impl ? rocblas_gemm_flags_fp16_alt_impl : 0; +#elif USE_GEMM_FLAGS_FP16_ALT_IMPL_242 + return desc.gfx90a_alt_impl ? 
0x4 : 0; +#else + std::ignore = desc; + MIOPEN_LOG_W("The gfx90aFp16Alt is not supported by rocBlas"); + return 0; +#endif +#if USE_GEMM_FLAGS_FP16_ALT_IMPL_242 // -warning: macro is not used +#endif +} + +#if USE_ROCBLAS_GEMM_EX3 +static inline rocblas_computetype rocBlasComputeType_ex3(const miopen::GemmDescriptor& desc) +{ + if(desc.a_cast_type == miopenFloat8 && desc.b_cast_type == miopenFloat8) + return rocblas_compute_type_f8_f8_f32; + else if(desc.a_cast_type == miopenFloat8 && desc.b_cast_type == miopenBFloat8) + return rocblas_compute_type_f8_bf8_f32; + else if(desc.a_cast_type == miopenBFloat8 && desc.b_cast_type == miopenFloat8) + return rocblas_compute_type_bf8_f8_f32; + else if(desc.a_cast_type == miopenBFloat8 && desc.b_cast_type == miopenBFloat8) + return rocblas_compute_type_bf8_bf8_f32; + else + return rocblas_compute_type_f32; +} +#endif + +static inline rocblas_datatype rocBlasComputeType(const miopen::GemmDescriptor& desc) +{ + // Complex compute types are only supported in newer version of the API + assert(desc.dataType == desc.a_cast_type && desc.dataType == desc.b_cast_type); + if(desc.dataType == miopenInt8 || desc.dataType == miopenInt8x4) + return rocblas_datatype::rocblas_datatype_i32_r; + else + return rocblas_datatype::rocblas_datatype_f32_r; +} + +auto rocBlasDataType(miopenDataType_t data_type) +{ + if(data_type == miopenFloat8) + return rocblas_datatype::rocblas_datatype_f8_r; + else if(data_type == miopenBFloat8) + return rocblas_datatype::rocblas_datatype_bf8_r; + else if(data_type == miopenHalf) + return rocblas_datatype::rocblas_datatype_f16_r; + MIOPEN_THROW(miopenStatusInternalError, "Invalid data type passed"); +} + +template +rocblas_status miopen_rocblas_gemm_ex3(const miopen::Handle& handle, + const miopen::GemmDescriptor& gemm_desc, + ConstData_t A, + int a_offset, + ConstData_t B, + int b_offset, + Data_t C, + int c_offset) +{ + rocblas_status rb_status = + rocblas_status::rocblas_status_internal_error; // cppcheck-suppress redundantInitialization +#if USE_ROCBLAS_GEMM_EX3 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdisabled-macro-expansion" +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; + auto flags = FlagsForRocblasFp32Fp16Call(gemm_desc); + if(gemm_desc.conv_attributes.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic) + flags = flags | rocblas_gemm_flags::rocblas_gemm_flags_stochastic_rounding; + + rb_status = // cppcheck-suppress redundantInitialization + rocblas_gemm_ex3(handle.rhandle().get(), + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.lda, + static_cast(B) + b_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.ldb, + &beta, + static_cast(C) + c_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.ldc, + static_cast(C) + c_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.ldc, + rocBlasComputeType_ex3(gemm_desc), + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + flags); // gfx90a_alt_impl)); +#pragma clang diagnostic pop +#endif + MIOPEN_THROW(miopenStatusBadParm, "An appropriate version of rocBLAS is required for this op"); + std::ignore = handle; + std::ignore = gemm_desc; + return rb_status; +} + template -auto miopen_rocblas_gemm_ex(Ts... 
xs) +auto miopen_rocblas_gemm_ex(const miopen::Handle& handle, + const miopen::GemmDescriptor& gemm_desc, + Ts... xs) { + std::ignore = handle; + std::ignore = gemm_desc; #if AVOID_ROCBLAS_WRAPPERS_204 - return (rocblas_gemm_ex)(xs...); + return (rocblas_gemm_ex)(handle.rhandle().get(), xs...); #else std::size_t zero = 0; - return rocblas_gemm_ex(xs..., &zero, nullptr); + return rocblas_gemm_ex(handle.rhandle().get(), xs..., &zero, nullptr); #endif } @@ -90,6 +207,33 @@ auto miopen_rocblas_gemm_strided_batched_ex(Ts... xs) #endif } +template +rocblas_status miopen_rocblas_gemm_strided_batched_ex3(const miopen::Handle& handle, + const miopen::GemmDescriptor& gemm_desc, + ConstData_t A, + int a_offset, + ConstData_t B, + int b_offset, + Data_t C, + int c_offset) +{ + rocblas_status rb_status = rocblas_status::rocblas_status_internal_error; + // Until there is a batched counter part to the ex3 rocBlas call we need to iterate over the + // batched GEMM + for(int bCount = 0; bCount < gemm_desc.batch_count; ++bCount) + { + rb_status = miopen_rocblas_gemm_ex3(handle, + gemm_desc, + A, + a_offset + (bCount * gemm_desc.strideA), + B, + b_offset + (bCount * gemm_desc.strideB), + C, + c_offset + (bCount * gemm_desc.strideC)); + } + return rb_status; +} + #endif // MIOPEN_USE_ROCBLAS MIOPEN_DECLARE_ENV_VAR(MIOPEN_GEMM_ENFORCE_BACKEND) @@ -114,7 +258,10 @@ std::ostream& operator<<(std::ostream& stream, const GemmDescriptor& gemm_desc) << "strideC " << gemm_desc.strideC << ", " << "alpha " << gemm_desc.alpha << ", " << "beta " << gemm_desc.beta << ", " - << "dataType " << gemm_desc.dataType << "} "; + << "dataType " << gemm_desc.dataType << "a_cast_type" << gemm_desc.a_cast_type + << ", " + << "b_cast_type" << gemm_desc.b_cast_type << ", " + << "} "; } #if MIOPEN_USE_ROCBLAS @@ -207,8 +354,7 @@ miopenStatus_t CallGemmTimeMeasure(const Handle& handle, int c_offset, bool time_precision, CallGemmType_t call_gemm_type, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { switch(call_gemm_type) { @@ -216,103 +362,37 @@ miopenStatus_t CallGemmTimeMeasure(const Handle& handle, if(time_precision) { // rocBLAS need a warm-up call for accurate timing - CallGemm(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + CallGemm(handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } - return CallGemm(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + return CallGemm(handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } case callGemmStridedBatched: { if(time_precision) { // rocBLAS need extra warm-up call for accurate timing - CallGemmStridedBatched(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + CallGemmStridedBatched( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } - return CallGemmStridedBatched(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + return CallGemmStridedBatched( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } case callGemmStridedBatchedSequential: { if(time_precision) { // rocBLAS need a warm-up call for accurate timing - CallGemmStridedBatchedSequential(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + CallGemmStridedBatchedSequential( + handle, gemm_desc, A, a_offset, B, b_offset, 
C, c_offset, gemm_backend); } - return CallGemmStridedBatchedSequential(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + return CallGemmStridedBatchedSequential( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } } return miopenStatusNotImplemented; } -#if MIOPEN_USE_ROCBLAS -static inline uint32_t FlagsForRocblasFp32Fp16Call(const bool gfx90aFp16Alt) -{ -#if USE_GEMM_FLAGS_FP16_ALT_IMPL - return gfx90aFp16Alt ? rocblas_gemm_flags_fp16_alt_impl : 0; -#elif USE_GEMM_FLAGS_FP16_ALT_IMPL_242 - return gfx90aFp16Alt ? 0x4 : 0; -#else - std::ignore = gfx90aFp16Alt; - return 0; -#endif -#if USE_GEMM_FLAGS_FP16_ALT_IMPL_242 // -warning: macro is not used -#endif -} -#endif // MIOPEN_USE_ROCBLAS - miopenStatus_t CallGemm(const Handle& handle, GemmDescriptor gemm_desc, ConstData_t A, @@ -321,8 +401,7 @@ miopenStatus_t CallGemm(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { MIOPEN_LOG_I2("gemm_desc: " << gemm_desc); @@ -368,7 +447,8 @@ miopenStatus_t CallGemm(const Handle& handle, auto beta = int(gemm_desc.beta); rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -388,7 +468,7 @@ miopenStatus_t CallGemm(const Handle& handle, static_cast(C) + c_offset, rocblas_datatype::rocblas_datatype_i32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_i32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, #if USE_GEMM_FLAGS_PACK_INT8X4 @@ -401,35 +481,61 @@ miopenStatus_t CallGemm(const Handle& handle, break; case miopenInt32: break; case miopenHalf: { - - float alpha = gemm_desc.alpha; - float beta = gemm_desc.beta; - - rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), - gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.m, - gemm_desc.n, - gemm_desc.k, - &alpha, - static_cast(A) + a_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.lda, - static_cast(B) + b_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldb, - &beta, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, - rocblas_gemm_algo::rocblas_gemm_algo_standard, - 0, - FlagsForRocblasFp32Fp16Call(gfx90a_alt_impl)); + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + // We need ex3 API if any of the dataType or the cast type is an 8-bit floating type + const auto needs_ex3 = [&]() { + if((gemm_desc.dataType == miopenFloat8 || gemm_desc.dataType == miopenBFloat8) || + (gemm_desc.a_cast_type == miopenFloat8 || + gemm_desc.a_cast_type == miopenBFloat8) || + (gemm_desc.b_cast_type == miopenBFloat8 || + gemm_desc.b_cast_type == miopenFloat8)) + return true; + else + return false; + }(); + // ex3 API only works on the gfx94x ASIC; + if(needs_ex3) + { + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + } + else + { + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; + rb_status = miopen_rocblas_gemm_ex( + handle, + gemm_desc, + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.lda, + static_cast(B) + b_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldb, + &beta, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + rocBlasComputeType(gemm_desc), + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + FlagsForRocblasFp32Fp16Call(gemm_desc)); // gfx90a_alt_impl)); + } } break; @@ -439,7 +545,8 @@ miopenStatus_t CallGemm(const Handle& handle, float beta = gemm_desc.beta; rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -459,7 +566,7 @@ miopenStatus_t CallGemm(const Handle& handle, static_cast(C) + c_offset, rocblas_datatype::rocblas_datatype_bf16_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); @@ -471,7 +578,8 @@ miopenStatus_t CallGemm(const Handle& handle, float beta = gemm_desc.beta; rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -491,13 +599,27 @@ miopenStatus_t CallGemm(const Handle& handle, static_cast(C) + c_offset, rocblas_datatype::rocblas_datatype_f32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); } break; + case miopenFloat8: + case miopenBFloat8: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + }; + break; + case miopenDouble: { MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM."); @@ -531,8 +653,7 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { MIOPEN_LOG_I2("gemm_desc: " << gemm_desc); @@ -560,7 +681,6 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, HipEventPtr stop = nullptr; if(handle.IsProfilingEnabled()) { - ProfilingRecordStart(handle, start, stop); } rocblas_atomics_mode cur_mode = @@ -619,40 +739,67 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, case miopenInt32: break; case miopenHalf: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + // We need ex3 API if any of the dataType or the cast type is an 8-bit floating type + const auto needs_ex3 = [&]() { + if((gemm_desc.dataType == miopenFloat8 || gemm_desc.dataType == miopenBFloat8) || + (gemm_desc.a_cast_type == miopenFloat8 || + gemm_desc.a_cast_type == miopenBFloat8) || + (gemm_desc.b_cast_type == miopenBFloat8 || + gemm_desc.b_cast_type == miopenFloat8)) + return true; + else + return false; + }(); + // ex3 API only works on the gfx94x ASIC; + if(needs_ex3) + { + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + } + else + { - float alpha = gemm_desc.alpha; - float beta = gemm_desc.beta; + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; - rb_status = miopen_rocblas_gemm_strided_batched_ex( - handle.rhandle().get(), - gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.m, - gemm_desc.n, - gemm_desc.k, - &alpha, - static_cast(A) + a_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.lda, - gemm_desc.strideA, - static_cast(B) + b_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldb, - gemm_desc.strideB, - &beta, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - gemm_desc.strideC, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - gemm_desc.strideC, - gemm_desc.batch_count, - rocblas_datatype::rocblas_datatype_f32_r, - rocblas_gemm_algo::rocblas_gemm_algo_standard, - 0, - FlagsForRocblasFp32Fp16Call(gfx90a_alt_impl)); + rb_status = miopen_rocblas_gemm_strided_batched_ex( + handle.rhandle().get(), + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.lda, + gemm_desc.strideA, + static_cast(B) + b_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldb, + gemm_desc.strideB, + &beta, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + gemm_desc.strideC, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + gemm_desc.strideC, + gemm_desc.batch_count, + rocblas_datatype::rocblas_datatype_f32_r, + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + FlagsForRocblasFp32Fp16Call(gemm_desc)); + } } break; @@ -730,6 +877,21 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, } break; + case miopenFloat8: + case miopenBFloat8: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + + break; + } + case miopenDouble: { MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM."); @@ -764,8 +926,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { MIOPEN_LOG_I2("gemm_desc: " << gemm_desc); @@ -816,7 +977,8 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, for(int i = 0; i < gemm_desc.batch_count; ++i) { rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -836,7 +998,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, static_cast(C) + c_offset + i * gemm_desc.strideC, rocblas_datatype::rocblas_datatype_i32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_i32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, #if USE_GEMM_FLAGS_PACK_INT8X4 @@ -850,37 +1012,65 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, break; case miopenInt32: break; case miopenHalf: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + // We need ex3 API if any of the dataType or the cast type is an 8-bit floating type + const auto needs_ex3 = [&]() { + if((gemm_desc.dataType == miopenFloat8 || gemm_desc.dataType == miopenBFloat8) || + (gemm_desc.a_cast_type == miopenFloat8 || + gemm_desc.a_cast_type == miopenBFloat8) || + (gemm_desc.b_cast_type == miopenBFloat8 || + gemm_desc.b_cast_type == miopenFloat8)) + return true; + else + return false; + }(); + // ex3 API only works on the gfx94x ASIC; + if(needs_ex3) + { + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + } + else + { - float alpha = gemm_desc.alpha; - float beta = gemm_desc.beta; + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; - for(int i = 0; i < gemm_desc.batch_count; ++i) - { - rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), - gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.m, - gemm_desc.n, - gemm_desc.k, - &alpha, - static_cast(A) + a_offset + i * gemm_desc.strideA, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.lda, - static_cast(B) + b_offset + i * gemm_desc.strideB, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldb, - &beta, - static_cast(C) + c_offset + i * gemm_desc.strideC, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - static_cast(C) + c_offset + i * gemm_desc.strideC, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, - rocblas_gemm_algo::rocblas_gemm_algo_standard, - 0, - FlagsForRocblasFp32Fp16Call(gfx90a_alt_impl)); + for(int i = 0; i < gemm_desc.batch_count; ++i) + { + rb_status = miopen_rocblas_gemm_ex( + handle, + gemm_desc, + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset + i * gemm_desc.strideA, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.lda, + static_cast(B) + b_offset + i * gemm_desc.strideB, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldb, + &beta, + static_cast(C) + c_offset + i * gemm_desc.strideC, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + static_cast(C) + c_offset + i * gemm_desc.strideC, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + FlagsForRocblasFp32Fp16Call(gemm_desc)); + } } } break; @@ -892,7 +1082,8 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, for(int i = 0; i < gemm_desc.batch_count; ++i) { rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -912,7 +1103,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, static_cast(C) + c_offset + i * gemm_desc.strideC, rocblas_datatype::rocblas_datatype_bf16_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); @@ -927,7 +1118,8 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, for(int i = 0; i < gemm_desc.batch_count; ++i) { rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -947,7 +1139,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, static_cast(C) + c_offset + i * gemm_desc.strideC, rocblas_datatype::rocblas_datatype_f32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); @@ -955,6 +1147,21 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, } break; + case miopenFloat8: + case miopenBFloat8: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + + break; + } + case miopenDouble: { MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM."); diff --git a/src/hipoc/hipoc_program.cpp b/src/hipoc/hipoc_program.cpp index eea3b32291..8a87d0d3f7 100644 --- a/src/hipoc/hipoc_program.cpp +++ b/src/hipoc/hipoc_program.cpp @@ -321,23 +321,20 @@ void HIPOCProgramImpl::BuildCodeObject(std::string params, return GetKernelSrc(program); }(); +#if MIOPEN_BUILD_DEV if(miopen::EndsWith(filename, ".cpp")) { -#if MIOPEN_BUILD_DEV params += " -Werror" + HipKernelWarningsString(); -#else - params += " -Wno-everything"; -#endif } else if(miopen::EndsWith(filename, ".cl")) { -#if MIOPEN_BUILD_DEV params += " -Werror" + (is_kernel_str ? 
MiopengemmWarningsString() : OclKernelWarningsString());
+    }
 #else
+    if(miopen::EndsWith(filename, ".cpp") || miopen::EndsWith(filename, ".cl"))
         params += " -Wno-everything";
 #endif
-    }
 #if MIOPEN_USE_COMGR
     /// \todo Refactor when functionality stabilizes.
     BuildCodeObjectInMemory(params, src, filename);
diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index c458f3421d..438e1d306a 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -55,6 +55,8 @@ inline std::string GetDataTypeName(miopenDataType_t data_type)
     case miopenInt32: return "INT32";
     case miopenBFloat16: return "BF16";
     case miopenDouble: return "FP64";
+    case miopenFloat8: return "FP8";
+    case miopenBFloat8: return "BFP8";
     }
     return "Unknown(" + std::to_string(data_type) + ")";
@@ -190,6 +192,7 @@ struct ProblemDescription : ProblemDescriptionBase
     // In getters
     miopenDataType_t GetInDataType() const { return in.GetType(); }
+    std::optional<miopenDataType_t> GetInCastType() const { return in.GetCastType(); }
     unsigned GetInBatchSize_() const { return GetN5(GetSpatialDims(), in.GetLengths()); }
     unsigned GetBatchSize_() const { return GetInBatchSize_(); } // alias of GetInBatchSize_()
     unsigned GetInChannels_() const { return GetC5(GetSpatialDims(), in.GetLengths()); }
@@ -223,6 +226,7 @@ struct ProblemDescription : ProblemDescriptionBase
     // Out getters
     miopenDataType_t GetOutDataType() const { return out.GetType(); }
+    std::optional<miopenDataType_t> GetOutCastType() const { return out.GetCastType(); }
     unsigned GetOutBatchSize_() const { return GetN5(GetSpatialDims(), out.GetLengths()); }
     unsigned GetOutChannels_() const { return GetC5(GetSpatialDims(), out.GetLengths()); }
     unsigned GetOutDepth_() const { return GetD5(GetSpatialDims(), out.GetLengths()); }
@@ -255,6 +259,7 @@ struct ProblemDescription : ProblemDescriptionBase
     // Weights getters
     miopenDataType_t GetWeightsDataType() const { return weights.GetType(); }
+    std::optional<miopenDataType_t> GetWeightsCastType() const { return weights.GetCastType(); }
     unsigned GetWeightsDepth_() const { return GetD5(GetSpatialDims(), weights.GetLengths()); }
     unsigned GetWeightsHeight_() const
     {
@@ -343,6 +348,20 @@ struct ProblemDescription : ProblemDescriptionBase
         return GetInDataType() == miopenInt8 && GetWeightsDataType() == miopenInt8 &&
                (GetOutDataType() == miopenInt32 || GetOutDataType() == miopenFloat);
     }
+    bool IsFp8() const
+    {
+        return GetInDataType() == miopenFloat8 || GetWeightsDataType() == miopenFloat8 ||
+               GetOutDataType() == miopenFloat8;
+    }
+    bool IsBfp8() const
+    {
+        return GetInDataType() == miopenBFloat8 || GetWeightsDataType() == miopenBFloat8 ||
+               GetOutDataType() == miopenBFloat8;
+    }
+    bool IsTensorsCasted() const
+    {
+        return GetInCastType() || GetWeightsCastType() || GetOutCastType();
+    }
     // To be used in Solvers that do not implement ALT FP16 kernels.
     // Those Solvers must be non-applicable for gfx90a when this function returns true.
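[Editor's note] The three predicates added above (IsFp8, IsBfp8, IsTensorsCasted) are what the dozens of small per-solver hunks in this patch key off: solvers with no 8-bit-float kernels simply declare themselves non-applicable. A minimal sketch of that guard pattern follows; it is a free-function illustration under assumed naming, not the literal text of any solver hunk in this patch:

    // Sketch only: in the patch this logic lives inside each solver's
    // IsApplicable(); "SolverSupportsProblem" is a placeholder name.
    bool SolverSupportsProblem(const miopen::conv::ProblemDescription& problem)
    {
        // FP8/BFP8 problems, and problems whose tensors carry a cast type,
        // are served by the naive reference kernels and the GEMM path, so a
        // legacy solver bails out early.
        return !(problem.IsFp8() || problem.IsBfp8() || problem.IsTensorsCasted());
    }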
diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp
index c4b5b7ea46..5e0507ddb6 100644
--- a/src/include/miopen/convolution.hpp
+++ b/src/include/miopen/convolution.hpp
@@ -45,9 +45,12 @@
 #include
 #include
 #include
+#include

 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL)
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC)
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE)
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED)

 namespace miopen {

@@ -77,6 +80,9 @@ using ExtraKernelArgs = std::tuple;

+struct ConvFwdTensors;
+struct ConvWrwTensors;
+
 struct ConvolutionAttribute
 {
     class Gfx90aFp16alt
@@ -100,6 +106,38 @@ struct ConvolutionAttribute
         friend void from_json(const nlohmann::json& json, Gfx90aFp16alt& attribute);
     } gfx90aFp16alt;

+    struct FP8RoundingMode
+    {
+        inline uint32_t InitSeed()
+        {
+            std::random_device rd;
+            std::mt19937 gen(rd());
+            std::uniform_int_distribution distribution(0, 0xFFFFFFFF);
+            return distribution(gen);
+        }
+        miopenF8RoundingMode_t rounding_mode = miopenF8RoundingModeStochastic;
+        uint32_t seed = InitSeed();
+        friend struct ConvolutionAttribute;
+
+        inline miopenF8RoundingMode_t Get() const
+        {
+            if(nullptr != miopen::GetStringEnv(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE{}))
+                return static_cast(
+                    miopen::Value(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE{}));
+            return rounding_mode;
+        }
+
+        inline uint32_t GetSeed() const
+        {
+            // assert(rounding_mode == miopenF8RoundingModeStochastic);
+            if(nullptr != miopen::GetStringEnv(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED{}))
+                return miopen::Value(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED{});
+            return seed;
+        }
+
+        inline void SetSeed(const uint32_t s) { seed = s; }
+    } fp8rounding_mode;
+
     class Deterministic
     {
         int value = 0;
diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 2a0cb48ef4..485bdb3d67 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -66,6 +66,14 @@ inline std::string GetDataType(miopenDataType_t type)
         type_str = "double";
     }
     break;
+    case miopenFloat8: {
+        type_str = "float8";
+    }
+    break;
+    case miopenBFloat8: {
+        type_str = "bfloat8";
+    }
+    break;
     }
     return type_str;
 }
@@ -133,6 +141,8 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     int use_int32 = 0;
     int use_bfp16 = 0;
     int use_fp64 = 0;
+    int use_fp8 = 0;
+    int use_bfp8 = 0;
     const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16;

     switch(type)
@@ -144,8 +154,11 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     case miopenBFloat16: use_bfp16 = 1; break;
     case miopenInt32: use_int32 = 1; break;
     case miopenDouble: use_fp64 = 1; break;
+    case miopenFloat8: use_fp8 = 1; break;
+    case miopenBFloat8: use_bfp8 = 1; break;
     default:
-        MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
+        MIOPEN_THROW(
+            "Only float, half, bfloat16, int8, int8x4, float8, bfloat8 data type is supported.");
         break;
     }

@@ -159,9 +172,15 @@
         {"MIOPEN_USE_BFP16", use_bfp16},
         {"MIOPEN_USE_INT32", use_int32},
         {"MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16},
+        {"MIOPEN_FP8_IEEE_EXPONENT_BIAS", MIOPEN_FP8_IEEE_EXPONENT_BIAS},
+        {"MIOPEN_FP8_CLIPPING", MIOPEN_FP8_CLIPPING},
     };
     if(use_fp64 != 0)
         kbp.Define("MIOPEN_USE_FP64", use_fp64);
+    if(use_fp8 != 0)
+        kbp.Define("MIOPEN_USE_FP8", use_fp8);
+    if(use_bfp8 != 0)
+        kbp.Define("MIOPEN_USE_BFP8",
use_bfp8); return kbp; } diff --git a/src/include/miopen/gemm_v2.hpp b/src/include/miopen/gemm_v2.hpp index f206771c89..9300ffa29b 100644 --- a/src/include/miopen/gemm_v2.hpp +++ b/src/include/miopen/gemm_v2.hpp @@ -27,6 +27,7 @@ #define GUARD_MIOPEN_GEMM_V2_HPP_ #include +#include #include namespace miopen { @@ -72,6 +73,50 @@ struct GemmDescriptor float alpha, beta; miopenDataType_t dataType; bool deterministic; + bool gfx90a_alt_impl; + miopenDataType_t a_cast_type; + miopenDataType_t b_cast_type; + ConvolutionAttribute conv_attributes; + GemmDescriptor() {} + GemmDescriptor(bool isColMajor_, + bool transA_, + bool transB_, + int m_, + int n_, + int k_, + int lda_, + int ldb_, + int ldc_, + int batch_count_, + long long int strideA_, + long long int strideB_, + long long int strideC_, + float alpha_, + float beta_, + miopenDataType_t dataType_, + bool deterministic_) + : isColMajor(isColMajor_), + transA(transA_), + transB(transB_), + m(m_), + n(n_), + k(k_), + lda(lda_), + ldb(ldb_), + ldc(ldc_), + batch_count(batch_count_), + strideA(strideA_), + strideB(strideB_), + strideC(strideC_), + alpha(alpha_), + beta(beta_), + dataType(dataType_), + deterministic(deterministic_), + gfx90a_alt_impl(false), + a_cast_type(dataType), + b_cast_type(dataType) + { + } friend std::ostream& operator<<(std::ostream& stream, const GemmDescriptor& gemm_desc); }; @@ -86,8 +131,7 @@ miopenStatus_t CallGemmTimeMeasure(const Handle& handle, int c_offset, bool time_precision, CallGemmType_t call_gemm_type, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); miopenStatus_t CallGemm(const Handle& handle, GemmDescriptor gemm_desc, @@ -97,8 +141,7 @@ miopenStatus_t CallGemm(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); miopenStatus_t CallGemmStridedBatched(const Handle& handle, GemmDescriptor gemm_desc, @@ -108,19 +151,18 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); - -miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, - GemmDescriptor gemm_desc, - ConstData_t A, - int a_offset, - ConstData_t B, - int b_offset, - Data_t C, - int c_offset, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); + +miopenStatus_t +CallGemmStridedBatchedSequential(const Handle& handle, + GemmDescriptor gemm_desc, + ConstData_t A, + int a_offset, + ConstData_t B, + int b_offset, + Data_t C, + int c_offset, + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); // GEMM parameters for Convolution (using Im2Col) Fwd // y = w * Im2Col(x) diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp index ee26d7985f..7d1bb79a37 100644 --- a/src/include/miopen/handle.hpp +++ b/src/include/miopen/handle.hpp @@ -51,6 +51,10 @@ #include #if MIOPEN_USE_ROCBLAS +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-macros" +#define ROCBLAS_BETA_FEATURES_API 1 +#pragma clang diagnostic pop #include #if MIOPEN_ROCBLAS_VERSION_FLAT < 2045000 #include diff --git a/src/include/miopen/hip_f8_impl.hpp b/src/include/miopen/hip_f8_impl.hpp new file mode 120000 index 0000000000..22052778a0 --- /dev/null +++ 
b/src/include/miopen/hip_f8_impl.hpp @@ -0,0 +1 @@ +../../kernels/hip_f8_impl.hpp \ No newline at end of file diff --git a/src/include/miopen/hip_float8.hpp b/src/include/miopen/hip_float8.hpp new file mode 120000 index 0000000000..5e16a70c91 --- /dev/null +++ b/src/include/miopen/hip_float8.hpp @@ -0,0 +1 @@ +../../kernels/hip_float8.hpp \ No newline at end of file diff --git a/src/include/miopen/problem_description.hpp b/src/include/miopen/problem_description.hpp index bc781d4b1e..b8e70db5ff 100644 --- a/src/include/miopen/problem_description.hpp +++ b/src/include/miopen/problem_description.hpp @@ -198,6 +198,7 @@ struct ProblemDescriptionCompatTemporary /* * set bot tensor */ + void setBotDescr(const std::string& layout, miopenDataType_t data_type, int batch, diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index e5ed014831..f05bbdf712 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -34,8 +34,10 @@ namespace solver { bool ConvDirectNaiveConvIsAssemblyKernel(const ExecutionContext&, const ProblemDescription&); std::string ConvDirectNaiveConvKernelName(const ProblemDescription&); -std::string ConvDirectNaiveConvKernelFile(); -std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx); +std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx, + const ProblemDescription& problem); +std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx, + const ProblemDescription& problem); bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext&, const ProblemDescription&); diff --git a/src/include/miopen/solver/problem_description_interpreter.hpp b/src/include/miopen/solver/problem_description_interpreter.hpp index 3e9e7fb3de..0690d3d36e 100644 --- a/src/include/miopen/solver/problem_description_interpreter.hpp +++ b/src/include/miopen/solver/problem_description_interpreter.hpp @@ -105,6 +105,14 @@ struct ProblemInterpreter return problem.GetOutWidth_(); } + static auto GetInputCastType(const ProblemDescription& problem) + { + if(problem.direction.IsForward()) + return problem.GetInCastType(); + else + return problem.GetOutCastType(); + } + static int GetOutputDepthDo(const ProblemDescription& problem) { if(problem.direction.IsForward()) @@ -129,6 +137,14 @@ struct ProblemInterpreter return problem.GetInWidth_(); } + static auto GetOutputCastType(const ProblemDescription& problem) + { + if(problem.direction.IsForward()) + return problem.GetOutCastType(); + else + return problem.GetInCastType(); + } + static auto GetOutputDataType(const ProblemDescription& problem) { return problem.direction.IsForward() ? 
problem.GetOutDataType() : problem.GetInDataType(); diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index e27622bc4a..b8d72df67c 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -41,6 +41,7 @@ #include #include #include +#include namespace miopen { @@ -101,7 +102,9 @@ inline std::size_t GetTypeSize(miopenDataType_t d) case miopenHalf: case miopenBFloat16: return 2; case miopenInt8x4: - case miopenInt8: return 1; + case miopenInt8: + case miopenFloat8: + case miopenBFloat8: return 1; case miopenDouble: return 8; } MIOPEN_THROW("Unknown data type"); @@ -185,6 +188,8 @@ struct TensorDescriptor : miopenTensorDescriptor std::string GetLayout_str() const; std::size_t GetVectorLength() const; + std::optional GetCastType() const; + void SetCastType(miopenDataType_t cast_type_); std::size_t GetElementSize() const; @@ -280,7 +285,8 @@ struct TensorDescriptor : miopenTensorDescriptor bool packed; std::size_t vector_length = 1; - miopenDataType_t type = miopenFloat; + miopenDataType_t type = miopenFloat; + std::optional cast_type; miopenTensorLayout_t tensorLayout = GetDefaultLayout(); }; diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp index 8170ce5478..d26afba9dd 100644 --- a/src/include/miopen/visit_float.hpp +++ b/src/include/miopen/visit_float.hpp @@ -77,6 +77,8 @@ void visit_float(miopenDataType_t t, F f) f(as_float{}); break; } + case miopenFloat8: + case miopenBFloat8: case miopenInt8x4: case miopenInt8: { f(as_float{}); diff --git a/src/kernels/MIOpenCheckNumerics.cpp b/src/kernels/MIOpenCheckNumerics.cpp new file mode 100644 index 0000000000..827f4d1397 --- /dev/null +++ b/src/kernels/MIOpenCheckNumerics.cpp @@ -0,0 +1,205 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include + +// Copied over from naive_conv.cpp +#ifdef __HIPCC_RTC__ +#ifdef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE +/// Definitions from , conflict with +/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h. 
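The check-numerics kernels defined below fold per-thread sum/absSum/min/max statistics into a CheckNumericsResult and raise its hasZero/hasNan/hasInf flags. A minimal host-side sketch of driving the fp8 variant (illustrative only, not part of the patch; the helper name, launch geometry, and device allocations are assumptions):

    // Hypothetical helper: devBuf and devRes are device allocations owned by the caller.
    bool HasAbnormalFp8(const void* devBuf, size_t elements, CheckNumericsResult* devRes)
    {
        hipMemset(devRes, 0, sizeof(CheckNumericsResult));
        // 256 threads per block matches the kernel's fixed shared-memory reduction size.
        hipLaunchKernelGGL(check_numerics_fp8, dim3(1024), dim3(256), 0, 0,
                           devBuf, elements, devRes, /*computeStats=*/false);
        CheckNumericsResult res{};
        hipMemcpy(&res, devRes, sizeof(res), hipMemcpyDeviceToHost);
        return res.hasNan != 0 || res.hasInf != 0;
    }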
+ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef signed short int16_t; +typedef unsigned short uint16_t; +typedef float float_t; + +// std::conditional requires type_traits which has a few other things +// which result in collision with amd_hip_vector_types.h + +namespace std { +template +struct conditional; + +template +struct conditional +{ + using type = X; +}; + +template +struct conditional +{ + using type = Y; +}; + +template +using conditional_t = typename conditional::type; +} // namespace std +#else +#include // int8_t, int16_t +#include // float_t +#endif +#endif // __HIPCC_RTC__ + +#include // std::numeric_limits + +#define MIOPEN_ENABLE_F8_DEVICE_CODE 1 +#include "hip_float8.hpp" + +struct Numerics +{ + float sum; + float absSum; + float min; + float max; +}; + +struct CheckNumericsResult +{ + Numerics n; + + int hasZero; + int hasNan; + int hasInf; +}; + +__device__ void thread_redux(Numerics* stats, size_t wid) +{ + const auto lid = threadIdx.x; + if(lid < wid) + { + stats[lid].sum += stats[lid + wid].sum; + stats[lid].absSum += stats[lid + wid].absSum; + stats[lid].min = fmin(stats[lid].min, stats[lid + wid].min); + stats[lid].max = fmax(stats[lid].max, stats[lid + wid].max); + } +} + +template +__device__ void +check_numerics(const T* C_d, size_t sz, CheckNumericsResult* abnormal, bool computeStats) +{ + __shared__ Numerics stats[256]; + U sum = 0; + U absSum = 0; + T minV = std::numeric_limits::max(); + T maxV = std::numeric_limits::min(); + + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for(size_t i = offset; i < sz; i += stride) + { + T val = C_d[i]; + sum += static_cast(val); + const auto abs_val = fabs(static_cast(val)); + absSum += abs_val; + minV = min(minV, val); + maxV = max(maxV, val); + if(abs_val <= static_cast(0.0f)) + abnormal->hasZero = 1; + if(isnan(static_cast(val))) + abnormal->hasNan = 1; + if(isinf(static_cast(val))) + abnormal->hasInf = 1; + } + if(computeStats) + { + stats[threadIdx.x].sum = static_cast(sum); + stats[threadIdx.x].absSum = static_cast(absSum); + stats[threadIdx.x].min = static_cast(minV); + stats[threadIdx.x].max = static_cast(maxV); + __syncthreads(); + for(int idx = 128; idx > 0; idx = idx >> 1) + { + thread_redux(stats, idx); + __syncthreads(); + } + if(threadIdx.x == 0) + { + atomicAdd(&abnormal->n.sum, stats[0].sum); + atomicAdd(&abnormal->n.absSum, stats[0].absSum); + atomicMin(&abnormal->n.min, stats[0].min); + atomicMax(&abnormal->n.max, stats[0].max); + } + } +} + +extern "C" __global__ void check_numerics_fp32(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics(reinterpret_cast(C_d), sz, abnormal, computeStats); +} + +extern "C" __global__ void check_numerics_fp16(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics<_Float16, float>( + reinterpret_cast(C_d), sz, abnormal, computeStats); +} + +extern "C" __global__ void check_numerics_bf16(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics( + reinterpret_cast(C_d), sz, abnormal, computeStats); +} + +extern "C" __global__ void check_numerics_fp8(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics, float>( + reinterpret_cast*>(C_d), + sz, + abnormal, + computeStats); +} + +extern "C" __global__ 
void check_numerics_bf8(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics, float>( + reinterpret_cast*>(C_d), + sz, + abnormal, + computeStats); +} diff --git a/src/kernels/MIOpenIm2d2Col.cl b/src/kernels/MIOpenIm2d2Col.cl index c64f14dad5..7b1522db6f 100644 --- a/src/kernels/MIOpenIm2d2Col.cl +++ b/src/kernels/MIOpenIm2d2Col.cl @@ -48,7 +48,15 @@ #define MIOPEN_USE_INT32 0 #endif -#if MIOPEN_USE_INT8 +#ifndef MIOPEN_USE_FP8 +#define MIOPEN_USE_FP8 0 +#endif + +#ifndef MIOPEN_USE_BFP8 +#define MIOPEN_USE_BFP8 0 +#endif + +#if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; #elif MIOPEN_USE_INT8x4 typedef uint data_t; diff --git a/src/kernels/MIOpenUtilKernels4.cl b/src/kernels/MIOpenUtilKernels4.cl index d7e0d3f789..861e563012 100644 --- a/src/kernels/MIOpenUtilKernels4.cl +++ b/src/kernels/MIOpenUtilKernels4.cl @@ -48,7 +48,15 @@ #define MIOPEN_USE_INT32 0 #endif -#if MIOPEN_USE_INT8 +#ifndef MIOPEN_USE_FP8 +#define MIOPEN_USE_FP8 0 +#endif + +#ifndef MIOPEN_USE_BFP8 +#define MIOPEN_USE_BFP8 0 +#endif + +#if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; #elif MIOPEN_USE_INT8x4 typedef uint data_t; diff --git a/src/kernels/bfloat16_dev.hpp b/src/kernels/bfloat16_dev.hpp index 84346b5d36..c1a77c90db 100644 --- a/src/kernels/bfloat16_dev.hpp +++ b/src/kernels/bfloat16_dev.hpp @@ -118,6 +118,185 @@ EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val) #endif // MIOPEN_BACKEND_HIP } +#ifndef MIOPEN_USE_FP8 +#define MIOPEN_USE_FP8 0 +#endif + +#ifndef MIOPEN_USE_BFP8 +#define MIOPEN_USE_BFP8 0 +#endif + +#if MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 +// TODO: Convert the Col2Im kernels from OpenCL to HIP and remove the following +// functions which are rewrites of the f8 header impl functions +EXECUTION_SPECIFIER float fp8_to_float_impl(uchar x, const int wm, const int we) +{ + bool negative_zero_nan = MIOPEN_FP8_IEEE_EXPONENT_BIAS ? false : true; + + const int weo = 8; + const int wmo = 23; + + float fInf, fNegInf, fNaN, fNeg0; + const uint ifInf = 0x7F800000; + const uint ifNegInf = 0xFF800000; + const uint ifNaN = 0x7F800001; + const uint ifNeg0 = 0x80000000; + fInf = *((const float*)(&ifInf)); + fNegInf = *((const float*)(&ifNegInf)); + fNaN = *((const float*)(&ifNaN)); + fNeg0 = *((const float*)(&ifNeg0)); + + if(x == 0) + return (float)(0); + + uint sign = x >> 7; + uint mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if(negative_zero_nan) + { + if(x == 0x80) + return fNaN; + } + else + { + if(x == 0x80) + return fNeg0; + if(exponent == ((1 << we) - 1)) + return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + uint retval; + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0);
+
+    // subnormal input
+    if(exponent == 0)
+    {
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        // TODO: verify __builtin_clz and OpenCL's clz do the same thing
+        int sh = 1 + clz(mantissa) - (32 - wm);
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        /*
+        exponent++;
+        while(mantissa<(1<<wm))
+        {
+            mantissa <<= 1;
+            exponent--;
+        }
+        */
+        mantissa &= ((1 << wm) - 1);
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+
+    // subnormal output
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    retval = (sign << 31) | (exponent << 23) | mantissa;
+    return *((const float*)(&retval));
+}
+
+EXECUTION_SPECIFIER float fp8_to_float(uchar x) { return fp8_to_float_impl(x, 3, 4); }
+
+EXECUTION_SPECIFIER float bfp8_to_float(uchar x) { return fp8_to_float_impl(x, 2, 5); }
+
+inline uchar float_to_fp8_impl(float _x, const int wm, const int we) // bool stoch, uint rng)
+{
+    bool negative_zero_nan = MIOPEN_FP8_IEEE_EXPONENT_BIAS ? false : true;
+    bool clip = MIOPEN_FP8_CLIPPING;
+
+    // Conserve the logic for stochastic rounding:
+    bool stoch = false;
+    uint rng = 0;
+    const int mfmt = 23;
+    uint x;
+    x = *((uint*)(&_x));
+
+    uint head, mantissa;
+    int exponent;
+    uint sign;
+
+    head = x & 0xFF800000;
+    mantissa = x & 0x7FFFFF;
+    exponent = (head >> 23) & 0xFF;
+    sign = head >> 31;
+
+    uint signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    if(negative_zero_nan)
+    {
+        if((x & 0x7F800000) == 0x7F800000)
+            return 0x80;
+    }
+    else
+    {
+        if((x & 0x7F800000) == 0x7F800000)
+            return signed_inf + (mantissa != 0 ? 1 : 0);
+    }
+    if(x == 0)
+        return 0;
+
+    uint drop_mask = (1 << (mfmt - wm)) - 1;
+    const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
+    const int exp_low_cutoff = (128) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+
+    exponent -= exp_low_cutoff - 1;
+    if(exponent <= 0)
+        drop_mask = (1 << (mfmt - wm + 1 - exponent)) - 1;
+    mantissa += 1 << mfmt;
+    mantissa += (stoch ? rng : mantissa) & drop_mask;
+    if(mantissa >= (2 << mfmt))
+    {
+        mantissa >>= 1;
+        exponent++;
+    }
+    mantissa >>= (mfmt - wm);
+
+    if(exponent <= 0)
+    {
+        if(x == 0) // cppcheck-suppress identicalConditionAfterEarlyExit
+            return 0;
+        else
+        {
+            // subnormal range; represented by a subnormal float8 (exponent 0)
+            // and involves loss of accuracy
+            mantissa >>= 1 - exponent;
+            exponent = 0;
+        }
+    }
+    // above range: quantize to maximum possible float of the same sign
+    else if(exponent > max_exp)
+    {
+        if(clip)
+        {
+            mantissa = (1 << wm) - 1;
+            exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+    if(exponent == 0 && mantissa == 0)
+        return negative_zero_nan ?
0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (exponent << wm) | mantissa; +} + +EXECUTION_SPECIFIER uchar float_to_fp8(float _x) // bool stoch, uint rng) +{ + return float_to_fp8_impl(_x, 3, 4); +} + +EXECUTION_SPECIFIER uchar float_to_bfp8(float _x) // bool stoch, uint rng) +{ + return float_to_fp8_impl(_x, 2, 5); +} +#endif // MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 + #ifdef __cplusplus } #endif diff --git a/src/kernels/float_types.h b/src/kernels/float_types.h index a4c3b654ca..897e95ca67 100644 --- a/src/kernels/float_types.h +++ b/src/kernels/float_types.h @@ -33,6 +33,51 @@ #define TWO 2 #define FOUR 4 #define EIGHT 8 +#if MIOPEN_USE_FP8 == 1 +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT hip_f8 +#define FLOAT_ACCUM float +// HIP implements the correct operators for conversion + +#else +#define _FLOAT uchar +#define _FLOAT_ACCUM float +// OpenCL requires explicit functions +#define CVT_FLOAT2ACCUM(x) fp8_to_float(x) +#define CVT_ACCUM2FLOAT(x) float_to_fp8(x) +#endif +#define SIZEOF_FLOAT 1 +// Max value for the main datatype +#define MAX_VAL 0x7F +// Max value for accumulator +// #ifndef FLT_MAX +// #define MAX_VAL_ACCUM 3.402823466e+38F +// #else +// #define MAX_VAL_ACCUM FLT_MAX +// #endif +#endif // MIOPEN_USE_FP8 + +#if MIOPEN_USE_BFP8 == 1 +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT hip_f8 +#define FLOAT_ACCUM float +#else +#define _FLOAT uchar +#define _FLOAT_ACCUM float +// OpenCL requires explicit functions +#define CVT_FLOAT2ACCUM(x) bfp8_to_float(x) +#define CVT_ACCUM2FLOAT(x) float_to_bfp8(x) +#endif +#define SIZEOF_FLOAT 1 +// Max value for the main datatype +#define MAX_VAL 0x7F +// Max value for accumulator +// #ifndef FLT_MAX +// #define MAX_VAL_ACCUM 3.402823466e+38F +// #else +// #define MAX_VAL_ACCUM FLT_MAX +// #endif +#endif // MIOPEN_USE_BFP8 #ifndef __HIP_PLATFORM_HCC__ #define _FLOAT2 PPCAT(_FLOAT, TWO) diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h new file mode 100644 index 0000000000..3bac0a31f7 --- /dev/null +++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h @@ -0,0 +1,63 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#define CAT_I(a, b) a##b +#define CAT(a, b) CAT_I(a, b) + +#ifndef INPUT_TYPE +#define INPUT_TYPE half +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE half +#endif + +#ifndef WEIGHTS_TYPE +#define WEIGHTS_TYPE half +#endif + +#ifndef INPUT_CAST_TYPE +#define INPUT_CAST_TYPE float8 +#endif + +#ifndef WEIGHTS_CAST_TYPE +#define WEIGHTS_CAST_TYPE float8 +#endif + +#ifndef OUTPUT_CAST_TYPE +#define OUTPUT_CAST_TYPE float8 +#endif + +#ifndef ACCUMULATOR_TYPE +#define ACCUMULATOR_TYPE double +#endif + +#define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE)) + +#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX) +#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX) +#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX) diff --git a/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp new file mode 100644 index 0000000000..e6b2945beb --- /dev/null +++ b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp @@ -0,0 +1,571 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#include +#include +#endif + +// Copied over from naive_conv.cpp +#ifdef __HIPCC_RTC__ +#ifdef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE +/// Definitions from , conflict with +/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h. 
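For orientation, a worked expansion of the fp8_kern_types.h macros above (the -D values are illustrative; the build system, not this header, chooses them): compiling with -DINPUT_TYPE=float8 -DWEIGHTS_TYPE=float8 -DOUTPUT_TYPE=float8 yields KERNEL_NAME_SUFFIX = float8_float8_float8, so the exported entry points become:

    FWD_KERNEL_NAME -> naive_conv_fwd_nchw_float8_float8_float8
    BWD_KERNEL_NAME -> naive_conv_bwd_nchw_float8_float8_float8
    WRW_KERNEL_NAME -> naive_conv_wrw_nchw_float8_float8_float8

The two-level CAT/CAT_I pair forces the arguments to be macro-expanded before token pasting; a single-level ## would paste the literal tokens INPUT_TYPE and OUTPUT_TYPE instead.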
+
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef unsigned short uint16_t;
+typedef float float_t;
+
+// std::conditional requires type_traits which has a few other things
+// which result in collision with amd_hip_vector_types.h
+
+namespace std {
+template
+struct conditional;
+
+template
+struct conditional
+{
+    using type = X;
+};
+
+template
+struct conditional
+{
+    using type = Y;
+};
+
+template
+using conditional_t = typename conditional::type;
+} // namespace std
+#else
+#include // int8_t, int16_t
+#include // float_t
+#endif
+#endif // __HIPCC_RTC__
+
+#include // std::numeric_limits
+
+#define MIOPEN_ENABLE_F8_DEVICE_CODE 1
+#include "hip_float8.hpp"
+
+#include "fp8_kern_types.h"
+
+using float8 = miopen_f8::hip_f8;
+using bfloat8 = miopen_f8::hip_f8;
+
+template
+inline __device__ uint32_t draft_rng(T x, uint32_t seed)
+{
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    typedef typename std::conditional::type IT;
+    IT tmp = *(reinterpret_cast(&x));
+    uint32_t drop_bits = uint32_t(tmp) & 0xFFFFu;
+    if(sizeof(tmp) == 4)
+        drop_bits ^= tmp >> 16;
+    drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
+    drop_bits *= 0x7000149;
+    uint32_t rng = (drop_bits ^ 0x13371337 ^ (i * 229791) ^ seed);
+    return rng;
+}
+
+template
+inline __device__ TO cast_number(const TI input, miopen_f8::hip_f8_rounding_mode mode, uint32_t rng)
+{
+    if(std::is_same::value)
+    {
+        return input;
+    }
+    if(sizeof(TI) == sizeof(TO))
+    {
+        const auto tmp = static_cast(input);
+        return TO{tmp, mode, rng};
+    }
+    else if(sizeof(TO) > sizeof(TI))
+    {
+        return static_cast(input);
+    }
+    else
+    {
+        return TO{input, mode, rng};
+    }
+}
+
+template
+inline __device__ void naive_conv_fwd_nchw(const TI* __restrict__ p_in,
+                                           const TW* __restrict__ p_wei,
+                                           TO* __restrict__ p_out,
+                                           const int hi,
+                                           const int wi,
+                                           const int n,
+                                           const int k_per_group,
+                                           const int c_per_group,
+                                           const int ho,
+                                           const int wo,
+                                           const int sy,
+                                           const int sx,
+                                           const int dy,
+                                           const int dx,
+                                           const int py,
+                                           const int px,
+                                           const int fy,
+                                           const int fx,
+                                           const int group,
+                                           bool stoch,
+                                           uint32_t seed)
+{
+    /*
+     * need to compute total output pixel: `group * n * k_per_group * ho * wo`.
+     * to distribute this workload, let one workgroup compute `ho * wo` pixel,
+     * hence need `group * n * k_per_group` workgroups (grid_size).
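     * For a concrete sense of scale (illustrative numbers, not taken from the
     * patch): with group=4, n=2, k_per_group=8 and a 28x28 output, the launch
     * needs 4*2*8 = 64 workgroups, and the 256 threads of each workgroup stride
     * over the 28*28 = 784 output pixels of its (group, batch, k) slice.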
+ */ + int k = k_per_group * group; + int c = c_per_group * group; + int thread_length = ho * wo; + int bid = blockIdx.x; + int ik = bid % k_per_group; + int in = (bid / k_per_group) % n; + int ig = bid / (n * k_per_group); + + p_in += static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; + p_out += static_cast(in) * k * ho * wo + + static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + + for(int tid = threadIdx.x; tid < thread_length; tid += 256) + { + int iho = tid / wo; + int iwo = tid % wo; + + TACC value = .0f; + + for(int ic = 0; ic < c_per_group; ic++) + { + for(int iy = 0; iy < fy; iy++) + { + int valid_h = 1; + int cur_h = sy * iho - py + dy * iy; + if(cur_h < 0 || cur_h >= hi) + valid_h &= 0; + for(int ix = 0; ix < fx; ix++) + { + int valid_w = 1; + int cur_w = sx * iwo - px + dx * ix; + if(cur_w < 0 || cur_w >= wi) + valid_w &= 0; + + if(valid_w & valid_h) + { + size_t i_idx = static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + static_cast(cur_w); + size_t f_idx = static_cast(ic) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + uint32_t rng1 = 0; + uint32_t rng2 = 0; + auto rnd_mode = miopen_f8::hip_f8_rounding_mode::standard; + if(stoch) + { + rng1 = draft_rng(p_in[i_idx], seed); + rng2 = draft_rng(p_in[f_idx], seed); + rnd_mode = miopen_f8::hip_f8_rounding_mode::stochastic; + } + const auto item_in = in_cast_type(p_in[i_idx], rnd_mode, rng1); + const auto item_wei = wei_cast_type(p_wei[f_idx], rnd_mode, rng2); + value += static_cast(item_in) * static_cast(item_wei); + } + } + } + } + size_t o_idx = static_cast(iho) * wo + static_cast(iwo); + // p_out[o_idx] = __float2half(static_cast(value)); + p_out[o_idx] = static_cast(value); + } +} + +extern "C" __global__ void FWD_KERNEL_NAME(const INPUT_TYPE* __restrict__ p_in, + + const WEIGHTS_TYPE* __restrict__ p_wei, + OUTPUT_TYPE* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stochastic, + uint32_t seed) +{ + // instantiate the kernel + naive_conv_fwd_nchw(p_in, + p_wei, + p_out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + stochastic, + seed); +} + +template +inline __device__ void naive_conv_bwd_nchw(TI* __restrict__ p_in, + const TW* __restrict__ p_wei, + const TO* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stoch, + uint32_t seed) +{ + /* + * need to compute total input pixel: `group * n * c_per_group * hi * wi`. + * to distribute this workload, let one workgroup compute `hi * wi` pixel, + * hence need `group * n * c_per_group` workgroups (grid_size). 
+ */ + int k = k_per_group * group; + int c = c_per_group * group; + int thread_length = hi * wi; + int bid = blockIdx.x; + int ic = bid % c_per_group; + int in = (bid / c_per_group) % n; + int ig = bid / (n * c_per_group); + + p_in += static_cast(in) * c * hi * wi + + static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ic) * fy * fx; + p_out += + static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + + for(int tid = threadIdx.x; tid < thread_length; tid += 256) + { + int ihi = tid / wi; + int iwi = tid % wi; + + TACC value = .0f; + + for(int ik = 0; ik < k_per_group; ik++) + { + for(int iy = 0; iy < fy; iy++) + { + int valid_h = 1; + int cur_ho = ihi + py - dy * iy; // cur_h = sy*iho-py+dy*iy; + if(cur_ho < 0 || cur_ho % sy) + valid_h &= 0; + cur_ho /= sy; + if(cur_ho >= ho) + valid_h &= 0; + for(int ix = 0; ix < fx; ix++) + { + int valid_w = 1; + int cur_wo = iwi + px - dx * ix; // cur_w = sx*iwo-px+dx*ix; + if(cur_wo < 0 || cur_wo % sx) + valid_w &= 0; + cur_wo /= sx; + if(cur_wo >= wo) + valid_w &= 0; + + if(valid_h & valid_w) + { + size_t o_idx = static_cast(ik) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + size_t f_idx = static_cast(ik) * c_per_group * fy * fx + + static_cast(iy) * fx + static_cast(ix); + uint32_t rng1 = 0; + uint32_t rng2 = 0; + auto rnd_mode = miopen_f8::hip_f8_rounding_mode::standard; + if(stoch) + { + rng1 = draft_rng(p_out[o_idx], seed); + rng2 = draft_rng(p_wei[f_idx], seed); + rnd_mode = miopen_f8::hip_f8_rounding_mode::stochastic; + } + const auto item_out = out_cast_type(p_out[o_idx], rnd_mode, rng1); + const auto item_wei = wei_cast_type(p_wei[f_idx], rnd_mode, rng2); + value += static_cast(item_out) * static_cast(item_wei); + } + } + } + } + size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); + p_in[i_idx] = static_cast(value); + } +} + +extern "C" __global__ void BWD_KERNEL_NAME(INPUT_TYPE* __restrict__ p_in, + const WEIGHTS_TYPE* __restrict__ p_wei, + const OUTPUT_TYPE* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stochastic, + uint32_t seed) +{ + // instantiate template + naive_conv_bwd_nchw(p_in, + p_wei, + p_out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + stochastic, + seed); +} + +template +inline __device__ void naive_conv_wrw_nchw(const TI* __restrict__ p_in, + TW* __restrict__ p_wei, + const TO* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stoch, + uint32_t seed) +{ + /* + * need to compute total filter pixel: `group * k_per_group * c_per_group * + * fy * fx`. + * to distribute this workload, let one workgroup compute `c_per_group * fy + * * fx` pixel, + * hence need `group * k_per_group` workgroups (grid_size). 
+ */ + int k = k_per_group * group; + int c = c_per_group * group; + int thread_length = c_per_group * fy * fx; + int bid = blockIdx.x; + int ik = bid % k_per_group; + int ig = bid / k_per_group; + + p_in += static_cast(ig) * c_per_group * hi * wi; + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; + p_out += static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + + for(int tid = threadIdx.x; tid < thread_length; tid += 256) + { + int ix = tid % fx; + int iy = (tid / fx) % fy; + int ic = tid / (fx * fy); + + TACC value = .0f; + + for(int in = 0; in < n; in++) + { + for(int iho = 0; iho < ho; iho++) + { + int valid_h = 1; + int cur_h = sy * iho - py + dy * iy; + if(cur_h < 0 || cur_h >= hi) + valid_h &= 0; + for(int iwo = 0; iwo < wo; iwo++) + { + int valid_w = 1; + int cur_w = sx * iwo - px + dx * ix; + if(cur_w < 0 || cur_w >= wi) + valid_w &= 0; + + if(valid_h & valid_w) + { + size_t i_idx = static_cast(in) * c * hi * wi + + static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + static_cast(cur_w); + size_t o_idx = static_cast(in) * k * ho * wo + + static_cast(iho) * wo + static_cast(iwo); + uint32_t rng1 = 0; + uint32_t rng2 = 0; + auto rnd_mode = miopen_f8::hip_f8_rounding_mode::standard; + if(stoch) + { + rng1 = draft_rng(p_in[i_idx], seed); + rng2 = draft_rng(p_out[o_idx], seed); + rnd_mode = miopen_f8::hip_f8_rounding_mode::stochastic; + } + const auto item_in = in_cast_type(p_in[i_idx], rnd_mode, rng1); + const auto item_out = out_cast_type(p_out[o_idx], rnd_mode, rng2); + value += static_cast(item_in) * static_cast(item_out); + } + } + } + } + size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + p_wei[f_idx] = static_cast(value); + } +} + +extern "C" __global__ void WRW_KERNEL_NAME(const INPUT_TYPE* __restrict__ p_in, + WEIGHTS_TYPE* __restrict__ p_wei, + const OUTPUT_TYPE* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stochastic, + uint32_t seed) +{ + naive_conv_wrw_nchw(p_in, + p_wei, + p_out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + stochastic, + seed); +} diff --git a/src/kernels/hip_f8_impl.hpp b/src/kernels/hip_f8_impl.hpp new file mode 100644 index 0000000000..c7a62f9f72 --- /dev/null +++ b/src/kernels/hip_f8_impl.hpp @@ -0,0 +1,361 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+// #include
+// #include
+namespace miopen_hip_f8_impl {
+
+#ifndef __HIP_PLATFORM_HCC__
+using hip_bfloat16 = bfloat16;
+using half = half_float::half;
+#endif
+
+template
+MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8_no_range_reduce(T _x,
+                                                          bool stoch = false,
+                                                          uint32_t rng = 0)
+{
+    static_assert(we == 5, "we==5");
+    static_assert(sizeof(T) == 2, "no_range_reduce only works for float16");
+
+    uint32_t x = *(reinterpret_cast(&_x));
+
+    uint32_t head, mantissa, exponent;
+    uint32_t sign;
+
+    const int mfmt = 10;
+    head = x & 0xFC00;
+    mantissa = x & 0x3FF;
+    exponent = (head >> 10) & 0x1F;
+    sign = head >> 15;
+    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    if((x & 0x7FFF) == 0x7C00)
+        return signed_inf;
+    if((x & 0x7C00) == 0x7C00)
+        return signed_inf + 1;
+    if(x == 0)
+        return 0;
+    if(x == 0x8000)
+        return 0x80;
+
+    // uint32_t nextbit = 1<<(mfmt-wm-1);
+    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+
+    // const int max_exp = (1<<we) - 1;
+    mantissa += (stoch ? rng : mantissa) & drop_mask;
+    if(mantissa >= (2 << mfmt))
+    {
+        mantissa >>= 1;
+        exponent++;
+    }
+    else if(mantissa >= (1 << mfmt) && exponent == 0)
+    {
+        exponent++;
+    }
+    mantissa >>= (mfmt - wm);
+    mantissa &= (1 << wm) - 1;
+    if(exponent == 31)
+        return (sign << 7) | 0x7B;
+    return (sign << 7) | (exponent << wm) | mantissa;
+}
+
+template
+MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch, uint32_t rng)
+{
+    constexpr bool is_half  = std::is_same::value;
+    constexpr bool is_float = std::is_same::value;
+    static_assert(wm + we == 7, "wm+we==7");
+    static_assert(is_half || is_float, "Only half and float can be cast to f8");
+
+    if(sizeof(T) == 2 && we == 5 && !negative_zero_nan)
+        return cast_to_f8_no_range_reduce<2, 5, half>(static_cast(_x), stoch, rng);
+
+    const int mfmt = (sizeof(T) == 4) ? 23 : 10;
+    uint32_t x;
+    if(sizeof(T) == 4)
+        x = *(reinterpret_cast(&_x)); // cppcheck-suppress invalidPointerCast
+    else
+        x = *(reinterpret_cast(&_x)); // cppcheck-suppress invalidPointerCast
+
+    uint32_t head, mantissa;
+    int exponent, bias;
+    uint32_t sign;
+
+    if(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+    }
+
+    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    if(negative_zero_nan)
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return 0x80;
+        }
+        else
+        {
+            // if(__hisinf(x) || __hisnan(x))
+            if((x & 0x7C00) == 0x7C00)
+                return 0x80;
+        }
+    }
+    else
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+        else
+        {
+            if((x & 0x7C00) == 0x7C00)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+    }
+    if(x == 0)
+        return 0;
+    // First need to check if it is normal or denorm as there is a difference of implicit 1.
+    // Then need to adjust the exponent to align with the F8 exponent and, in the meanwhile,
+    // shift the mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And
+    // for RNE, no need to add rng. Then probably need to check whether there is carry and
+    // adjust exponent and mantissa again.
+
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
+    const int f8_bias                = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here; we are mostly
+           concerned with fp16. In this case, f8 is usually in denormal. But there could be
+           exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has exponent
+           bias 16. It means that there are some numbers in fp16 denormal but they are bf8
+           (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers where
+           exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO)
+           normal. In this case, the fp16 mantissa should be shifted left by 1 */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
+               For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+               actual exponent is -7, it is actually larger due to the implicit 1.
+               Therefore it needs to be adjusted to -6 and the mantissa shifted right by 1.
+               So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        { // both fp32/fp16 and f8 are in normal range
+            exponent_diff =
+                0; // exponent_diff=0 does not mean there is no difference for this case,
+                   // act_exponent could be larger. Just that it does not need to shift mantissa
+        }
+        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
+    }
+    const long tmp = (mfmt - wm + exponent_diff);
+    if(tmp == 33)
+        printf("Gotcha");
+
+    bool midpoint = (mantissa & ((static_cast(1) << (mfmt - wm + exponent_diff)) - 1)) ==
+                    (static_cast(1) << (mfmt - wm + exponent_diff - 1));
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before
+       we shift right, as shifting right could rip off some residual part and make something
+       that is not midpoint look like midpoint. For example, the fp16 number 0x1002
+       (0 00100 0000000010) is larger than midpoint, but after shift right by 4 bits, it would
+       look like midpoint.
+    */
+
+    if(exponent_diff > 0)
+        mantissa >>= exponent_diff;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1 << mfmt);
+    // if there is no implicit 1, it means the f8 is denormal and we need to adjust to the
+    // denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+
+    // Now we have the exponent and mantissa adjusted
+    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1 << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa += (stoch ? rng : (midpoint ? (odd ?
mantissa : mantissa - 1) : mantissa)) & drop_mask; + + // Now we deal with overflow + if(f8_exponent == 0) + { + if((1 << mfmt) & mantissa) + { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } + else + { + if((1 << (mfmt + 1)) & mantissa) + { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); + if(f8_exponent > max_exp) + { + if(clip) + { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } + else + { + return signed_inf; + } + } + + if(f8_exponent == 0 && mantissa == 0) + return negative_zero_nan ? 0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +template +MIOPEN_HIP_HOST_DEVICE T cast_from_f8(uint8_t x) +{ + constexpr bool is_half = std::is_same::value; + constexpr bool is_float = std::is_same::value; + static_assert(is_half || is_float, "only half and float are supported"); + + constexpr int weo = is_half ? 5 : 8; + constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7); + + T fInf, fNegInf, fNaN, fNeg0; + if(is_half) + { + const uint16_t ihInf = 0x7C00; + const uint16_t ihNegInf = 0xFC00; + const uint16_t ihNaN = 0x7C01; + const uint16_t ihNeg0 = 0x8000; + fInf = *(reinterpret_cast(&ihInf)); + fNegInf = *(reinterpret_cast(&ihNegInf)); + fNaN = *(reinterpret_cast(&ihNaN)); + fNeg0 = *(reinterpret_cast(&ihNeg0)); + } + else if(is_float) + { + const uint32_t ifInf = 0x7F800000; + const uint32_t ifNegInf = 0xFF800000; + const uint32_t ifNaN = 0x7F800001; + const uint32_t ifNeg0 = 0x80000000; + fInf = *(reinterpret_cast(&ifInf)); // cppcheck-suppress invalidPointerCast + fNegInf = + *(reinterpret_cast(&ifNegInf)); // cppcheck-suppress invalidPointerCast + fNaN = *(reinterpret_cast(&ifNaN)); // cppcheck-suppress invalidPointerCast + fNeg0 = *(reinterpret_cast(&ifNeg0)); // cppcheck-suppress invalidPointerCast + } + + if(x == 0) + return static_cast(0); + + uint32_t sign = x >> 7; + uint32_t mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if(negative_zero_nan) + { + if(x == 0x80) + return fNaN; + } + else + { + if(x == 0x80) + return fNeg0; + if(exponent == ((1 << we) - 1)) + return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + typename std::conditional::type retval; + if(we == 5 && is_half && !negative_zero_nan) + { + retval = x << 8; + return *(reinterpret_cast(&retval)); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0); + + // subnormal input + if(exponent == 0) + { + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + __builtin_clz(mantissa) - (32 - wm); + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1 << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if(exponent <= 0) + { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if(sizeof(T) == 2) + retval = (sign << 15) | (exponent << 10) | mantissa; + else + retval = (sign << 31) | (exponent << 23) | mantissa; + return *(reinterpret_cast(&retval)); +} + +} // namespace miopen_hip_f8_impl diff --git a/src/kernels/hip_float8.hpp b/src/kernels/hip_float8.hpp new file mode 100644 index 0000000000..dd57c9ca5b --- /dev/null +++ b/src/kernels/hip_float8.hpp @@ -0,0 +1,651 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once +#ifndef MIOPEN_ENABLE_F8_DEVICE_CODE +#define MIOPEN_ENABLE_F8_DEVICE_CODE 0 +#endif + +// FP8 header version 0.4, 2021/05/11 +#if defined __HIP_PLATFORM_HCC__ && MIOPEN_ENABLE_F8_DEVICE_CODE +// MIOpen by default does not have device code in the regular compilation paths, +// therefore, when this file is used from the host side, compilation takes much +// longer. By guarding the __device__ directive we can control that such compilation +// only happens for kernels which include this file. 
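Before the implementation details, an illustrative host-side sketch of the type this header defines further below (not part of the patch; the helper name and rng source are assumptions): quantize a float to fp8 with stochastic rounding, then widen it back.

    #include "hip_float8.hpp" // MIOPEN_ENABLE_F8_DEVICE_CODE stays 0 in a host-only TU

    float quantize_roundtrip_fp8(float x, uint32_t rng)
    {
        using fp8 = miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>;
        const fp8 q(x, miopen_f8::hip_f8_rounding_mode::stochastic, rng);
        return static_cast<float>(q); // widens back through operator float()
    }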
+#define MIOPEN_HIP_HOST_DEVICE __host__ __device__ +#else +#define MIOPEN_HIP_HOST_DEVICE +#endif + +#define USE_SIMPLER_HIP_F8x8 0 + +#ifndef MIOPEN_FP8_CLIPPING +#define MIOPEN_FP8_CLIPPING 1 +#endif + +#ifndef MIOPEN_FP8_IEEE_EXPONENT_BIAS +#define MIOPEN_FP8_IEEE_EXPONENT_BIAS 1 +#endif + +namespace miopen_hip_f8_impl { + +template +MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch = false, uint32_t rng = 0); + +template +MIOPEN_HIP_HOST_DEVICE T cast_from_f8(uint8_t x); + +} // namespace miopen_hip_f8_impl + +#include "hip_f8_impl.hpp" + +namespace miopen_f8 { +enum class hip_f8_type +{ + bf8 = 0, // 1:5:2 + fp8 = 1 // 1:4:3 +}; + +enum class hip_f8_rounding_mode +{ + standard, + stochastic +}; + +inline MIOPEN_HIP_HOST_DEVICE bool get_hip_f8_bias_mode() +{ +#if MIOPEN_FP8_IEEE_EXPONENT_BIAS + return false; +#else + return true; +#endif +} + +template +struct hip_f8 +{ + uint8_t data; + + // default constructor + MIOPEN_HIP_HOST_DEVICE hip_f8() = default; + + MIOPEN_HIP_HOST_DEVICE hip_f8(hip_f8 const&) = default; + + // constructor from bits + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(uint8_t v) { data = v; } + + // constructor from in + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(int v) : hip_f8(static_cast(v)) {} + + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(double v) : hip_f8(static_cast(v)) {} + + // constructor from float + explicit MIOPEN_HIP_HOST_DEVICE + hip_f8(float v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0) + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + float, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + float, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + float, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + float, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + } + + // constructor from half + explicit MIOPEN_HIP_HOST_DEVICE + hip_f8(half v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0) + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + half, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + half, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + half, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + half, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + } + template + explicit MIOPEN_HIP_HOST_DEVICE + 
hip_f8(hip_f8 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0) + { + if(T == U) + { + data = v.data; + } + else + { + const auto tmp = static_cast(v); + const auto tmp2 = hip_f8(tmp, rm, rng); + data = tmp2.data; + } + } + + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(hip_f8 v, hip_f8_rounding_mode, uint32_t) + { + this->data = v.data; + } + + // constructor from hip_bfloat16 + explicit MIOPEN_HIP_HOST_DEVICE + hip_f8(hip_bfloat16 v, + hip_f8_rounding_mode r = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + + MIOPEN_HIP_HOST_DEVICE + hip_f8& operator*=(const hip_f8& rhs) + { + const auto tmp = static_cast(*this) * static_cast(rhs); + *this = static_cast(tmp); + return *this; + } + + MIOPEN_HIP_HOST_DEVICE + hip_f8& operator+=(const hip_f8& rhs) + { + const auto tmp = static_cast(*this) + static_cast(rhs); + *this = static_cast(tmp); + return *this; + } + + MIOPEN_HIP_HOST_DEVICE + hip_f8& operator-=(const hip_f8& rhs) + { + const auto tmp = static_cast(*this) - static_cast(rhs); + *this = static_cast(tmp); + return *this; + } + + inline MIOPEN_HIP_HOST_DEVICE hip_f8& operator=(const hip_f8& rhs) + { + if(&rhs != this) + this->data = rhs.data; + return *this; + } + + inline MIOPEN_HIP_HOST_DEVICE bool operator==(const hip_f8& rhs) const + { + if((rhs.is_zero() && this->is_zero()) || + (fabs(rhs - *this) < std::numeric_limits>::epsilon())) + return true; + else if(rhs.is_nan() || rhs.is_inf() || this->is_nan() || this->is_inf()) + return false; + + return false; + } + + inline MIOPEN_HIP_HOST_DEVICE bool operator<(const hip_f8& rhs) const + { + const auto we = static_cast(*this); + const auto them = static_cast(rhs); + return we < them; + } + + inline MIOPEN_HIP_HOST_DEVICE bool operator>(const hip_f8& rhs) const + { + const auto we = static_cast(*this); + const auto them = static_cast(rhs); + return we > them; + } +#if 0 + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator double() + { + // float tmp = static_cast(*this); + // return tmp; + } + + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator double() const + { + // float tmp = static_cast(*this); + // return tmp; + } +#endif + // convert to float + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator float() const + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, float, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, float, false /*negative_zero_nan*/>( + data); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, float, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, float, false /*negative_zero_nan*/>( + data); + } + } + } + + // convert to half + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator half() const + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, half, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, half, false /*negative_zero_nan*/>( + data); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, half, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, half, false /*negative_zero_nan*/>( + data); + } + } + } + + // convert to hip_bfloat16 + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator hip_bfloat16() 
const;
+
+    // check for zero
+    inline MIOPEN_HIP_HOST_DEVICE bool is_zero() const
+    {
+        if(get_hip_f8_bias_mode())
+        {
+            return data == 0x00;
+        }
+        else
+        {
+            return (data == 0x00) || (data == 0x80);
+        }
+    }
+
+    // check for nan
+    inline MIOPEN_HIP_HOST_DEVICE bool is_nan() const
+    {
+        if(get_hip_f8_bias_mode())
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == hip_f8_type::bf8)
+            {
+                return (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+            else
+            {
+                return (data == 0x79) || (data == 0x7a) || (data == 0x7b) || (data == 0x7c) ||
+                       (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xf9) ||
+                       (data == 0xfa) || (data == 0xfb) || (data == 0xfc) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+        }
+    }
+
+    // check for inf
+    inline MIOPEN_HIP_HOST_DEVICE bool is_inf() const
+    {
+        if(get_hip_f8_bias_mode())
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == hip_f8_type::bf8)
+            {
+                return (data == 0x7c) || (data == 0xfc);
+            }
+            else
+            {
+                return (data == 0x78) || (data == 0xf8);
+            }
+        }
+    }
+}; // end of class hip_f8
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator*(miopen_f8::hip_f8<T> lhs,
+                                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    lhs *= rhs;
+    return lhs;
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator+(miopen_f8::hip_f8<T> lhs,
+                                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    lhs += rhs;
+    return lhs;
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator-(miopen_f8::hip_f8<T> lhs,
+                                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    lhs -= rhs;
+    return lhs;
+}
+
+template <miopen_f8::hip_f8_type T, typename U>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator-(U lhs, const miopen_f8::hip_f8<T>& rhs)
+{
+    const auto tmp = static_cast<U>(rhs);
+    return static_cast<miopen_f8::hip_f8<T>>(lhs - tmp);
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE bool operator<(const miopen_f8::hip_f8<T>& lhs,
+                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    return static_cast<float>(lhs) < static_cast<float>(rhs);
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE bool operator>(const miopen_f8::hip_f8<T>& lhs,
+                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    return static_cast<float>(lhs) > static_cast<float>(rhs);
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> fabs(miopen_f8::hip_f8<T> v)
+{
+    v.data = v.data & 0x7f;
+    return v;
+}
+template <class T>
+MIOPEN_HIP_HOST_DEVICE T F8_Max()
+{
+    union
+    {
+        uint8_t bits;
+        T value;
+    } x;
+
+    x.bits = 0x7F;
+    return x.value;
+}
+} // namespace miopen_f8
+
+// define numeric limits for the new data type
+namespace std {
+inline bool isfinite(miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> x) // NOLINT
+{
+    // finite means neither inf nor nan
+    return !(x.is_inf() || x.is_nan());
+}
+
+inline bool isfinite(miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> x) // NOLINT
+{
+    // finite means neither inf nor nan
+    return !(x.is_inf() || x.is_nan());
+}
+
+template <>
+class numeric_limits<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>
+{
+public:
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> epsilon()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(float(0.0625));
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> quiet_NaN()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(
+            static_cast<uint8_t>(miopen_f8::get_hip_f8_bias_mode() ? 0X80 : 0x79));
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> max()
+    {
+        return miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>();
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> min()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(-1.0f) *
+               miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>();
+    }
+};
+
+template <>
+class numeric_limits<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>
+{
+public:
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> epsilon()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(float(0.125));
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> quiet_NaN()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(
+            static_cast<uint8_t>(miopen_f8::get_hip_f8_bias_mode() ?
0X80 : 0x7d)); + } + + static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8 max() + { + return static_cast>( + miopen_f8::F8_Max>()); + } + static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8 min() + { + return static_cast>(-1.0f) * + miopen_f8::F8_Max>(); + } +}; + +} // namespace std + +template +struct hip_f8x4 +{ + // define some convenience types + using float32x2 = float __attribute__((ext_vector_type(2))); + using float32x4 = float __attribute__((ext_vector_type(4))); + + using halfx2 = _Float16 __attribute__((ext_vector_type(2))); + using halfx4 = _Float16 __attribute__((ext_vector_type(4))); + + using hip_bfloat16x2 = uint16_t __attribute__((ext_vector_type(2))); + using hip_bfloat16x4 = uint16_t __attribute__((ext_vector_type(4))); + + uint32_t data; + + // default constructor + MIOPEN_HIP_HOST_DEVICE hip_f8x4() = default; + + // constructor from bits + MIOPEN_HIP_HOST_DEVICE hip_f8x4(uint32_t v); + + // constructor from float + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(float v0, + float v1 = 0, + float v2 = 0, + float v3 = 0, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(float32x2 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(float32x4 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + + // constructor from half + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(half v0, + half v1 = {}, + half v2 = {}, + half v3 = {}, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(halfx2 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(halfx4 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + + // convert to float32x4 + inline MIOPEN_HIP_HOST_DEVICE operator float32x4() const; + + // convert to halfx4 + inline MIOPEN_HIP_HOST_DEVICE operator halfx4() const; +}; + +template +struct hip_f8x8 +{ + // define some convenience types + using f8x8 = hip_f8x4 __attribute__((ext_vector_type(2))); + + f8x8 data; + + // default constructor + MIOPEN_HIP_HOST_DEVICE hip_f8x8() = default; + + // do we need to define other constructors or any conversion routines here? +}; + +// If we do not end up needing either any constructors or conversion routines for the above type, +// then we can simplify the above type to the following +#if USE_SIMPLER_HIP_F8x8 +template +using hip_f8x8 = hip_f8x4 __attribute__((ext_vector_type(2))); +#endif + +using hip_float32x4 = float __attribute__((ext_vector_type(4))); +using hip_float32x16 = float __attribute__((ext_vector_type(16))); + +// these are device-specific and we don't expect them to exist unless we're compiling with hip-clang +// for MI300. +template +__device__ hip_float32x4 mfma_f32_16x16x32(hip_f8x8 a, hip_f8x8 b, hip_float32x4 c); + +template +__device__ hip_float32x16 mfma_f32_32x32x16(hip_f8x8 a, hip_f8x8 b, hip_float32x16 c); + +using float8 = miopen_f8::hip_f8; +using bfloat8 = miopen_f8::hip_f8; diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp index 8d63f21054..9c7bff6992 100644 --- a/src/ocl/tensorocl.cpp +++ b/src/ocl/tensorocl.cpp @@ -1472,12 +1472,12 @@ void SetTensor(const Handle& handle, std::multiplies()); std::size_t wld = 256 < wgd ? 
256 : wgd; - - std::string parms = "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" + - GetDataTypeKernelParams(dataType); + std::stringstream ss; + ss << "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" + << GetDataTypeKernelParams(dataType); for(int i = 0; i < yDim_flat; ++i) { - parms += " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + ss << " -DWORK_LENGTH_" << std::to_string(i) << "=" << std::to_string(worker_sizes[i]); } kernel = handle.AddKernel(kernel_name, @@ -1486,7 +1486,7 @@ void SetTensor(const Handle& handle, kernel_name, {wld, 1, 1}, {wgd, 1, 1}, - parms); + ss.str()); } switch(yDim_flat) @@ -1934,6 +1934,10 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio case miopenHalf: return option += "2"; case miopenFloat: return option += "3"; case miopenBFloat16: return option += "4"; + case miopenFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenFloat8 data type not supported in cast tensor."); + case miopenBFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenBFloat8 data type not supported in cast tensor."); case miopenDouble: // TODO MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); diff --git a/src/pooling.cpp b/src/pooling.cpp index ab928ffd81..91a27f324a 100644 --- a/src/pooling.cpp +++ b/src/pooling.cpp @@ -131,8 +131,8 @@ PoolingDescriptor::GetForwardOutputDim(const TensorDescriptor& xDesc) const assert(stride_h > 0); assert(stride_w > 0); - assert(window_h < (input_h + 2 * pad_h)); - assert(window_w < (input_w + 2 * pad_w)); + assert(window_h < (input_h + static_cast(2) * pad_h)); + assert(window_w < (input_w + static_cast(2) * pad_w)); auto output_h = std::max( 1, ((input_h + 2 * static_cast(pad_h) - window_h) / stride_h + 1)); diff --git a/src/pooling_api.cpp b/src/pooling_api.cpp index 8bfd1ac64d..ef526804cf 100644 --- a/src/pooling_api.cpp +++ b/src/pooling_api.cpp @@ -53,6 +53,8 @@ inline void Pooling_logging_cmd(const miopenPoolingDescriptor_t poolDesc, case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_LOG_W( "Pooing cmd args logging is not implemented properly for " + diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp index 6cb00bc82b..6cf29c2f64 100644 --- a/src/reducetensor.cpp +++ b/src/reducetensor.cpp @@ -208,6 +208,8 @@ inline int GetDataTypeSize(miopenDataType_t t) case miopenHalf: return (2); case miopenFloat: return (4); case miopenDouble: return (8); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: return (1); case miopenInt8x4: return (4); case miopenBFloat16: return (2); @@ -268,8 +270,10 @@ inline int GetDataTypeId(miopenDataType_t t) case miopenDouble: return (static_cast('D')); case miopenInt8: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenInt32: return (static_cast('O')); - default: MIOPEN_THROW("Only float, half, bfloat16 data type is supported."); + default: MIOPEN_THROW("Only float, half, bfloat16, float8, bfloat8 data type is supported."); }; }; @@ -307,6 +311,8 @@ static ck::DataTypeEnum_t mapDataTypeId(miopenDataType_t t) case miopenInt8: return DataTypeEnum_t::Int8; case miopenInt8x4: return DataTypeEnum_t::Int8x4; case miopenInt32: return DataTypeEnum_t::Int32; + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW("Only float, half, double data type is supported."); }; }; @@ -720,6 +726,9 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle, " -DCK_PARAM_DST_DATATYPE=" + 
std::to_string(detailStatic::GetDataTypeId(dstDataType)); param += " -DCK_PARAM_REDUCE_COMPTYPE=" + std::to_string(detailStatic::GetDataTypeId(compType)); + param += + " -DMIOPEN_FP8_IEEE_EXPONENT_BIAS=" + std::to_string(MIOPEN_FP8_IEEE_EXPONENT_BIAS); + param += " -DMIOPEN_FP8_CLIPPING" + std::to_string(MIOPEN_FP8_CLIPPING); param += " -DCK_PARAM_SRC_DESC_LENGTHS="; for(int i = 0; i < inDescLengths.size(); i++) diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 5de1c43c3c..186bc28ff2 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -201,6 +201,8 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context, case miopenInt32: case miopenInt8: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); } return false; @@ -243,6 +245,8 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context, case miopenInt8: case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); } }; diff --git a/src/solver/conv_MP_bidirectional_winograd.cpp b/src/solver/conv_MP_bidirectional_winograd.cpp index fc2d769520..5d638469d5 100644 --- a/src/solver/conv_MP_bidirectional_winograd.cpp +++ b/src/solver/conv_MP_bidirectional_winograd.cpp @@ -329,6 +329,9 @@ bool ConvMPBidirectWinograd::IsA return false; } + if(problem.IsTensorsCasted()) + return false; + if(!IsApplicableGEMM(problem)) return false; diff --git a/src/solver/conv_asm_1x1u.cpp b/src/solver/conv_asm_1x1u.cpp index b71d195667..4da4f6dc02 100644 --- a/src/solver/conv_asm_1x1u.cpp +++ b/src/solver/conv_asm_1x1u.cpp @@ -537,6 +537,9 @@ bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx, if(!(problem.IsFp32() || problem.IsFp16())) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; @@ -551,6 +554,9 @@ bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; + if(name == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; diff --git a/src/solver/conv_asm_1x1u_bias_activ_fused.cpp b/src/solver/conv_asm_1x1u_bias_activ_fused.cpp index c4ae30f859..c935a2aff6 100644 --- a/src/solver/conv_asm_1x1u_bias_activ_fused.cpp +++ b/src/solver/conv_asm_1x1u_bias_activ_fused.cpp @@ -94,9 +94,9 @@ bool ConvBiasActivAsm1x1U::IsValidPerformanceConfig( PerformanceConfigConvBiasActivAsm1x1U ConvBiasActivAsm1x1U::Search(const FusionContext& context, const FusionDescription& problem, - const AnyInvokeParams& invoke_ctx) const + const AnyInvokeParams& invoke_params) const { - return GenericSearch(*this, context, problem, invoke_ctx); + return GenericSearch(*this, context, problem, invoke_params); } ConvSolution @@ -256,6 +256,9 @@ bool ConvBiasActivAsm1x1U::IsApplicable(const FusionContext& context, if(conv_problem.GetDilationH() != 1) return false; + if(conv_problem.IsTensorsCasted()) + return false; + // Check if the conovlution part is applicable return sol.IsApplicable(conv_ctx, conv_problem); } diff --git a/src/solver/conv_asm_1x1u_stride2.cpp b/src/solver/conv_asm_1x1u_stride2.cpp index 9b3dd0462d..ba23b9d9dc 100644 --- a/src/solver/conv_asm_1x1u_stride2.cpp +++ b/src/solver/conv_asm_1x1u_stride2.cpp @@ -496,6 +496,9 @@ bool ConvAsm1x1UV2::IsApplicable(const 
ConvolutionContext& ctx, if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; @@ -510,6 +513,9 @@ bool ConvAsm1x1UV2::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; + const auto elements_in_dword = 4 / GetTypeSize(problem.GetInDataType()); // clang-format off const auto img_hw = problem.GetOutHeight_() * problem.GetOutWidth_(); diff --git a/src/solver/conv_asm_3x3u.cpp b/src/solver/conv_asm_3x3u.cpp index 1ebb39c84f..b185b959af 100644 --- a/src/solver/conv_asm_3x3u.cpp +++ b/src/solver/conv_asm_3x3u.cpp @@ -188,6 +188,9 @@ bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx, if(target.Xnack() && *target.Xnack()) return false; + if(problem.IsTensorsCasted()) + return false; + const std::string name = ctx.GetStream().GetDeviceName(); if(!(StartsWith(name, "gfx8") || StartsWith(name, "gfx90"))) return false; @@ -196,6 +199,9 @@ bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; + constexpr auto GIB = static_cast(1024) * 1024 * 1024; constexpr auto TIB = GIB * 1024; constexpr auto ELEM_SZ = static_cast(sizeof(float)); diff --git a/src/solver/conv_asm_5x10u2v2b1.cpp b/src/solver/conv_asm_5x10u2v2b1.cpp index fe0f0f42a3..6da4863f6d 100644 --- a/src/solver/conv_asm_5x10u2v2b1.cpp +++ b/src/solver/conv_asm_5x10u2v2b1.cpp @@ -74,6 +74,8 @@ bool ConvAsm5x10u2v2b1::IsApplicable(const ExecutionContext& ctx, { return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Min image + padding shall be not smaller than filter matrix. const int min_out_width = 138; diff --git a/src/solver/conv_asm_5x10u2v2f1.cpp b/src/solver/conv_asm_5x10u2v2f1.cpp index 09e2d2abed..74301fe5fd 100644 --- a/src/solver/conv_asm_5x10u2v2f1.cpp +++ b/src/solver/conv_asm_5x10u2v2f1.cpp @@ -76,6 +76,9 @@ bool ConvAsm5x10u2v2f1::IsApplicable(const ExecutionContext& ctx, return false; } + if(problem.IsTensorsCasted()) + return false; + // Min image + padding shall be not smaller than filter matrix. 
const int min_in_width = static_cast(problem.GetWeightsWidth_()) - problem.GetPadW() * 2; const int min_in_height = static_cast(problem.GetWeightsHeight_()) - problem.GetPadH() * 2; diff --git a/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp b/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp index 4310a87fd6..4426a3eeca 100644 --- a/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp +++ b/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp @@ -51,6 +51,9 @@ bool ConvAsm7x7c3h224w224k64u2v2p3q3f1::IsApplicable(const ExecutionContext& ctx if(!ctx.rmv.IsV2orV3()) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; diff --git a/src/solver/conv_asm_dir_BwdWrW1x1.cpp b/src/solver/conv_asm_dir_BwdWrW1x1.cpp index 4cd78b7357..b6b2458157 100644 --- a/src/solver/conv_asm_dir_BwdWrW1x1.cpp +++ b/src/solver/conv_asm_dir_BwdWrW1x1.cpp @@ -484,6 +484,9 @@ bool ConvAsmBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx, if(!ctx.rmv.IsV2orV3()) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; diff --git a/src/solver/conv_asm_dir_BwdWrW3x3.cpp b/src/solver/conv_asm_dir_BwdWrW3x3.cpp index 2781d25d07..7b0b0567d8 100644 --- a/src/solver/conv_asm_dir_BwdWrW3x3.cpp +++ b/src/solver/conv_asm_dir_BwdWrW3x3.cpp @@ -378,6 +378,9 @@ bool ConvAsmBwdWrW3x3::IsApplicable(const ConvolutionContext& ctx, { return false; } + + if(problem.IsTensorsCasted()) + return false; #if WORKAROUND_ISSUE_532 if(StartsWith(name, "gfx9") && (problem.GetKernelStrideW() > 1 || problem.GetKernelStrideH() > 1)) diff --git a/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp b/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp index dbfdb0b69c..9e41d56c82 100644 --- a/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp +++ b/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp @@ -149,6 +149,9 @@ bool ConvAsmImplicitGemmV4R1DynamicBwd::IsApplicable(const ExecutionContext& ctx if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp index caa26d1f23..ee6b16d38b 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp @@ -995,6 +995,9 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlops::IsApplicable(const ExecutionContext if(!problem.IsFp32() && !problem.IsFp16()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 41a2b018fa..71c53f61a9 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -942,6 +942,9 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable( !(problem.IsBfp16() && (device_name == "gfx90a" || StartsWith(device_name, "gfx94")))) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp index 081e12a532..32b50167cf 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp @@ -1521,6 +1521,9 @@ bool 
ConvAsmImplicitGemmGTCDynamicFwdXdlops::IsApplicable(const ExecutionContext if(!problem.IsFp32() && !problem.IsFp16()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp index c52372b6d2..bbedf8d680 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp @@ -567,6 +567,9 @@ bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsApplicable( !(problem.IsFp16() && problem.GetVectorLength() == 8)) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index 601b2e1211..e315fd0895 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -880,6 +880,9 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable( !(problem.IsBfp16() && (device_name == "gfx90a" || StartsWith(device_name, "gfx94")))) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp index 35de228c45..c8dee39a79 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp @@ -870,6 +870,9 @@ bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable( !(problem.IsBfp16() && (device_name == "gfx90a" || StartsWith(device_name, "gfx94")))) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp b/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp index 7846179475..8e1450c7a3 100644 --- a/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp +++ b/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp @@ -295,6 +295,9 @@ bool ConvAsmImplicitGemmV4R1DynamicFwd::IsApplicable(const ExecutionContext& ctx if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp b/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp index 3594b26277..a5d056178d 100644 --- a/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp +++ b/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp @@ -836,6 +836,9 @@ bool ConvAsmImplicitGemmGTCDynamicWrwXdlops::IsApplicable(const ExecutionContext if(!problem.IsFp32() && !problem.IsFp16()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp b/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp index 35020df9b6..fb5f0caf7c 100644 --- a/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp +++ b/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp @@ -318,6 +318,9 @@ bool ConvAsmImplicitGemmV4R1DynamicWrw::IsApplicable(const ExecutionContext& ctx if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_bin_wino3x3U.cpp b/src/solver/conv_bin_wino3x3U.cpp index e42aab012c..c8508cf372 100644 --- a/src/solver/conv_bin_wino3x3U.cpp +++ b/src/solver/conv_bin_wino3x3U.cpp @@ -69,6 +69,9 @@ bool 
ConvBinWinograd3x3U::IsApplicable(const ExecutionContext& ctx, return false; } + if(problem.IsTensorsCasted()) + return false; + // clang-format off return problem.GetPadW() == 1 && problem.GetPadH() == 1 diff --git a/src/solver/conv_bin_winoRxS.cpp b/src/solver/conv_bin_winoRxS.cpp index 8b42bf5899..eb4d7386f1 100644 --- a/src/solver/conv_bin_winoRxS.cpp +++ b/src/solver/conv_bin_winoRxS.cpp @@ -222,6 +222,9 @@ bool ConvBinWinogradRxS::IsApplicable(const ExecutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS{})) return false; if(problem.direction.IsBackwardWrW()) diff --git a/src/solver/conv_bin_winoRxS_fused.cpp b/src/solver/conv_bin_winoRxS_fused.cpp index f184b0e291..f11edc368e 100644 --- a/src/solver/conv_bin_winoRxS_fused.cpp +++ b/src/solver/conv_bin_winoRxS_fused.cpp @@ -87,6 +87,9 @@ bool ConvBinWinogradRxSFused::IsApplicable(const FusionContext& context, size_t padded_y = 0; size_t padded_x = 0; + + if(conv_problem.IsTensorsCasted()) + return false; if(conv_problem.GetKernelStrideH() == 1) { if(y <= 3) diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp index cdb4225b88..ed975285ee 100644 --- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp +++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp @@ -282,6 +282,8 @@ void PerformanceConfigConvCKIgemmFwdBiasActivFused::HeuristicInit( switch(conv_problem.GetInDataType()) { case miopenHalf: Init(conv_problem); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: @@ -335,6 +337,8 @@ bool PerformanceConfigConvCKIgemmFwdBiasActivFused::IsValid( switch(conv_problem.GetInDataType()) { case miopenHalf: return CheckIsSupportCKArgs(conv_problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: @@ -406,6 +410,9 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx, if(activ_op.activMode != miopenActivationRELU) return false; const auto conv_problem = fdesc_problem.GetConvProblem(0, conv::Direction::Forward); + + if(conv_problem.IsTensorsCasted()) + return false; if(conv_problem.GetConv().attribute.deterministic) return false; if(conv_problem.GetInDataType() != conv_problem.GetWeightsDataType() || @@ -422,6 +429,8 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx, switch(conv_problem.GetInDataType()) { case miopenHalf: return CheckCKApplicability(conv_problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: @@ -454,6 +463,8 @@ ConvSolution ConvCKIgemmFwdBiasActivFused::GetSolution( case miopenHalf: RunCKSolution(handle, primitive_parameters, conv_problem, config); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: diff --git a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp index 0c743ee9d7..7ddf2e3049 100644 --- a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp +++ b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp @@ -105,6 +105,9 @@ bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() or problem.IsFp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() != 1) return false; if(ctx.GetStream().GetTargetProperties().Name() == "gfx90a" && diff 
--git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 51e7f0a641..64c95257e6 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include namespace miopen { @@ -133,7 +135,14 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) else MIOPEN_THROW("unsupported tensor layout"); - if(IsInputFp32(problem)) + if(problem.IsFp8() || problem.IsTensorsCasted() || problem.IsBfp8()) + { + kernel_name << miopen::GetDataType(ProblemInterpreter::GetInputDataType(problem)); + kernel_name << "_" << miopen::GetDataType(problem.GetWeightsDataType()); + kernel_name << "_" << miopen::GetDataType(ProblemInterpreter::GetOutputDataType(problem)); + return kernel_name.str(); + } + else if(IsInputFp32(problem)) kernel_name << "float_"; else if(IsInputFp16(problem)) kernel_name << "half_"; @@ -167,18 +176,56 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) return kernel_name.str(); } -std::string ConvDirectNaiveConvKernelFile() { return "naive_conv.cpp"; } +std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx, + const ProblemDescription& problem) +{ + const auto device_name = ctx.GetStream().GetDeviceName(); + // The above function, ConvDirectNaiveConvKernelName is not in sync for the asm kernel, + // resulting in empty code objects. This happens for systems with COv3 as the default type. + // if(device_name == "gfx906" || device_name == "gfx908") + // { + // if(ctx.rmv.IsV3() && problem.IsLayoutDefault() && !problem.IsFp8() && + // !problem.IsTensorsCasted() && !problem.IsBfp8()) + // return "naive_conv_gcn.s"; + // } + if(problem.IsFp8() || problem.IsTensorsCasted() || problem.IsBfp8()) + return "fp8_naive_conv.cpp"; + return "naive_conv.cpp"; +} -std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx) +std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx, + const ProblemDescription& problem) { - std::string filename = ConvDirectNaiveConvKernelFile(); + std::string filename = ConvDirectNaiveConvKernelFile(ctx, problem); if(miopen::EndsWith(filename, ".s")) { std::ostringstream options; GenerateClangDefsym(options, "ROCM_METADATA_VERSION", 5); return options.str(); } - return ctx.general_compile_options; + std::ostringstream ss; + ss << ctx.general_compile_options; + if(problem.IsFp8() || problem.IsTensorsCasted() || problem.IsBfp8()) + { + ss << " -DINPUT_TYPE=" + << miopen::GetDataType(ProblemInterpreter::GetInputDataType(problem)); + ss << " -DWEIGHTS_TYPE=" << miopen::GetDataType(problem.GetWeightsDataType()); + ss << " -DOUTPUT_TYPE=" + << miopen::GetDataType(ProblemInterpreter::GetOutputDataType(problem)); + const auto in_cast_type = problem.GetInCastType(); + if(in_cast_type) + ss << " -DINPUT_CAST_TYPE=" << miopen::GetDataType(*in_cast_type); + const auto wei_cast_type = problem.GetWeightsCastType(); + if(wei_cast_type) + ss << " -DWEIGHTS_CAST_TYPE=" << miopen::GetDataType(*(wei_cast_type)); + const auto out_cast_type = ProblemInterpreter::GetOutputCastType(problem); + if(out_cast_type) + ss << " -DOUTPUT_CAST_TYPE=" << miopen::GetDataType(*out_cast_type); + ss << " -DMIOPEN_FP8_CLIPPING=" << MIOPEN_FP8_CLIPPING; + ss << " -DMIOPEN_FP8_IEEE_EXPONENT_BIAS=" << MIOPEN_FP8_IEEE_EXPONENT_BIAS; + // Let the kernel choose its accumulator (double for naive kernels ) + } + return ss.str(); } bool ConvDirectNaiveConvIsApplicableByKernelType(const 
ExecutionContext& ctx, diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index 077fe550bc..c5d793860c 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -47,11 +47,29 @@ bool ConvDirectNaiveConvBwd::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsLayoutDefault() && !problem.IsLayoutNHWC()) return false; - if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) + if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsFp8() || + problem.IsBfp8())) return false; if(!problem.direction.IsBackwardData()) return false; + if(problem.IsTensorsCasted()) + { + auto test_cast = [&](const TensorDescriptor& desc) { + if(desc.GetCastType()) + { + const auto cast_type = *desc.GetCastType(); + if(cast_type == miopenFloat8 || cast_type == miopenBFloat8) + return false; + } + // all tested tensors must have cast type set + return true; + }; + if(test_cast(problem.GetOut())) + return false; + if(test_cast(problem.GetWeights())) + return false; + } return true; } @@ -104,7 +122,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; - kernel.kernel_file = ConvDirectNaiveConvKernelFile(); + kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -116,7 +134,13 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx); + const auto is_f8 = [&]() { + if(kernel.kernel_file == "fp8_naive_conv.cpp") + return true; + else + return false; + }(); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); if(problem.Is2d()) result.invoker_factory = [=](const std::vector& kernels) { @@ -125,26 +149,49 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - - handle.Run(kern)(tensors.out, - tensors.w, - tensors.in, - hi, - wi, - n, - k_per_group, - c_per_group, - ho, - wo, - sy, - sx, - dy, - dx, - py, - px, - fy, - fx, - group); + if(is_f8) + handle.Run(kern)(tensors.out, + tensors.w, + tensors.in, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + problem.GetConv().attribute.fp8rounding_mode.Get() == + miopenF8RoundingModeStochastic, + problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + else + handle.Run(kern)(tensors.out, + tensors.w, + tensors.in, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group); if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index b3b2da870c..fc8d8e77fd 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -47,12 +47,30 @@ bool ConvDirectNaiveConvFwd::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsLayoutDefault() && !problem.IsLayoutNHWC()) return false; - if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsInt8())) + if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsInt8() || + problem.IsFp8() || problem.IsBfp8())) return false; 
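// For orientation: the fp8 path of these naive solvers is specialized twice.
// At compile time, ConvDirectNaiveConvCompileOption (previous file) emits
// -DINPUT_TYPE / -DWEIGHTS_TYPE / -DOUTPUT_TYPE, the optional *_CAST_TYPE
// defines, and the MIOPEN_FP8_CLIPPING / MIOPEN_FP8_IEEE_EXPONENT_BIAS macros.
// At run time, the is_f8 invoker branches above extend the 19 arguments of the
// non-f8 call (3 tensor pointers plus 16 geometry ints) with two trailing
// parameters: a bool selecting stochastic rounding and the RNG seed. A sketch
// of the matching device entry point in fp8_naive_conv.cpp (the name and
// parameter names are illustrative, not copied from this patch):
//
//   extern "C" __global__ void
//   naive_conv_fwd_nchw_float8_float8_float(
//       /* 3 tensor pointers and 16 geometry ints, as in the non-f8 kernels */
//       bool stochastic, // fp8rounding_mode.Get() == miopenF8RoundingModeStochastic
//       uint32_t seed);  // fp8rounding_mode.GetSeed()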
if(!problem.direction.IsForward()) return false; + if(problem.IsTensorsCasted()) + { + auto test_cast = [&](const TensorDescriptor& desc) { + if(desc.GetCastType()) + { + const auto cast_type = *desc.GetCastType(); + if(cast_type == miopenFloat8 || cast_type == miopenBFloat8) + return false; + } + // all tested tensors must have cast type set + return true; + }; + if(test_cast(problem.GetIn())) + return false; + if(test_cast(problem.GetWeights())) + return false; + } return true; } @@ -104,7 +122,13 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; - kernel.kernel_file = ConvDirectNaiveConvKernelFile(); + kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); + const auto is_f8 = [&]() { + if(kernel.kernel_file == "fp8_naive_conv.cpp") + return true; + else + return false; + }(); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -116,7 +140,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); if(problem.Is2d()) result.invoker_factory = [=](const std::vector& kernels) { @@ -125,26 +149,53 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - - handle.Run(kern)(tensors.in, - tensors.w, - tensors.out, - hi, - wi, - n, - k_per_group, - c_per_group, - ho, - wo, - sy, - sx, - dy, - dx, - py, - px, - fy, - fx, - group); + if(is_f8) + { + handle.Run(kern)(tensors.in, + tensors.w, + tensors.out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + problem.GetConv().attribute.fp8rounding_mode.Get() == + miopenF8RoundingModeStochastic, + problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } + else + { + handle.Run(kern)(tensors.in, + tensors.w, + tensors.out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index f25d3a3baa..2c85949ad4 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -47,11 +47,29 @@ bool ConvDirectNaiveConvWrw::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsLayoutDefault() && !problem.IsLayoutNHWC()) return false; - if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) + if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsFp8() || + problem.IsBfp8())) return false; if(!problem.direction.IsBackwardWrW()) return false; + if(problem.IsTensorsCasted()) + { + auto test_cast = [&](const TensorDescriptor& desc) { + if(desc.GetCastType()) + { + const auto cast_type = *desc.GetCastType(); + if(cast_type == miopenFloat8 || cast_type == miopenBFloat8) + return false; + } + // all tested tensors must have cast type set + return true; + }; + if(test_cast(problem.GetIn())) + return false; + if(test_cast(problem.GetOut())) + return false; + } return true; } @@ -91,7 +109,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; - kernel.kernel_file = ConvDirectNaiveConvKernelFile(); + 
kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -103,7 +121,13 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + const auto is_f8 = [&]() { + if(kernel.kernel_file == "fp8_naive_conv.cpp") + return true; + else + return false; + }(); if(problem.Is2d()) result.invoker_factory = [=](const std::vector& kernels) { @@ -112,26 +136,49 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - - handle.Run(kern)(tensors.x, - tensors.dw, - tensors.dy, - hi, - wi, - n, - k_per_group, - c_per_group, - ho, - wo, - sy, - sx, - dy, - dx, - py, - px, - fy, - fx, - group); + if(is_f8) + handle.Run(kern)(tensors.x, + tensors.dw, + tensors.dy, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + problem.GetConv().attribute.fp8rounding_mode.Get() == + miopenF8RoundingModeStochastic, + problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + else + handle.Run(kern)(tensors.x, + tensors.dw, + tensors.dy, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group); if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp index 0b310ea71a..0b880b2fc8 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp @@ -212,6 +212,8 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -253,6 +255,8 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -303,6 +307,8 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( problem.GetWeightsDataType() != problem.GetOutDataType() || problem.GetInDataType() != problem.GetOutDataType()) return false; + if(problem.IsTensorsCasted()) + return false; if(!problem.direction.IsBackwardData()) return false; if(!problem.Is3d()) @@ -320,6 +326,8 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -347,6 +355,8 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmBwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 9b7079df1d..80b969fcbd 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -211,6 +211,8 @@ void 
PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( case miopenInt8: Init(problem); break; case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenBFloat16: case miopenDouble: break; } @@ -252,6 +254,8 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid( case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenBFloat16: case miopenDouble: break; } @@ -319,6 +323,8 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenBFloat16: case miopenDouble: break; } @@ -347,6 +353,8 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 89081fa339..7292828f69 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -208,6 +208,8 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -249,6 +251,8 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -318,6 +322,8 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -344,6 +350,8 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: default: MIOPEN_THROW(miopenStatusInternalError, diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp index 11b3d4bfbb..7e380f6289 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp @@ -168,6 +168,8 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit( { case miopenHalf: Init(problem); break; case miopenFloat: Init(problem); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt32: case miopenInt8x4: @@ -208,6 +210,8 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid( { case miopenHalf: return CheckIsSupportCKArgs(problem); case miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt32: case miopenInt8x4: @@ -262,6 +266,9 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable( problem.GetWeightsDataType() != problem.GetOutDataType() || problem.GetInDataType() != problem.GetOutDataType()) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.direction.IsBackwardData()) return false; if(!problem.Is2d()) @@ -283,6 +290,8 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable( { case miopenHalf: return 
CheckCKApplicability(problem); case miopenFloat: return CheckCKApplicability(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt32: case miopenInt8x4: @@ -312,6 +321,8 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp index ec3ea2fc65..c389cb0cee 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp @@ -649,6 +649,9 @@ bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ConvolutionContext& ctx, if(!(problem.IsFp32() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() != 1) return false; if(!IsIndexRangeLargeEnough(problem)) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp index 1075556d39..5b533d72ee 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp @@ -784,6 +784,9 @@ bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ConvolutionContext if(!problem.Is2d()) return false; + if(problem.IsTensorsCasted()) + return false; + if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp index 79250a6b61..65f8cf6675 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp @@ -754,6 +754,9 @@ bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(problem.GetGroupCount() != 1) return false; diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp index 8a147585ab..3e85e4c966 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp @@ -840,6 +840,9 @@ bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsApplicable(const ConvolutionContext return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!IsApplicableXdlops(ctx, problem)) return false; if(!IsIndexRangeLargeEnough(problem)) diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp index 99edfd139d..773f5a1d32 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp @@ -66,6 +66,9 @@ bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ConvolutionContext& ctx, if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; + if(problem.IsTensorsCasted()) + return false; + std::size_t n = problem.GetBatchSize_(); std::size_t k = problem.GetOutChannels_() / problem.GetGroupCount(); std::size_t c = problem.GetInChannels_() / problem.GetGroupCount(); @@ -88,6 +91,8 @@ bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1{})) return false; + 
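// For reference, the IsTensorsCasted() predicate consulted throughout these
// IsApplicable checks is added to the convolution problem description by this
// patch. A minimal sketch of its likely shape, mirroring the truthiness tests
// on GetCastType() used elsewhere in this diff (illustrative, not copied from
// the patch):
//
//   bool ProblemDescription::IsTensorsCasted() const
//   {
//       return GetIn().GetCastType() || GetWeights().GetCastType() ||
//              GetOut().GetCastType();
//   }
//
// Solvers without FP8-aware kernels reject such problems up front.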
if(ThisSolverIsDeprecatedStatic::IsDisabled(ctx)) + return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; if(!problem.direction.IsBackwardWrW()) @@ -104,6 +109,8 @@ bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, return false; if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; + if(problem.IsTensorsCasted()) + return false; // retrieve dimension from ProblemDescription // remember: ProblemDescription has swapped some dimensions for you! diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp index 15f247e7d3..07fbae436e 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp @@ -598,6 +598,9 @@ bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ConvolutionContext& ctx, if(!IsIndexRangeLargeEnough(problem)) return false; + if(problem.IsTensorsCasted()) + return false; + int gemm_m = 0; int gemm_n = 0; int gemm_k = 0; diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp index edcce82e68..795e3d1704 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp @@ -990,6 +990,9 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ConvolutionContext if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + if(problem.IsTensorsCasted()) + return false; + if(!problem.direction.IsForward()) return false; diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp index 95fed60757..7fa139d21a 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp @@ -1047,6 +1047,9 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsApplicable( if(!ctx.use_hip_kernels) return false; + if(problem.IsTensorsCasted()) + return false; + if(!IsComposableKernelSupportedHardware(ctx)) return false; diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp index 71f97ff3b2..9bd59d36b9 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp @@ -1020,6 +1020,9 @@ bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ConvolutionContext if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + if(problem.IsTensorsCasted()) + return false; + const auto y = ProblemInterpreter::GetFilterHeightY(problem); const auto x = ProblemInterpreter::GetFilterWidthX(problem); diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp index 762893bf9e..ae2395dd0a 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp @@ -169,6 +169,8 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit( case miopenInt8: Init(problem); break; case miopenHalf: Init(problem); break; case miopenFloat: Init(problem); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt32: case miopenInt8x4: case miopenBFloat16: @@ -210,6 +212,8 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid( case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenHalf: return CheckIsSupportCKArgs(problem); case 
miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt32: case miopenInt8x4: case miopenBFloat16: @@ -278,6 +282,8 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() > 1) return false; switch(problem.GetInDataType()) @@ -285,6 +291,8 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable( case miopenInt8: return CheckCKApplicability(problem); case miopenHalf: return CheckCKApplicability(problem); case miopenFloat: return CheckCKApplicability(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt32: case miopenInt8x4: case miopenBFloat16: @@ -315,6 +323,8 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp index 1763d30d6b..b51031780f 100644 --- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp @@ -200,6 +200,8 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::HeuristicInit( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -240,6 +242,8 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::IsValid( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -284,6 +288,8 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL if(miopen::IsDisabled(MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS{})) return false; + if(problem.IsTensorsCasted()) + return false; if(problem.GetConv().attribute.deterministic) return false; if(problem.GetInDataType() != problem.GetWeightsDataType() || @@ -307,6 +313,8 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -334,6 +342,8 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp index 331e8a14c2..0a33f611c0 100644 --- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp +++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp @@ -596,6 +596,9 @@ bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ConvolutionContext& ctx, return false; if(!problem.IsFp32()) return false; + + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() != 1) return false; if(!IsIndexRangeLargeEnough(problem)) diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp index f6c4847551..070ad3615f 100644 --- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp @@ -1072,6 +1072,9 @@ bool 
ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ConvolutionContext& ct if(!IsIndexRangeLargeEnough(problem)) return false; + if(problem.IsTensorsCasted()) + return false; + if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp index 9f46af0245..7b4295df35 100644 --- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp +++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp @@ -1132,6 +1132,9 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsApplicable( if(!problem.Is2d()) return false; + if(problem.IsTensorsCasted()) + return false; + if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; diff --git a/src/solver/conv_mlir_igemm_bwd.cpp b/src/solver/conv_mlir_igemm_bwd.cpp index 6fa2b2e7f4..58787c2532 100644 --- a/src/solver/conv_mlir_igemm_bwd.cpp +++ b/src/solver/conv_mlir_igemm_bwd.cpp @@ -49,6 +49,8 @@ bool ConvMlirIgemmBwd::IsApplicable(const ConvolutionContext& ctx, return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Note: ConvMlirIgemmBwd can run on a machine with xdlops support, however, it is // guaranteed to be slower than its xdlops alternative, therefore disabling it to // save compilation overhead diff --git a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp index c55d89464a..a4a9549db8 100644 --- a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp @@ -50,6 +50,8 @@ bool ConvMlirIgemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!problem.direction.IsBackwardData()) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; diff --git a/src/solver/conv_mlir_igemm_fwd.cpp b/src/solver/conv_mlir_igemm_fwd.cpp index 3a1eb3068d..c4983852b1 100644 --- a/src/solver/conv_mlir_igemm_fwd.cpp +++ b/src/solver/conv_mlir_igemm_fwd.cpp @@ -169,6 +169,8 @@ bool ConvMlirIgemmFwd::IsApplicable(const ConvolutionContext& ctx, return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Note: ConvMlirIgemmFwd can run on a machine with xdlops support, however, it is // guaranteed to be slower than its xdlops alternative, therefore disabling it to // save compilation overhead diff --git a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp index 692b2aeba2..8256213623 100644 --- a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp @@ -66,6 +66,8 @@ bool ConvMlirIgemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; return MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, problem, true)); #else std::ignore = ctx; diff --git a/src/solver/conv_mlir_igemm_wrw.cpp b/src/solver/conv_mlir_igemm_wrw.cpp index bb1e1229b2..d5f88dcaea 100644 --- a/src/solver/conv_mlir_igemm_wrw.cpp +++ b/src/solver/conv_mlir_igemm_wrw.cpp @@ -50,6 +50,8 @@ bool ConvMlirIgemmWrW::IsApplicable(const ConvolutionContext& ctx, return false; 
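// Note the two distinct FP8 conditions being rejected in these MLIR solvers:
// problem.IsFp8()/problem.IsBfp8() hold when the tensors are natively declared
// as miopenFloat8/miopenBFloat8, whereas problem.IsTensorsCasted() holds when
// wider tensors request on-the-fly down-conversion via a cast type set on the
// descriptor. Backends without FP8 kernels must reject both, hence the combined
// guard used here (paraphrased from the hunks in this patch):
//
//   if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8())
//       return false; // no FP8 kernels in this backend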
if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Note: ConvMlirIgemmWrW can run on a machine with xdlops support, however, it is // guaranteed to be slower than its xdlops alternative, therefore disabling it to // save compilation overhead diff --git a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp index 34c99d39cd..2f3bc63f50 100644 --- a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp @@ -51,6 +51,8 @@ bool ConvMlirIgemmWrWXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!problem.direction.IsBackwardWrW()) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; diff --git a/src/solver/conv_multipass_wino3x3WrW.cpp b/src/solver/conv_multipass_wino3x3WrW.cpp index e41a434253..233489c4fc 100644 --- a/src/solver/conv_multipass_wino3x3WrW.cpp +++ b/src/solver/conv_multipass_wino3x3WrW.cpp @@ -438,6 +438,9 @@ bool ConvWinograd3x3MultipassWrW return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2D11x11.cpp b/src/solver/conv_ocl_dir2D11x11.cpp index ec56fe9f56..145cf9ed13 100644 --- a/src/solver/conv_ocl_dir2D11x11.cpp +++ b/src/solver/conv_ocl_dir2D11x11.cpp @@ -51,6 +51,9 @@ bool ConvOclDirectFwd11x11::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp index 562e98c366..eded6fddf8 100644 --- a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp +++ b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp @@ -67,6 +67,9 @@ bool ConvOclBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted()) + return false; + bool result = (problem.GetWeightsWidth_() == 1 && problem.GetWeightsHeight_() == 1 && problem.GetDilationW() == 1 && problem.GetDilationH() == 1 && problem.GetGroupCount() == 1); diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp index d24eb17320..d4e79250e1 100644 --- a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp +++ b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp @@ -468,6 +468,9 @@ bool ConvOclBwdWrW2::IsApplicableBase(const ConvolutionContext& c return false; } + if(problem.IsTensorsCasted()) + return false; + return problem.GetDilationW() == 1 && problem.GetDilationH() == 1 && #if 0 // There is a stronger restriction than this one, which make this one unnecessary. 
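/* The MLIR solvers reject three related situations in one guard: tensors tagged
 * with a cast type, and tensors natively stored as FP8 or BFP8, since the MLIR
 * kernel generator has no 8-bit float support. IsFp8()/IsBfp8() are plain
 * storage-type tests, distinct from the cast-type test; plausibly of the shape:
 *
 *     bool IsFp8() const
 *     {
 *         return GetInDataType() == miopenFloat8 ||
 *                GetWeightsDataType() == miopenFloat8 ||
 *                GetOutDataType() == miopenFloat8;
 *     }
 *     // IsBfp8() would be the same test against miopenBFloat8
 */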
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp index e2ae607157..2db6109bf5 100644 --- a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp +++ b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp @@ -55,6 +55,9 @@ bool ConvOclBwdWrW53::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.direction.IsBackwardWrW()) return false; if(!problem.IsLayoutDefault()) diff --git a/src/solver/conv_ocl_dir2Dfwd.cpp b/src/solver/conv_ocl_dir2Dfwd.cpp index 18086410da..70005e606e 100644 --- a/src/solver/conv_ocl_dir2Dfwd.cpp +++ b/src/solver/conv_ocl_dir2Dfwd.cpp @@ -52,6 +52,9 @@ bool ConvOclDirectFwd::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2Dfwd1x1.cpp b/src/solver/conv_ocl_dir2Dfwd1x1.cpp index 9c6f392821..71dc41fc65 100644 --- a/src/solver/conv_ocl_dir2Dfwd1x1.cpp +++ b/src/solver/conv_ocl_dir2Dfwd1x1.cpp @@ -61,6 +61,9 @@ bool ConvOclDirectFwd1x1::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2Dfwdgen.cpp b/src/solver/conv_ocl_dir2Dfwdgen.cpp index 06399252ea..80fb0708e1 100644 --- a/src/solver/conv_ocl_dir2Dfwdgen.cpp +++ b/src/solver/conv_ocl_dir2Dfwdgen.cpp @@ -49,6 +49,9 @@ bool ConvOclDirectFwdGen::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_winoRxS.cpp b/src/solver/conv_winoRxS.cpp index 5bce82163c..cdff22ac1d 100644 --- a/src/solver/conv_winoRxS.cpp +++ b/src/solver/conv_winoRxS.cpp @@ -624,6 +624,9 @@ static bool IsApplicableBase(const ConvolutionContext& ctx, const ProblemDescrip return false; if(!(problem.IsFp32() || problem.IsFp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!ctx.use_asm_kernels) return false; if(!ctx.rmv.IsV3()) diff --git a/src/solver/conv_winoRxS_fused.cpp b/src/solver/conv_winoRxS_fused.cpp index e242d5d0d8..377656cfb7 100644 --- a/src/solver/conv_winoRxS_fused.cpp +++ b/src/solver/conv_winoRxS_fused.cpp @@ -177,6 +177,9 @@ bool ConvBinWinogradRxSf2x3g1Fused::IsApplicable(const FusionContext& context, return false; // clang-format on + if(conv_problem.IsTensorsCasted()) + return false; + const auto group_count = conv_problem.GetGroupCount(); if(group_count != 1) return false; diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp index 2634eb3282..7ad238e7fa 100644 --- a/src/solver/gemm.cpp +++ b/src/solver/gemm.cpp @@ -94,6 +94,42 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx, yDesc.GetType() != miopenInt8x4) return false; } + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); + if(problem.IsTensorsCasted()) + { + if(!rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not supported with casted tensors on this GPU architecture"); + return false; + } + if(xDesc.GetCastType() && wDesc.GetCastType()) + { + const auto x_cast_type = xDesc.GetCastType(); + const 
auto w_cast_type = wDesc.GetCastType(); + if(x_cast_type != miopenFloat8 && x_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + if(w_cast_type != miopenFloat8 && w_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + } + else + { + MIOPEN_LOG_I("Both the input and weights tensors need to be casted"); + return false; + } + } + if(problem.IsFp8() && !rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not applicable for F8 on this GPU architecture"); + return false; + } return problem.GetDirection() == conv::Direction::Forward && problem.IsLayoutDefault() && !(IsAnyBufferBF16(xDesc, yDesc, wDesc) && !IsBf16Supported) && !(IsAnyBufferFp16(xDesc, yDesc, wDesc) && !IsFp16Supported); @@ -274,11 +310,20 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, decltype(auto) wDesc = problem.GetWeights(); decltype(auto) yDesc = problem.GetOut(); - const GemmDescriptor gemm_desc = [&]() { + const GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = conv.group_count > 1 ? CreateGemmDescriptorGroupConvCNHWFwd(wDesc, xDesc, yDesc, conv.group_count) : CreateGemmDescriptorConvCNHWFwd(wDesc, xDesc, yDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -377,7 +422,11 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, } miopenStatus_t gemm_status; - + auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { if(group_count > 1) @@ -390,8 +439,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, 0, workSpace, x_t_size, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -404,8 +452,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, wksp_offset, workSpace, x_t_size, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } } else @@ -421,8 +468,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, x_t_size, time_precision, group_count > 1 ? 
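/* A refactor visible here and repeated in every GEMM solver below: the gfx90a
 * FP16-alt flag used to be the trailing argument of the CallGemm* entry points;
 * it now travels inside GemmDescriptor. The descriptor is built once, outside
 * the invoker, and each invocation stamps the per-invoke flag into a cheap copy:
 *
 *     const auto gemm_desc = [&]() {
 *         auto tmp = tmp_gemm_desc;                         // precomputed part
 *         tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt;  // per-invoke knob
 *         return tmp;
 *     }();
 *     CallGemm(handle, gemm_desc, w, 0, x, 0, y, 0, GemmBackend_t::rocblas);
 *
 * This keeps the call sites short and attaches the cast types and convolution
 * attributes in exactly one place.
 */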
callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -524,6 +570,8 @@ bool GemmFwd1x1_0_1_int8::IsApplicable(const ExecutionContext& context, const auto spatial_dim = conv.GetSpatialDimension(); const auto wei_spatial = boost::adaptors::slice(wDesc.GetLengths(), 2, 2 + spatial_dim); + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; return miopen::all_of(wei_spatial, [](auto v) { return v == 1; }) && miopen::all_of(conv.GetConvPads(), [](auto v) { return v == 0; }) && @@ -562,9 +610,18 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, solution.workspace_sz = workspace_req; TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); - const GemmDescriptor gemm_desc = [&]() { + const GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = CreateGemmDescriptorConvFwd(wDesc, xDesc, yDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *xDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *wDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); const auto x_type = xDesc.GetType(); @@ -601,7 +658,11 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, miopenStatus_t gemm_status = miopenStatusNotInitialized; float time = 0; const auto runs = conv_params.type == InvokeType::Run ? in_n : 1; - + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); for(std::size_t i = 0; i < runs; i++) { std::size_t out_offset = i * wei_k * out_spatial_size; @@ -623,8 +684,7 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, 0, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -638,8 +698,7 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, out_offset, time_precision, callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -727,9 +786,18 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, if(group_count > 1) { - GemmDescriptor gemm_desc = [&]() { + const GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = CreateGemmDescriptorGroupConvFwd(wDesc, xDesc, yDesc, group_count); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -765,6 +833,11 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, : conv_params.type == InvokeType::Run ? 
in_n : 1; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); for(std::size_t i = 0; i < runs; i++) { std::size_t out_offset = i * wei_k * out_spatial_size; @@ -780,8 +853,7 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, in_offset, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -795,8 +867,7 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, out_offset, time_precision, callGemmStridedBatched, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -829,9 +900,20 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, else { // tensors.y = tensors.w * tensors.x - GemmDescriptor gemm_desc = - CreateGemmStridedBatchedDescriptorConv1x1Fwd(wDesc, xDesc, yDesc); - gemm_desc.deterministic = problem.GetConv().attribute.deterministic; + const GemmDescriptor tmp_gemm_desc = [&]() { + auto tmp = CreateGemmStridedBatchedDescriptorConv1x1Fwd(wDesc, xDesc, yDesc); + tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; + return tmp; + }(); const auto in_spatial = std::vector(in_spatial_.begin(), in_spatial_.end()); const auto out_spatial = std::vector(out_spatial_.begin(), out_spatial_.end()); @@ -854,18 +936,15 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, // tensors.y = tensors.w * tensors.x miopenStatus_t gemm_status; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { - gemm_status = CallGemmStridedBatched(handle, - gemm_desc, - w, - 0, - x, - 0, - y, - 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + gemm_status = CallGemmStridedBatched( + handle, gemm_desc, w, 0, x, 0, y, 0, GemmBackend_t::rocblas); } else { @@ -879,8 +958,7 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, 0, time_precision, callGemmStridedBatched, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -1035,11 +1113,20 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, solution.workspace_sz = workspace_req; solution.invoker_factory = [=](const std::vector&) { - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = conv.group_count > 1 ? 
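/* On the a_cast_type / b_cast_type assignments in these forward solvers: the
 * GEMM computes y = w * x, so operand A is the weights tensor and operand B the
 * input tensor, each inheriting the cast type of the matching descriptor. Since
 * IsApplicable() has already guaranteed that both descriptors are casted, the
 * intent reduces to (redundant guards elided for clarity):
 *
 *     if(problem.IsTensorsCasted())
 *     {
 *         tmp.a_cast_type = *wDesc.GetCastType(); // A = weights in y = w * x
 *         tmp.b_cast_type = *xDesc.GetCastType(); // B = input
 *     }
 */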
CreateGemmDescriptorGroupConvFwd(wDesc, xDesc, yDesc, conv.group_count) : CreateGemmDescriptorConvFwd(wDesc, xDesc, yDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -1124,6 +1211,11 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, miopenStatus_t gemm_status = miopenStatusNotInitialized; // tensors.y = tensors.w * Im2Col(tensors.x) + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type != InvokeType::Run) { gemm_status = CallGemmTimeMeasure(handle, @@ -1137,8 +1229,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, time_precision, conv.group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -1151,8 +1242,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, 0, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); else gemm_status = CallGemm(handle, gemm_desc, @@ -1162,8 +1252,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, wksp_offset, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) diff --git a/src/solver/gemm_bwd.cpp b/src/solver/gemm_bwd.cpp index 697583742c..df7d08304b 100644 --- a/src/solver/gemm_bwd.cpp +++ b/src/solver/gemm_bwd.cpp @@ -100,9 +100,45 @@ bool GemmBwdBase::IsApplicable(const ExecutionContext& ctx, #if MIOPEN_USE_GEMM if(conv::solver::gemm::IsWorkaroundIssue1315(ctx)) return false; - const auto& dyDesc = problem.GetIn(); - const auto& wDesc = problem.GetWeights(); - const auto& dxDesc = problem.GetOut(); + const auto& dyDesc = problem.GetIn(); + const auto& wDesc = problem.GetWeights(); + const auto& dxDesc = problem.GetOut(); + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); + if(problem.IsTensorsCasted()) + { + if(!rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not supported with casted tensors on this GPU architecture"); + return false; + } + if(dyDesc.GetCastType() && wDesc.GetCastType()) + { + const auto a_cast_type = dyDesc.GetCastType(); + const auto b_cast_type = wDesc.GetCastType(); + if(a_cast_type != miopenFloat8 && a_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + if(b_cast_type != miopenFloat8 && b_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + } + else + { + MIOPEN_LOG_I("Both the output and weights tensors need to be casted"); + return false; + } + } + if(problem.IsFp8() && !rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not applicable for F8 on this GPU architecture"); + return false; + } return problem.GetDirection() == conv::Direction::BackwardData && problem.IsLayoutDefault() && !(IsAnyBufferBF16(dxDesc, dyDesc, wDesc) && !IsBf16Supported) && !(IsAnyBufferFp16(dxDesc, dyDesc, wDesc) && !IsFp16Supported); @@ -242,12 +278,21 @@ ConvSolution 
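/* GemmFwdRest above is the catch-all im2col path. Per batch element the input
 * is lowered into workspace and multiplied by the flattened filter; roughly,
 * for group_count == 1:
 *
 *     // w          : [K, C*Y*X]       (filter, flattened)
 *     // Im2Col(x_i): [C*Y*X, Ho*Wo]   (lowered input patch matrix)
 *     // y_i        : [K, Ho*Wo] = w * Im2Col(x_i)
 *
 * The FP8 cast types ride along on the GEMM descriptor exactly as in the 1x1
 * special cases, so rocBLAS sees the same casting request on every path.
 */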
GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, const auto group_count = conv.group_count; - GemmDescriptor gemm_desc = [&]() { + GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? CreateGemmDescriptorGroupConvCNHWBwdData(wDesc, dyDesc, dxDesc, group_count) : CreateGemmDescriptorConvCNHWBwdData(wDesc, dyDesc, dxDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *dyDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); std::size_t in_n, in_c; @@ -322,6 +367,11 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, miopenStatus_t gemm_status; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { if(group_count > 1) @@ -333,8 +383,7 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, 0, workspace, dyDesc_.GetElementSize(), - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); else // tensors.dx = CNHW2NCHW(transpose(tensors.w) * NCHW2CNHW(tensors.dy)) gemm_status = CallGemm(handle, @@ -345,8 +394,7 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, 0, workspace, dyDesc_.GetElementSize(), - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -361,8 +409,7 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, dyDesc_.GetElementSize(), time_precision, group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -469,12 +516,21 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, const auto in_n = dxDesc.GetLengths()[0]; // dx = transpose(w) * dy - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? 
CreateGemmDescriptorGroupConvBwdData(wDesc, dyDesc, dxDesc, group_count) : CreateGemmStridedBatchedDescriptorConv1x1BwdData(wDesc, dyDesc, dxDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *dyDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -506,6 +562,11 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, miopenStatus_t gemm_status = miopenStatusUnknownError; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { if(group_count > 1) @@ -525,8 +586,7 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, out_offset, dx, in_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(handle.IsProfilingEnabled()) { @@ -538,16 +598,8 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, } else { - gemm_status = CallGemmStridedBatched(handle, - gemm_desc, - w, - 0, - dy, - 0, - dx, - 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + gemm_status = CallGemmStridedBatched( + handle, gemm_desc, w, 0, dy, 0, dx, 0, GemmBackend_t::rocblas); } } else @@ -562,8 +614,7 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, 0, time_precision, callGemmStridedBatched, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -665,11 +716,20 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, const auto out_spatial = std::vector(out_spatial_.begin(), out_spatial_.end()); // dx = transpose(w) * dy - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? 
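/* The backward-data solvers mirror the forward operand mapping with the weights
 * consumed transposed: dx = transpose(w) * dy, hence a_cast_type comes from the
 * weights descriptor and b_cast_type from the dy descriptor. Shape-wise, per
 * batch element:
 *
 *     // w^T  : [C*Y*X, K]
 *     // dy_i : [K, Ho*Wo]
 *     // dx_i : [C*Y*X, Ho*Wo] = w^T * dy_i
 *     // (Col2Im scatters the result back on the non-1x1 path)
 */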
CreateGemmDescriptorGroupConvBwdData(wDesc, dyDesc, dxDesc, group_count) : CreateGemmDescriptorConvBwdData(wDesc, dyDesc, dxDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *dyDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); const auto spatial_dims = conv.GetSpatialDimension(); @@ -718,6 +778,11 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, std::to_string(workspace_size) + " < " + std::to_string(workspace_req) + ")"); + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { float time_gemm = 0; @@ -739,8 +804,7 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, out_offset, workspace, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); else gemm_status = CallGemm(handle, gemm_desc, @@ -750,8 +814,7 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, out_offset, workspace, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) MIOPEN_THROW("GemmBwdRest execution failure."); @@ -795,8 +858,7 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, 0, time_precision, group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) MIOPEN_THROW("GemmBwdRest execution failure."); diff --git a/src/solver/gemm_wrw.cpp b/src/solver/gemm_wrw.cpp index 26a5f582c0..1bda06eca6 100644 --- a/src/solver/gemm_wrw.cpp +++ b/src/solver/gemm_wrw.cpp @@ -67,9 +67,45 @@ bool GemmWrwBase::IsApplicable(const ExecutionContext& ctx, #if MIOPEN_USE_GEMM if(conv::solver::gemm::IsWorkaroundIssue1315(ctx)) return false; - const auto& dyDesc = problem.GetIn(); - const auto& dwDesc = problem.GetWeights(); - const auto& xDesc = problem.GetOut(); + const auto& dyDesc = problem.GetIn(); + const auto& dwDesc = problem.GetWeights(); + const auto& xDesc = problem.GetOut(); + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); + if(problem.IsTensorsCasted()) + { + if(!rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not supported with casted tensors on this GPU architecture"); + return false; + } + if(xDesc.GetCastType() && dyDesc.GetCastType()) + { + const auto a_cast_type = xDesc.GetCastType(); + const auto b_cast_type = dyDesc.GetCastType(); + if(a_cast_type != miopenFloat8 && a_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + if(b_cast_type != miopenFloat8 && b_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + } + else + { + MIOPEN_LOG_I("Both the input and output tensors need to be casted"); + return false; + } + } + if(problem.IsFp8() && !rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not applicable for F8 on this GPU architecture"); + return false; + } return problem.GetDirection() == conv::Direction::BackwardWeights && problem.IsLayoutDefault() && !(IsAnyBufferBF16(xDesc, dyDesc, dwDesc) &&
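/* For weight gradients the roles flip once more: as the comments in this file
 * put it, dw = sum_over_batch(dy[i] * transpose(x[i])), so operand A is dy and
 * operand B is x, and the cast types are taken from dyDesc and xDesc below.
 * Shape-wise:
 *
 *     // dy_i  : [K, Ho*Wo]
 *     // x_i^T : [Ho*Wo, C*Y*X]   (transpose(Im2Col(x_i)) on the general path)
 *     // dw   += dy_i * x_i^T -> [K, C*Y*X]
 */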
!IsBF16PathValid) && @@ -171,11 +207,20 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, } // dw = sum_over_batch(dy[i] * transpose(x[i])), i is batch id - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? CreateGemmDescriptorGroupConvBwdWeight(dyDesc, xDesc, dwDesc, group_count) : CreateGemmStridedBatchedDescriptorConv1x1BwdWeight(dyDesc, xDesc, dwDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *dyDesc.GetCastType(); + if(xDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -216,6 +261,11 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, MIOPEN_LOG_FUNCTION("conv, 1x1"); } + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type != InvokeType::Run) { const auto status = CallGemmTimeMeasure( @@ -229,8 +279,7 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, 0, time_precision, group_count > 1 ? callGemmStridedBatched : callGemmStridedBatchedSequential, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); @@ -266,8 +315,7 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, in_offset, dw, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); @@ -285,16 +333,8 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, else { // dw = sum_over_batch(dy[i] * transpose(x[i])), i is batch id - const auto status = CallGemmStridedBatchedSequential(handle, - gemm_desc, - dy, - 0, - x, - 0, - dw, - 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + const auto status = CallGemmStridedBatchedSequential( + handle, gemm_desc, dy, 0, x, 0, dw, 0, GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); @@ -373,11 +413,20 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, const auto group_count = conv.group_count; // dw = dy * transpose(Im2Col(x)) - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? 
CreateGemmDescriptorGroupConvBwdWeight(dyDesc, xDesc, dwDesc, group_count) : CreateGemmDescriptorConvBwdWeight(dyDesc, xDesc, dwDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *dyDesc.GetCastType(); + if(xDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -439,6 +488,11 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, std::to_string(workspace_req) + ")"); } + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { // Zeroing out the output buffer @@ -478,8 +532,7 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, 0, dw, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -492,8 +545,7 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, 0, dw, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(status != miopenStatusSuccess) @@ -539,8 +591,7 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, 0, time_precision, group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp index c9250fe26f..eaaa0e42c3 100644 --- a/src/solver/mlir_common.cpp +++ b/src/solver/mlir_common.cpp @@ -58,6 +58,8 @@ static const char* DTypeName(miopenDataType_t ty) case miopenInt32: return "i32"; case miopenInt8: return "i8"; case miopenInt8x4: return "i8x4"; + case miopenFloat8: return "fp8"; + case miopenBFloat8: return "bfp8"; } MIOPEN_THROW(miopenStatusInternalError, "Value outside of datatype enum"); } diff --git a/src/tensor.cpp b/src/tensor.cpp index 877e319c33..ca4f1afc7a 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -47,6 +47,8 @@ bool IsDataTypeSupported(miopenDataType_t t) case miopenHalf: case miopenFloat: case miopenInt32: + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt8x4: case miopenBFloat16: @@ -338,6 +340,13 @@ std::size_t TensorDescriptor::GetElementSize() const miopenDataType_t TensorDescriptor::GetType() const { return this->type; } +std::optional TensorDescriptor::GetCastType() const { return this->cast_type; } + +void TensorDescriptor::SetCastType(const miopenDataType_t cast_type_) +{ + this->cast_type = cast_type_; +} + miopenTensorLayout_t TensorDescriptor::GetLayout_t() const { return this->tensorLayout; } std::string TensorDescriptor::GetLayout_str() const @@ -456,6 +465,18 @@ std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t) stream << "packed" << ", "; + if(t.cast_type) + { + stream << "cast_type: "; + const auto ct = *t.cast_type; + if(ct == miopenFloat8) + stream << "miopenFloat8"; + else if(ct == miopenBFloat8) + stream << "miopenBFloat8"; + else + stream << "Other"; + } + return stream; } diff --git a/src/tensor_api.cpp b/src/tensor_api.cpp index 76742a1084..307395f59d 100644 --- a/src/tensor_api.cpp +++ b/src/tensor_api.cpp @@ -199,6 +199,37 @@ extern "C" miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t ten }); } +extern "C" miopenStatus_t 
miopenSetTensorCastType(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t cast_type) +{ + if(miopen::IsLoggingFunctionCalls()) + { + MIOPEN_LOG_FUNCTION(tensorDesc, cast_type); + } + + return miopen::try_([&] { miopen::deref(tensorDesc).SetCastType(cast_type); }); +} + +extern "C" miopenStatus_t miopenGetTensorCastType(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t& cast_type) +{ + if(miopen::IsLoggingFunctionCalls()) + { + MIOPEN_LOG_FUNCTION(tensorDesc); + } + return miopen::try_([&] { + const auto c_type = miopen::deref(tensorDesc).GetCastType(); + if(c_type) + { + cast_type = *c_type; + } + else + { + cast_type = miopen::deref(tensorDesc).GetType(); + } + }); +} + extern "C" miopenStatus_t miopenGetTensorNumBytes(miopenTensorDescriptor_t tensorDesc, size_t* numBytes) { diff --git a/test/conv_common.hpp b/test/conv_common.hpp index db3da76e3b..3d510bb21b 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -380,7 +380,7 @@ tensor ref_conv_fwd(const tensor& input, auto rout = out; if(filter.mode == miopenTranspose) { - std::fill(rout.begin(), rout.end(), 0); + std::fill(rout.begin(), rout.end(), static_cast(0)); bool gpu_ref_used = gpu_ref_convolution_bwd(rout, weights, input, filter); if(!gpu_ref_used) { @@ -440,27 +440,92 @@ tensor ref_conv_wrw(const tensor& input, } template -tensor ref_conv_bwd(const tensor& input, +tensor ref_conv_bwd(const tensor& input, const tensor& weights, - const tensor& out, + const tensor& out, const miopen::ConvolutionDescriptor& filter) { - auto rin = input; - std::fill(rin.begin(), rin.end(), 0); - bool gpu_ref_used = gpu_ref_convolution_bwd(rin, weights, out, filter); - if(!gpu_ref_used) + auto rinput = input; + + std::fill(rinput.begin(), rinput.end(), 0); + + if(filter.mode == miopenTranspose) { - MIOPEN_LOG_W("GPU reference skipped"); - cpu_convolution_backward_data(filter.GetSpatialDimension(), - rin, - weights, - out, - filter.GetConvPads(), - filter.GetConvStrides(), - filter.GetConvDilations(), - filter.GetGroupCount()); + bool gpu_ref_used = gpu_ref_convolution_fwd(out, weights, rinput, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + cpu_convolution_forward(filter.GetSpatialDimension(), + out, + weights, + rinput, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } + } + else + { + bool gpu_ref_used = gpu_ref_convolution_bwd(rinput, weights, out, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + cpu_convolution_backward_data(filter.GetSpatialDimension(), + rinput, + weights, + out, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } + } + return rinput; +} + +template +tensor ref_conv_wrw(const tensor& input, + const tensor& weights, + const tensor& out, + const miopen::ConvolutionDescriptor& filter) +{ + auto rweights = weights; + std::fill(rweights.begin(), rweights.end(), 0); + + if(filter.mode == miopenTranspose) + { + bool gpu_ref_used = gpu_ref_convolution_wrw(out, rweights, input, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + cpu_convolution_backward_weight(filter.GetSpatialDimension(), + out, + rweights, + input, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } + } + else + { + bool gpu_ref_used = gpu_ref_convolution_wrw(input, rweights, out, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + 
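/* Client-side view of the cast-type entry points added in tensor_api.cpp above:
 * a tensor keeps its storage type but is tagged to be consumed as FP8, and the
 * getter falls back to the storage type when no cast was ever set. A minimal
 * sketch against the signatures declared above (error handling omitted):
 *
 *     miopenTensorDescriptor_t xDesc;
 *     miopenCreateTensorDescriptor(&xDesc);
 *     miopenSet4dTensorDescriptor(xDesc, miopenFloat, 16, 64, 28, 28);
 *     miopenSetTensorCastType(xDesc, miopenFloat8); // read x as FP8 in GEMM
 *     miopenDataType_t ct;
 *     miopenGetTensorCastType(xDesc, ct);           // ct == miopenFloat8
 */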
cpu_convolution_backward_weight(filter.GetSpatialDimension(), + input, + rweights, + out, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } } - return rin; + return rweights; } // Mainline convolution tests diff --git a/test/cpu_conv.hpp b/test/cpu_conv.hpp index 78c1176503..560c867c6b 100644 --- a/test/cpu_conv.hpp +++ b/test/cpu_conv.hpp @@ -39,6 +39,7 @@ #include "tensor_holder.hpp" #include #include +#include template static constexpr auto make_array(T x, Ts... xs) @@ -46,6 +47,12 @@ static constexpr auto make_array(T x, Ts... xs) return std::array{{x, xs...}}; } +template +struct PassThru +{ + T operator()(T t) { return t; } +}; + template struct cpu_convolution_acc_type { @@ -66,6 +73,8 @@ struct cpu_convolution_acc_type template & in, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi = {}, + FW fw = {}) { static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); assert(in.desc.GetSize() == ConvDim + 2 and wei.desc.GetSize() == ConvDim + 2 and @@ -162,22 +173,26 @@ void cpu_convolution_forward_impl(const tensor& in, in_id[0] = out_n_id; in_id[1] = in_c_id; std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - acc += - Tacc(in(in_id)) * Tacc(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + Tacc tmp1 = static_cast(fi(in(in_id))); + Tacc tmp2 = + static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); + acc += tmp1 * tmp2; } } }); }); if(vector_len > 1) out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = - acc; + static_cast(acc); else - out(out_n_id, out_k_id, out_spatial_id_pack...) = acc; + out(out_n_id, out_k_id, out_spatial_id_pack...) = static_cast(acc); }); } template & in, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FW fw = {}, + FO fo = {}) { static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); assert(in.desc.GetSize() == ConvDim + 2 and wei.desc.GetSize() == ConvDim + 2 and @@ -255,19 +272,21 @@ void cpu_convolution_backward_data_impl(tensor& in, out_id[0] = in_n_id; out_id[1] = out_k_id; std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); - - acc += Tacc(out(out_id)) * - Tacc(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + Tacc tmp1 = fo(out(out_id)); + Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + acc += tmp1 * tmp2; } }); }); - - in(in_n_id, in_c_id, in_spatial_id_pack...) = acc; + // TODO: Why do we need a no-lint here ? + in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT }); } template & in, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi, + FO fo) { static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); assert(in.desc.GetSize() == ConvDim + 2 and wei.desc.GetSize() == ConvDim + 2 and @@ -303,54 +324,60 @@ void cpu_convolution_backward_weight_impl(const tensor& in, auto par_ford_wei_kc_spatial = miopen::unpacker(miopen::prepender(par_ford, wei_k_len, wei_c_len))(wei_spatial_len); - par_ford_wei_kc_spatial([&](std::size_t wei_k_id, - std::size_t wei_c_id, - auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); + par_ford_wei_kc_spatial( + [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... 
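/* The new functor hooks (fi/fw/fo, defaulting to the identity PassThru) let the
 * CPU reference emulate FP8 quantization without duplicating the convolution
 * loops: every element passes through the hook before it reaches the
 * accumulator. A hypothetical round-trip-cast hook (the exact spelling of the
 * call is an assumption):
 *
 *     struct CastToF8
 *     {
 *         float operator()(float x)
 *         {
 *             return static_cast<float>(static_cast<float8>(x)); // quantize
 *         }
 *     };
 *     cpu_convolution_forward(2, in, wei, out, pads, strides, dilations,
 *                             1, CastToF8{}, CastToF8{}); // 1 == group_count
 */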
wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); - std::size_t group_id = wei_k_id / wei_k_len_per_group; - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + std::size_t group_id = wei_k_id / wei_k_len_per_group; + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - Tacc acc = 0; + Tacc acc = 0; - ford(out_n_len)([&](std::size_t out_n_id) { - auto ford_out_spatial = miopen::unpacker(ford)(out_spatial_len); + ford(out_n_len)([&](std::size_t out_n_id) { + auto ford_out_spatial = miopen::unpacker(ford)(out_spatial_len); - ford_out_spatial([&](auto... out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); + ford_out_spatial([&](auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); - std::array in_spatial_id{}; + std::array in_spatial_id{}; - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = - out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; - } + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = out_spatial_id[i] * strides[i] + + wei_spatial_id[i] * dilations[i] - pads[i]; + } - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or - (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); - } + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or + in_spatial_id[i] >= in_spatial_len[i]); + } - if(!out_of_bound) - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + if(!out_of_bound) + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = fi(in(in_id)); + Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); - acc += Tacc(in(in_id)) * Tacc(out(out_n_id, wei_k_id, out_spatial_id_pack...)); - } + wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); }); - - wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) 
= acc; }); - }); } -template +template , + typename FW = PassThru> void cpu_convolution_forward(std::size_t spatial_dim, const tensor& in, const tensor& wei, @@ -358,30 +385,30 @@ void cpu_convolution_forward(std::size_t spatial_dim, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi = {}, + FW fw = {}) { - using acc_type = typename cpu_convolution_acc_type::type; - switch(spatial_dim) { case 1: { - cpu_convolution_forward_impl<1, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } case 2: { - cpu_convolution_forward_impl<2, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } case 3: { - cpu_convolution_forward_impl<3, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } case 4: { - cpu_convolution_forward_impl<4, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } default: { @@ -390,7 +417,13 @@ void cpu_convolution_forward(std::size_t spatial_dim, } } -template +template , + typename FO = PassThru> void cpu_convolution_backward_data(std::size_t spatial_dim, tensor& in, const tensor& wei, @@ -398,30 +431,30 @@ void cpu_convolution_backward_data(std::size_t spatial_dim, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FW fw = {}, + FO fo = {}) { - using acc_type = typename cpu_convolution_acc_type::type; - switch(spatial_dim) { case 1: { - cpu_convolution_backward_data_impl<1, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } case 2: { - cpu_convolution_backward_data_impl<2, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } case 3: { - cpu_convolution_backward_data_impl<3, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } case 4: { - cpu_convolution_backward_data_impl<4, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } default: { @@ -430,7 +463,13 @@ void cpu_convolution_backward_data(std::size_t spatial_dim, } } -template +template , + typename FO = PassThru> void cpu_convolution_backward_weight(std::size_t spatial_dim, const tensor& in, tensor& wei, @@ -438,30 +477,30 @@ void cpu_convolution_backward_weight(std::size_t spatial_dim, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi = {}, + FO fo = {}) { - using acc_type = typename cpu_convolution_acc_type::type; - switch(spatial_dim) { case 1: { - cpu_convolution_backward_weight_impl<1, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + 
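/* Also note the accumulator change threaded through these dispatchers: the
 * per-dimension impls used to derive acc_type from cpu_convolution_acc_type<T>
 * internally, whereas Tacc is now an explicit template parameter, so an FP8
 * test can keep 8-bit storage while accumulating in a wider type, e.g. (the
 * template-argument order is an assumption):
 *
 *     cpu_convolution_forward<float8, float8, float8, float>(
 *         2, in, wei, out, pads, strides, dilations, 1);
 */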
cpu_convolution_backward_weight_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } case 2: { - cpu_convolution_backward_weight_impl<2, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_weight_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } case 3: { - cpu_convolution_backward_weight_impl<3, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_weight_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } case 4: { - cpu_convolution_backward_weight_impl<4, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_weight_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } default: { diff --git a/test/driver.hpp b/test/driver.hpp index 8ff0df639d..0a8e2d3080 100644 --- a/test/driver.hpp +++ b/test/driver.hpp @@ -279,6 +279,8 @@ struct test_driver case miopenInt32: ss << "--int32 "; break; case miopenFloat: ss << "--float "; break; case miopenDouble: ss << "--double "; break; + case miopenFloat8: ss << "--float8"; break; + case miopenBFloat8: ss << "--bfloat8"; break; } for(auto&& arg : this->arguments) { @@ -306,6 +308,8 @@ struct test_driver case miopenInt32: ret.emplace_back("--int32"); break; case miopenFloat: ret.emplace_back("--float"); break; case miopenDouble: ret.emplace_back("--double"); break; + case miopenFloat8: ret.emplace_back("--float8"); break; + case miopenBFloat8: ret.emplace_back("--bfloat8"); break; } for(auto&& arg : this->arguments) @@ -876,6 +880,21 @@ struct test_driver } } + template + auto verify_eps(V&& v, Ts&&... xs) -> decltype(std::make_pair(v.cpu(xs...), v.gpu(xs...))) + { + return verify_impl( + [&](std::vector& error, auto&& cpu, auto&& gpu) { + CHECK(miopen::range_distance(cpu) == miopen::range_distance(gpu)); + + double threshold = v.epsilon() * tolerance; + error = {miopen::rms_range(cpu, gpu)}; + return error.front() <= threshold; + }, + v, + xs...); + } + template auto verify(V&& v, Ts&&... 
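/* verify_eps, added above, compares CPU and GPU results by RMS error against a
 * verifier-supplied threshold instead of the type-derived tolerance that the
 * plain verify() path uses; for FP8's coarse value grid this is the practical
 * criterion. The check reduces to:
 *
 *     error = {miopen::rms_range(cpu, gpu)};
 *     return error.front() <= v.epsilon() * tolerance;
 */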
xs) -> decltype(std::make_pair(v.cpu(xs...), v.gpu(xs...))) { diff --git a/test/gtest/api_convbiasactiv.cpp b/test/gtest/api_convbiasactiv.cpp index f065730eda..d59d3ae03d 100644 --- a/test/gtest/api_convbiasactiv.cpp +++ b/test/gtest/api_convbiasactiv.cpp @@ -25,6 +25,8 @@ *******************************************************************************/ #include +#define WORKAROUND_ISSUE_2212 1 + #if MIOPEN_BACKEND_HIP #include #include @@ -174,7 +176,10 @@ TEST_P(ConvBiasActivFwdTest, DriveAPI) void GatherCBATestCases(std::vector& cba_test_cases) { - if(!miopen::StartsWith(get_handle().GetDeviceName(), "gfx11")) + const auto dev_name = get_handle().GetDeviceName(); +#if WORKAROUND_ISSUE_2212 + if(!miopen::StartsWith(dev_name, "gfx11") && !miopen::StartsWith(dev_name, "gfx94")) +#endif { cba_test_cases.push_back(CBATestCase{ 16, 128, 16, 16, 128, 3, 3, 0, 0, 1, 1, 1, 1, miopenActivationRELU, miopenConvolution}); diff --git a/test/gtest/cba.hpp b/test/gtest/cba.hpp index 4529f714a4..70091e107c 100644 --- a/test/gtest/cba.hpp +++ b/test/gtest/cba.hpp @@ -38,6 +38,7 @@ #include "conv_common.hpp" #include "conv_test_base.hpp" +#include "conv_tensor_gen.hpp" template struct ConvBiasActivInferTest diff --git a/test/gtest/conv_embed_db.cpp b/test/gtest/conv_embed_db.cpp index ba9889dfd7..b69fde1b5e 100644 --- a/test/gtest/conv_embed_db.cpp +++ b/test/gtest/conv_embed_db.cpp @@ -75,9 +75,11 @@ void Run2dDriver(miopenDataType_t prec) case miopenBFloat16: params = ConfigWithBFloat16::GetParam(); break; case miopenInt8x4: case miopenInt32: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: - FAIL() << "miopenInt8x4, miopenInt32, miopenDouble data type not supported by " - "conv_embed_db test"; + FAIL() << "miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type " + "not supported by conv_embed_db test"; default: params = ConfigWithFloat::GetParam(); } diff --git a/test/gtest/conv_hip_igemm_xdlops.cpp b/test/gtest/conv_hip_igemm_xdlops.cpp index 5fdb842fe0..508624c847 100644 --- a/test/gtest/conv_hip_igemm_xdlops.cpp +++ b/test/gtest/conv_hip_igemm_xdlops.cpp @@ -60,6 +60,8 @@ void Run2dDriver(miopenDataType_t prec) switch(prec) { case miopenInt8: params = ConfigWithInt8::GetParam(); break; + case miopenFloat8: + case miopenBFloat8: case miopenHalf: case miopenBFloat16: case miopenFloat: diff --git a/test/gtest/conv_igemm_mlir.cpp b/test/gtest/conv_igemm_mlir.cpp index cf3d93f07a..d5fc0d426c 100644 --- a/test/gtest/conv_igemm_mlir.cpp +++ b/test/gtest/conv_igemm_mlir.cpp @@ -83,11 +83,12 @@ void Run2dDriver(miopenDataType_t prec) case miopenBFloat16: case miopenInt8x4: case miopenInt32: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: MIOPEN_THROW(miopenStatusBadParm, - "miopenBFloat16, miopenInt8x4, miopenInt32, miopenDouble data " - "type not supported by " - "conv_igemm_mlir test"); + "miopenBFloat16, miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, " + "miopenDouble data type not supported by conv_igemm_mlir test"); default: params = ConfigWithFloat::GetParam(); } diff --git a/test/gtest/conv_igemm_mlir_xdlops.cpp b/test/gtest/conv_igemm_mlir_xdlops.cpp index e490b15219..19913093c0 100644 --- a/test/gtest/conv_igemm_mlir_xdlops.cpp +++ b/test/gtest/conv_igemm_mlir_xdlops.cpp @@ -56,6 +56,8 @@ void Run2dDriver(miopenDataType_t prec) case miopenInt8x4: case miopenInt32: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: MIOPEN_THROW(miopenStatusBadParm, "miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, miopenDouble data " "type 
not supported by " diff --git a/test/gtest/conv_tensor_gen.hpp b/test/gtest/conv_tensor_gen.hpp new file mode 100644 index 0000000000..f578e91973 --- /dev/null +++ b/test/gtest/conv_tensor_gen.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include + +// Copied from conv_driver.hpp + +template +inline T FRAND() +{ + double d = static_cast(rand() / (static_cast(RAND_MAX))); + return static_cast(d); +} + +template +inline T RAN_GEN(T A, T B) +{ + T r = (FRAND() * (B - A)) + A; + return r; +} +template +T RanGenData() +{ + return RAN_GEN(static_cast(0.0f), static_cast(1.0f)); +} + +template <> +float8 RanGenData() +{ + return RAN_GEN(static_cast(-1.0f), static_cast(1.0f)); +} + +template <> +bfloat8 RanGenData() +{ + const auto tmp = RAN_GEN(static_cast(-1.0f), static_cast(1.0f)); + return static_cast(tmp); +} + +template +struct GenData +{ + template + T operator()(Ts...) const + { + return RanGenData(); + } +}; + +template +T RanGenWeights() +{ + return RAN_GEN(static_cast(-0.5), static_cast(0.5)); +} + +// Shift FP16 distribution towards positive numbers, +// otherwise Winograd FP16 validation fails. +template <> +half_float::half RanGenWeights() +{ + return RAN_GEN(static_cast(-1.0 / 3.0), + static_cast(0.5)); +} + +template <> +float8 RanGenWeights() +{ + const auto tmp = + RAN_GEN(0.0, 1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (RAN_GEN(0.0, 1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + +template <> +bfloat8 RanGenWeights() +{ + const auto tmp = + RAN_GEN(0.0, 1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (RAN_GEN(0.0, 1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + +template +struct GenWeights +{ + template + T operator()(Ts...) 
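/* The FP8/BFP8 generators above are deliberately conservative: weights are
 * drawn from the tiny exact set { 0, +2*eps, -2*eps } so that long
 * accumulations stay representable in a 2-3 bit mantissa, and data is drawn
 * from [-1, 1]. Restated compactly with explicit template syntax (a sketch,
 * assuming std::numeric_limits is specialized for float8):
 *
 *     template <>
 *     float8 RanGenWeights()
 *     {
 *         const float mag  = RAN_GEN<float>(0.0f, 1.0f) > 0.5f ? 0.0f : 1.0f;
 *         const float sign = RAN_GEN<float>(0.0f, 1.0f) > 0.5f ? -1.0f : 1.0f;
 *         return static_cast<float8>(
 *             static_cast<float>(std::numeric_limits<float8>::epsilon()) *
 *             2.0f * sign * mag);
 *     }
 */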
const + { + return RanGenWeights(); + } +}; diff --git a/test/gtest/conv_test_base.hpp b/test/gtest/conv_test_base.hpp index a8413797b0..2f91d784bf 100644 --- a/test/gtest/conv_test_base.hpp +++ b/test/gtest/conv_test_base.hpp @@ -29,6 +29,7 @@ #include #include "conv_common.hpp" +#include "conv_tensor_gen.hpp" template miopenDataType_t GetDataType(); @@ -45,6 +46,18 @@ miopenDataType_t GetDataType() return miopenHalf; } +template <> +miopenDataType_t GetDataType>() +{ + return miopenFloat8; +} + +template <> +miopenDataType_t GetDataType>() +{ + return miopenBFloat8; +} + struct ConvTestCase { size_t N; @@ -67,7 +80,7 @@ struct ConvTestCase << " k: " << tc.k << " y:" << tc.y << " x:" << tc.x << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " )"; + << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode << " )"; } const std::vector GetInput() { return {N, C, H, W}; } const std::vector GetWeights() { return {k, C, y, x}; } @@ -123,7 +136,7 @@ std::vector ConvTestConfigs() {64, 1024, 14, 14, 1024, 3, 3, 1, 1, 1, 1, 1, 1, miopenConvolution}}; } -template +template struct ConvFwdSolverTestBase { protected: @@ -131,8 +144,8 @@ struct ConvFwdSolverTestBase { input = tensor{miopen_type{}, tensor_layout, conv_config.GetInput()}; weights = tensor{miopen_type{}, tensor_layout, conv_config.GetWeights()}; - input.generate(tensor_elem_gen_integer{3}); - weights.generate(tensor_elem_gen_integer{3}); + input.generate(GenData{}); + weights.generate(GenWeights{}); conv_desc = conv_config.GetConv(); @@ -140,7 +153,7 @@ struct ConvFwdSolverTestBase conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType()); output = tensor{miopen_type{}, tensor_layout, output_desc.GetLengths()}; - std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); auto&& handle = get_handle(); in_dev = handle.Write(input.data); @@ -152,7 +165,22 @@ struct ConvFwdSolverTestBase { miopen::TensorDescriptor output_desc = conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType()); - ref_out = ref_conv_fwd(input, weights, output, conv_desc); + ref_out = tensor{miopen_type{}, output.desc.GetLayout_t(), output_desc.GetLengths()}; + if(use_cpu_ref) + { + cpu_convolution_forward(conv_desc.GetSpatialDimension(), + input, + weights, + ref_out, + conv_desc.GetConvPads(), + conv_desc.GetConvStrides(), + conv_desc.GetConvDilations(), + conv_desc.GetGroupCount()); + } + else + { + ref_out = ref_conv_fwd(input, weights, ref_out, conv_desc); + } } void ThresholdChecks() diff --git a/test/gtest/conv_trans.cpp b/test/gtest/conv_trans.cpp index 4537ce8cbb..20015336a0 100644 --- a/test/gtest/conv_trans.cpp +++ b/test/gtest/conv_trans.cpp @@ -51,6 +51,8 @@ void Run2dDriver(miopenDataType_t prec) { case miopenFloat: params = ConfigWithFloat::GetParam(); break; case miopenHalf: + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenBFloat16: case miopenInt8x4: diff --git a/test/gtest/solver.hpp b/test/gtest/get_solver.hpp similarity index 59% rename from test/gtest/solver.hpp rename to test/gtest/get_solver.hpp index 60147352cc..9a995c8916 100644 --- a/test/gtest/solver.hpp +++ b/test/gtest/get_solver.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2022 Advanced Micro Devices, Inc. + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,36 +26,31 @@ #pragma once #include -#include "cpu_conv.hpp" +#include "conv_common.hpp" #include "get_handle.hpp" #include "tensor_util.hpp" -#include #include -#include "conv_test_base.hpp" +#include +#include -template -struct ConvFwdSolverTest - : public ::testing::TestWithParam< - std::tuple>, - ConvFwdSolverTestBase +template +auto GetSolutionImpl(miopen::rank<1>, Solver s, const Context& ctx, const Problem& problem) + -> decltype(s.GetSolution(ctx, problem, s.GetDefaultPerformanceConfig(ctx, problem))) { -public: - void SetUp() override - { - test_skipped = false; - std::tie(algo, conv_config, tensor_layout) = GetParam(); - ConvFwdSolverTestBase::SetUpImpl(conv_config, tensor_layout); - } - void TearDown() override - { - if(test_skipped) - return; - ConvFwdSolverTestBase::TearDownConv(); - ConvFwdSolverTestBase::ThresholdChecks(); - } - ConvTestCase conv_config; - miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; - bool test_skipped = false; - miopenTensorLayout_t tensor_layout; -}; + return s.GetSolution(ctx, problem, s.GetDefaultPerformanceConfig(ctx, problem)); +} + +template +auto GetSolutionImpl(miopen::rank<0>, Solver s, const Context& ctx, const Problem& problem) + -> decltype(s.GetSolution(ctx, problem)) +{ + return s.GetSolution(ctx, problem); +} + +template +miopen::solver::ConvSolution GetSolution(Solver s, const Context& ctx, const Problem& problem) +{ + auto solution = GetSolutionImpl(miopen::rank<1>{}, s, ctx, problem); + return solution; +} diff --git a/test/gtest/solver_bwd.hpp b/test/gtest/solver_bwd.hpp new file mode 100644 index 0000000000..728c38fcda --- /dev/null +++ b/test/gtest/solver_bwd.hpp @@ -0,0 +1,185 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
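+// Illustrative sketch (not part of this header): how the rank-based dispatch in
+// get_solver.hpp above selects between tunable and non-tunable solvers. The
+// solver types and GetSolution signatures below are hypothetical stand-ins;
+// miopen::rank is assumed to be the usual inheritance-chain tag shown here.
+//
+//     template <int N> struct rank : rank<N - 1> {};
+//     template <>      struct rank<0> {};
+//
+//     struct Tunable    { int GetSolution(int ctx, int cfg) const { return ctx + cfg; }
+//                         int GetDefaultPerformanceConfig(int ctx) const { return 2 * ctx; } };
+//     struct NonTunable { int GetSolution(int ctx) const { return ctx; } };
+//
+//     // Preferred overload: viable only if s.GetSolution(ctx, cfg) compiles.
+//     template <class S>
+//     auto Get(rank<1>, S s, int ctx)
+//         -> decltype(s.GetSolution(ctx, s.GetDefaultPerformanceConfig(ctx)))
+//     { return s.GetSolution(ctx, s.GetDefaultPerformanceConfig(ctx)); }
+//
+//     // Fallback: chosen when the rank<1> overload SFINAEs out.
+//     template <class S>
+//     auto Get(rank<0>, S s, int ctx) -> decltype(s.GetSolution(ctx))
+//     { return s.GetSolution(ctx); }
+//
+// Get(rank<1>{}, Tunable{}, 3) resolves to the two-argument overload (returns 9);
+// Get(rank<1>{}, NonTunable{}, 3) drops out of the rank<1> overload and falls
+// back to rank<0> via the derived-to-base conversion (returns 3).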
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include "conv_common.hpp"
+#include "get_handle.hpp"
+#include "tensor_util.hpp"
+#include
+#include
+
+#include
+#include
+#include
+
+#include "conv_test_base.hpp"
+#include "conv_tensor_gen.hpp"
+
+#include "get_solver.hpp"
+
+template
+struct ConvBwdSolverTest
+    : public ::testing::TestWithParam>
+{
+
+    template
+    void SolverBwd(Solver solv)
+    {
+        auto&& handle = get_handle();
+
+        const auto tensors = miopen::ConvBwdTensors{
+            output.desc, out_dev.get(), weights.desc, wei_dev.get(), input.desc, in_dev.get()};
+        const auto conv_problem =
+            miopen::conv::ProblemDescription(input.desc,
+                                             weights.desc,
+                                             output.desc,
+                                             conv_desc,
+                                             miopen::conv::Direction::BackwardData);
+        const auto problem = miopen::ProblemDescription{conv_problem};
+        const miopen::ConvolutionContext ctx = [&] {
+            auto tmp = miopen::ConvolutionContext{&handle};
+            problem.conv_problem.SetupFloats(tmp);
+            return tmp;
+        }();
+
+        // const auto network_config = problem.BuildConfKey();
+
+        if(!solv.IsApplicable(ctx, problem))
+        {
+            test_skipped = true;
+            GTEST_SKIP() << solv.SolverDbId() << ": Not Applicable for this problem" << conv_config;
+        }
+
+        if(solv.MayNeedWorkspace())
+        {
+            const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem);
+            workspace_dev          = handle.Create(cur_sol_ws);
+            workspace_size         = cur_sol_ws;
+        }
+
+        const auto invoke_params =
+            miopen::conv::DataInvokeParams{tensors,
+                                           workspace_dev.get(),
+                                           workspace_size,
+                                           conv_desc.attribute.gfx90aFp16alt.GetBwd()};
+
+        auto sol = GetSolution(solv, ctx, problem);
+        ASSERT_TRUE(sol.Succeeded());
+        ASSERT_TRUE(sol.invoker_factory);
+        const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
+        (invoker)(handle, invoke_params);
+        handle.Finish();
+    }
+
+protected:
+    void SetUp() override
+    {
+        test_skipped = false;
+        std::tie(algo, conv_config) = GetParam();
+        input   = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W};
+        weights = tensor{conv_config.k, conv_config.C, conv_config.y, conv_config.x};
+        weights.generate(GenWeights{});
+
+        conv_desc = conv_config.GetConv();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+
+        output = tensor{output_desc.GetLengths()};
+        output.generate(GenData{});
+
+        std::fill(input.begin(), input.end(), std::numeric_limits::quiet_NaN());
+
+        auto&& handle = get_handle();
+        in_dev        = handle.Write(input.data);
+        wei_dev       = handle.Write(weights.data);
+        out_dev       = handle.Write(output.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        // ref_in holds dx, so it must have the shape of the input tensor:
+        // rms_range below compares it against the data read back into `input`.
+        ref_in = tensor{input.desc.GetLengths()};
+        if(use_cpu_ref)
+        {
+            cpu_convolution_backward_data(conv_desc.GetSpatialDimension(),
+                                          ref_in,
+                                          weights,
+                                          output,
+                                          conv_desc.GetConvPads(),
+                                          conv_desc.GetConvStrides(),
+                                          conv_desc.GetConvDilations(),
+                                          conv_desc.GetGroupCount());
+        }
+        else
+        {
+            ref_in = ref_conv_bwd(ref_in, weights, output, conv_desc);
+        }
+        input.data = handle.Read(in_dev, input.data.size());
+#if defined(__clang__) || defined(__GNUG__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        const auto zero_chk = [](T x) { return static_cast(x) == static_cast(0.0); };
+#if defined(__clang__) || defined(__GNUG__)
+#pragma GCC diagnostic pop
+#endif
+
+
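+        // The exact x == 0.0 comparison above is deliberate: zero_chk flags
+        // values that are bit-for-bit zero, so the all_of checks below detect
+        // the "kernel never wrote its output" failure mode rather than numeric
+        // drift. The push/pop pair keeps the -Wfloat-equal suppression scoped
+        // to this single intentional compare.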
EXPECT_FALSE(std::all_of(ref_in.begin(), ref_in.end(), [](float x) { return x == 0.0f; })) + << "Cpu data is all zeros"; + EXPECT_FALSE(std::all_of(input.begin(), input.end(), zero_chk)) << "Gpu data is all zeros"; + EXPECT_TRUE(miopen::range_distance(ref_in) == miopen::range_distance(input)); + + const double tolerance = 80; + double threshold = static_cast(std::numeric_limits::epsilon()) * tolerance; + auto error = miopen::rms_range(ref_in, input); + + EXPECT_FALSE(miopen::find_idx(ref_in, miopen::not_finite) >= 0) + << "Non finite number found in the CPU data"; + + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + } + ConvTestCase conv_config; + miopen::ConvolutionDescriptor conv_desc; + tensor input; + tensor weights; + tensor output; + tensor ref_in; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr wei_dev; + miopen::Allocator::ManageDataPtr out_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + size_t workspace_size; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; + bool test_skipped = false; +}; diff --git a/test/gtest/solver_bwd_f8.cpp b/test/gtest/solver_bwd_f8.cpp new file mode 100644 index 0000000000..7c286cbfc2 --- /dev/null +++ b/test/gtest/solver_bwd_f8.cpp @@ -0,0 +1,68 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "solver_bwd.hpp" + +struct ConvBwdFp8 : ConvBwdSolverTest +{ +}; + +struct ConvBwdFp8Naive : ConvBwdSolverTest +{ +}; + +TEST_P(ConvBwdFp8, DISABLED_GemmBwd1x1_stride2) +{ + miopen::solver::GemmBwd1x1_stride2 solv{}; + SolverBwd(solv); +} + +TEST_P(ConvBwdFp8, DISABLED_GemmBwd1x1_stride1) +{ + miopen::solver::GemmBwd1x1_stride1 solv{}; + SolverBwd(solv); +} + +TEST_P(ConvBwdFp8, DISABLED_GemmBwdRest) +{ + miopen::solver::GemmBwdRest solv{}; + SolverBwd(solv); +} + +TEST_P(ConvBwdFp8Naive, DISABLED_Bwd) +{ + miopen::solver::ConvDirectNaiveConvBwd solv{}; + SolverBwd(solv); +} +INSTANTIATE_TEST_SUITE_P(ConvBwdTest, + ConvBwdFp8, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(GetNetwork1()))); +// Since NaiveConv is verified against the CPU, we are conservative in the number and type +// of test cases we instantiate +INSTANTIATE_TEST_SUITE_P(ConvBwdTest, + ConvBwdFp8Naive, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()))); diff --git a/test/gtest/solver_convasm3x3u.cpp b/test/gtest/solver_convasm3x3u.cpp index 3f174039f2..91133c68ba 100644 --- a/test/gtest/solver_convasm3x3u.cpp +++ b/test/gtest/solver_convasm3x3u.cpp @@ -23,65 +23,16 @@ * SOFTWARE. * *******************************************************************************/ -#include "solver.hpp" +#include "solver_fwd.hpp" struct ConvFwdSolverTestFloat : ConvFwdSolverTest { }; -template -void SolverFwd(const miopen::TensorDescriptor& inputDesc, - ConstData_t input, - const miopen::TensorDescriptor& wDesc, - ConstData_t weight, - const miopen::TensorDescriptor& outputDesc, - Data_t output, - const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, - bool& test_skipped) -{ - auto&& handle = get_handle(); - - Solver solv{}; - - const auto tensors = - miopen::ConvFwdTensors{inputDesc, input, wDesc, weight, outputDesc, output}; - - const auto problem = miopen::conv::ProblemDescription{ - inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; - - ctx.SetStream(&handle); - - if(!solv.IsApplicable(ctx, problem)) - { - test_skipped = true; - GTEST_SKIP() << solv.SolverDbId() << "ConvAsm3x3U Not Applicable for this problem" - << conv_config; - } - const auto invoke_params = miopen::conv::DataInvokeParams{ - tensors, nullptr, 0, convDesc.attribute.gfx90aFp16alt.GetFwd()}; - - ASSERT_TRUE(solv.IsApplicable(ctx, problem)); - auto sol = solv.GetSolution(ctx, problem, solv.GetDefaultPerformanceConfig(ctx, problem)); - ASSERT_TRUE(sol.Succeeded()); - ASSERT_TRUE(sol.invoker_factory); - const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); - (invoker)(handle, invoke_params); - handle.Finish(); -} - TEST_P(ConvFwdSolverTestFloat, ConvASM3x3UFwd) { - SolverFwd(input.desc, - in_dev.get(), - weights.desc, - wei_dev.get(), - output.desc, - out_dev.get(), - conv_desc, - conv_config, - test_skipped); + miopen::solver::ConvAsm3x3U solv{}; + SolverFwd(solv); } INSTANTIATE_TEST_SUITE_P(ConvFwdTest, diff --git a/test/gtest/solver_f8.hpp b/test/gtest/solver_f8.hpp new file mode 100644 index 0000000000..34a10a4cfe --- /dev/null +++ b/test/gtest/solver_f8.hpp @@ -0,0 +1,263 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include "cpu_conv.hpp"
+#include "get_handle.hpp"
+#include "tensor_util.hpp"
+#include
+#include
+#include "conv_common.hpp"
+#include
+#include "verify.hpp"
+using float8  = miopen_f8::hip_f8;
+using bfloat8 = miopen_f8::hip_f8;
+
+template
+miopenDataType_t GetDataType();
+
+template <>
+miopenDataType_t GetDataType()
+{
+    return miopenFloat8;
+}
+
+template <>
+miopenDataType_t GetDataType()
+{
+    return miopenBFloat8;
+}
+
+template <>
+miopenDataType_t GetDataType()
+{
+    return miopenFloat;
+}
+
+struct ConvTestCase
+{
+    size_t N;
+    size_t C;
+    size_t H;
+    size_t W;
+    size_t k;
+    size_t y;
+    size_t x;
+    size_t pad_x;
+    size_t pad_y;
+    size_t stride_x;
+    size_t stride_y;
+    size_t dilation_x;
+    size_t dilation_y;
+    miopenConvolutionMode_t conv_mode;
+    friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc)
+    {
+        return os << "N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W
+                  << " k: " << tc.k << " y:" << tc.y << " x:" << tc.x << " pad_y:" << tc.pad_y
+                  << " pad_x:" << tc.pad_x << " stride_y:" << tc.stride_y
+                  << " stride_x:" << tc.stride_x << " dilation_y:" << tc.dilation_y
+                  << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode;
+    }
+
+    miopen::ConvolutionDescriptor GetConv()
+    {
+        return miopen::ConvolutionDescriptor{
+            {static_cast(pad_y), static_cast(pad_x)},
+            {static_cast(stride_y), static_cast(stride_x)},
+            {static_cast(dilation_y), static_cast(dilation_x)}};
+    }
+};
+
+std::vector ConvTestConfigs()
+{ // n  c   h   w   k   y  x pad_x pad_y stri_x stri_y dia_x dia_y
+    return {// New tests begin
+            {1, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {2, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {4, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {8, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {16, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {16, 128, 16, 16, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 28, 28, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 64, 64, 64, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 128, 64, 64, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 128, 128, 64, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 128, 128, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 256, 128, 128, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 256, 256, 128, 128, 1, 1, 0, 0,
1, 1, 1, 1, miopenConvolution}, + {64, 256, 256, 256, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 256, 256, 256, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 256, 256, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 512, 256, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 512, 512, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 1024, 512, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 1024, 1024, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 512, 1024, 1024, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 512, 1024, 1024, 512, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 1024, 1024, 1024, 512, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 1024, 1024, 1024, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {256, 1024, 1024, 1024, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {1024, 1024, 1024, 1024, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {1024, 2048, 2048, 2048, 2048, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + // New tests end + {16, 128, 16, 16, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 128, 28, 28, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 256, 14, 14, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 512, 7, 7, 512, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 1024, 14, 14, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}}; +} + +template +struct Fp8Cast +{ + uint64_t seed = 1234; + bool is_stoch = true; + V operator()(U x) + { + if(is_stoch) + { + auto tmp = + float8(static_cast(x), miopen_f8::hip_f8_rounding_mode::stochastic, seed); + return static_cast(tmp); + } + else + { + auto tmp = float8(static_cast(x)); + return static_cast(tmp); + } + } +}; + +template +struct ConvFwdSolverTest + : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(algo, conv_config) = GetParam(); + input = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W}; + weights = tensor{conv_config.k, conv_config.C, conv_config.x, conv_config.y}; + + auto gen_fp8_value = [=](auto...) { + const auto tmp = float8(scalar_gen_random_float{-0.5, 0.5}()); + return tmp; + }; + + input.generate(gen_fp8_value); + weights.generate(gen_fp8_value); + + conv_desc = conv_config.GetConv(); + + miopen::TensorDescriptor output_desc = conv_desc.GetForwardOutputTensor( + input.desc, weights.desc, GetDataType()); // Tgpu Datatype? + + output = tensor{output_desc.GetLengths()}; // half_float::half instead? + + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + auto&& handle = get_handle(); + in_dev = handle.Write(input.data); + wei_dev = handle.Write(weights.data); + out_dev = handle.Write(output.data); + } + void TearDown() override + { + if(test_skipped) + return; + + auto&& handle = get_handle(); + + miopen::TensorDescriptor output_desc = conv_desc.GetForwardOutputTensor( + input.desc, weights.desc, GetDataType()); // miopenFloat or GetDataType() ? 
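+        // The CPU reference computed below mirrors the fp8 quantization of the
+        // data: Fp8Cast pushes every input and weight element through hip_f8
+        // before the multiply-accumulate, so the reference works on fp8-rounded
+        // operands comparable to what the GPU kernel consumes. is_stoch selects
+        // stochastic rounding over round-to-nearest; the fixed seed keeps the
+        // reference deterministic from run to run.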
+ ref_out = tensor{output_desc.GetLengths()}; + + using FI = Fp8Cast; + using FW = Fp8Cast; + FI in_func = {0, true}; + FW weight_func = {0, true}; + + cpu_convolution_forward( + conv_desc.GetSpatialDimension(), + input, + weights, + ref_out, + conv_desc.GetConvPads(), + conv_desc.GetConvStrides(), + conv_desc.GetConvDilations(), + conv_desc.GetGroupCount(), + in_func, + weight_func); + + output.data = handle.Read(out_dev, output.data.size()); + EXPECT_FALSE(miopen::f8_range_zero(ref_out)) << "Cpu data is all zeros"; + EXPECT_FALSE(miopen::f8_range_zero(output)) << "Gpu data is all zeros"; + EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); + + const float tolerance = 80.0; + auto threshold = (static_cast(std::numeric_limits::epsilon()) * + static_cast(tolerance)); + + auto error = miopen::rms_range(ref_out, output); + + bool refOutNan = false; + for(auto refOutElem : ref_out.data) + { + if(refOutElem.is_nan()) + { + refOutNan = true; + break; + } + } + + bool outputNan = false; + for(auto outputElem : output.data) + { + if(outputElem.is_nan()) + { + outputNan = true; + break; + } + } + + EXPECT_FALSE(refOutNan) << "NAN found in CPU data"; + EXPECT_FALSE(outputNan) << "NAN found in GPU data"; + + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + } + ConvTestCase conv_config; + miopen::ConvolutionDescriptor conv_desc; + tensor input; + tensor weights; + tensor output; // Or T? + tensor ref_out; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr wei_dev; + miopen::Allocator::ManageDataPtr out_dev; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoGEMM; + bool test_skipped = false; +}; diff --git a/test/gtest/solver_fwd.hpp b/test/gtest/solver_fwd.hpp new file mode 100644 index 0000000000..ac30ad15ca --- /dev/null +++ b/test/gtest/solver_fwd.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include "conv_common.hpp" +#include "get_handle.hpp" +#include "tensor_util.hpp" +#include +#include + +#include +#include +#include + +#include "conv_test_base.hpp" +#include "get_solver.hpp" + +template +struct ConvFwdSolverTest + : public ::testing::TestWithParam< + std::tuple>, + ConvFwdSolverTestBase +{ + template + void SolverFwd(Solver solv) + { + auto&& handle = get_handle(); + + const auto tensors = miopen::ConvFwdTensors{this->input.desc, + this->in_dev.get(), + this->weights.desc, + this->wei_dev.get(), + this->output.desc, + this->out_dev.get()}; + const auto problem = miopen::ProblemDescription( + miopen::conv::ProblemDescription{this->input.desc, + this->weights.desc, + this->output.desc, + this->conv_desc, + miopen::conv::Direction::Forward}); + const miopen::ConvolutionContext ctx = [&] { + auto tmp = miopen::ConvolutionContext{&handle}; + problem.conv_problem.SetupFloats(tmp); + return tmp; + }(); + + // const auto network_config = problem.BuildConfKey(); + + if(!solv.IsApplicable(ctx, problem)) + { + test_skipped = true; + GTEST_SKIP() << solv.SolverDbId() << ": Not Applicable for this problem" << conv_config; + } + if(solv.MayNeedWorkspace()) + { + const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem); + workspace_dev = handle.Create(cur_sol_ws); + workspace_size = cur_sol_ws; + } + + const auto invoke_params = + miopen::conv::DataInvokeParams{tensors, + workspace_dev.get(), + workspace_size, + this->conv_desc.attribute.gfx90aFp16alt.GetFwd()}; + + // auto sol = solv.GetSolution(ctx, problem); + // This is complicated due to the split between tunable and non-tunable solvers + // since the signature for solver.GetSolution needs a consutructed tuning params + // in the tunable case and not otherwise + const auto sol = GetSolution(solv, ctx, problem); + ASSERT_TRUE(sol.Succeeded()); + ASSERT_TRUE(sol.invoker_factory); + const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); + (invoker)(handle, invoke_params); + handle.Finish(); + } + +protected: + void SetUp() override + { + test_skipped = false; + std::tie(algo, conv_config, tensor_layout) = GetParam(); + this->SetUpImpl(conv_config, tensor_layout); + } + + void TearDown() override + { + if(test_skipped) + return; + this->TearDownConv(); + this->ThresholdChecks(); + } + + ConvTestCase conv_config; + miopen::Allocator::ManageDataPtr workspace_dev; + size_t workspace_size; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; + bool test_skipped = false; + miopenTensorLayout_t tensor_layout; +}; diff --git a/test/gtest/solver_fwd_f8.cpp b/test/gtest/solver_fwd_f8.cpp new file mode 100644 index 0000000000..36f0ec67cd --- /dev/null +++ b/test/gtest/solver_fwd_f8.cpp @@ -0,0 +1,70 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "solver_fwd.hpp" + +struct ConvFwdFp8 : ConvFwdSolverTest +{ +}; + +struct ConvFwdFp8Naive : ConvFwdSolverTest +{ +}; + +TEST_P(ConvFwdFp8, DISABLED_GemmFwdRest) +{ + miopen::solver::GemmFwdRest solv{}; + SolverFwd(solv); +} + +TEST_P(ConvFwdFp8, DISABLED_GemmFwd1x1_0_2) +{ + miopen::solver::GemmFwd1x1_0_2 solv{}; + SolverFwd(solv); +} + +TEST_P(ConvFwdFp8, DISABLED_Gemm1x1x0x1) +{ + miopen::solver::GemmFwd1x1_0_1 solv{}; + SolverFwd(solv); +} + +TEST_P(ConvFwdFp8Naive, DISABLED_Fwd) +{ + miopen::solver::ConvDirectNaiveConvFwd solv{}; + SolverFwd(solv); +} +INSTANTIATE_TEST_SUITE_P(ConvFwdTest, + ConvFwdFp8, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()), + testing::Values(miopenTensorNCHW))); +// Since NaiveConv is verified against the CPU, we are conservative in the number and type +// of test cases we instantiate +INSTANTIATE_TEST_SUITE_P(ConvFwdTest, + ConvFwdFp8Naive, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()), + testing::Values(miopenTensorNCHW))); diff --git a/test/gtest/solver_wrw.hpp b/test/gtest/solver_wrw.hpp new file mode 100644 index 0000000000..51f078fd21 --- /dev/null +++ b/test/gtest/solver_wrw.hpp @@ -0,0 +1,186 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include "conv_common.hpp"
+#include "get_handle.hpp"
+#include "tensor_util.hpp"
+#include
+#include
+
+#include
+#include
+#include
+
+#include "conv_test_base.hpp"
+#include "conv_tensor_gen.hpp"
+
+#include "get_solver.hpp"
+
+template
+struct ConvWrwSolverTest
+    : public ::testing::TestWithParam>
+{
+
+    template
+    void SolverWrw(Solver solv)
+    {
+        auto&& handle = get_handle();
+
+        const auto tensors = miopen::ConvWrwTensors{
+            output.desc, out_dev.get(), input.desc, in_dev.get(), weights.desc, wei_dev.get()};
+        const auto problem = miopen::ProblemDescription(
+            miopen::conv::ProblemDescription{output.desc,
+                                             weights.desc,
+                                             input.desc,
+                                             conv_desc,
+                                             miopen::conv::Direction::BackwardWeights});
+        const miopen::ConvolutionContext ctx = [&] {
+            auto tmp = miopen::ConvolutionContext{&handle};
+            problem.conv_problem.SetupFloats(tmp);
+            return tmp;
+        }();
+
+        // const auto network_config = problem.BuildConfKey();
+
+        if(!solv.IsApplicable(ctx, problem))
+        {
+            test_skipped = true;
+            GTEST_SKIP() << solv.SolverDbId() << ": Not Applicable for this problem" << conv_config;
+        }
+
+        if(solv.MayNeedWorkspace())
+        {
+            const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem);
+            workspace_dev          = handle.Create(cur_sol_ws);
+            workspace_size         = cur_sol_ws;
+        }
+
+        const auto invoke_params =
+            miopen::conv::WrWInvokeParams{tensors,
+                                          workspace_dev.get(),
+                                          workspace_size,
+                                          conv_desc.attribute.gfx90aFp16alt.GetWrW()};
+
+        auto sol = GetSolution(solv, ctx, problem);
+        ASSERT_TRUE(sol.Succeeded());
+        ASSERT_TRUE(sol.invoker_factory);
+        const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
+        (invoker)(handle, invoke_params);
+        handle.Finish();
+    }
+
+protected:
+    void SetUp() override
+    {
+        test_skipped = false;
+        std::tie(algo, conv_config) = GetParam();
+        input   = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W};
+        weights = tensor{conv_config.k, conv_config.C, conv_config.y, conv_config.x};
+        input.generate(GenData{});
+
+        conv_desc = conv_config.GetConv();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+
+        output = tensor{output_desc.GetLengths()};
+        output.generate(GenData{});
+
+        std::fill(weights.begin(), weights.end(), std::numeric_limits::quiet_NaN());
+
+        auto&& handle = get_handle();
+        in_dev        = handle.Write(input.data);
+        wei_dev       = handle.Write(weights.data);
+        out_dev       = handle.Write(output.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        // ref_weights holds dw, so it must have the shape of the weights tensor:
+        // rms_range below compares it against the data read back into `weights`.
+        ref_weights = tensor{weights.desc.GetLengths()};
+        if(use_cpu_ref)
+        {
+            cpu_convolution_backward_weight(conv_desc.GetSpatialDimension(),
+                                            input,
+                                            ref_weights,
+                                            output,
+                                            conv_desc.GetConvPads(),
+                                            conv_desc.GetConvStrides(),
+                                            conv_desc.GetConvDilations(),
+                                            conv_desc.GetGroupCount());
+        }
+        else
+        {
+            ref_weights = ref_conv_wrw(input, ref_weights, output, conv_desc);
+        }
+        weights.data = handle.Read(wei_dev, weights.data.size());
+#if defined(__clang__) || defined(__GNUG__)
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + const auto zero_chk = [](T x) { return static_cast(x) == static_cast(0.0); }; +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#endif + + EXPECT_FALSE(std::all_of(ref_weights.begin(), ref_weights.end(), [](float x) { + return x == 0.0f; + })) << "Cpu data is all zeros"; + EXPECT_FALSE(std::all_of(weights.begin(), weights.end(), zero_chk)) + << "Gpu data is all zeros"; + EXPECT_TRUE(miopen::range_distance(ref_weights) == miopen::range_distance(weights)); + + const double tolerance = 80; + double threshold = static_cast(std::numeric_limits::epsilon()) * tolerance; + auto error = miopen::rms_range(ref_weights, weights); + + EXPECT_FALSE(miopen::find_idx(ref_weights, miopen::not_finite) >= 0) + << "Non finite number found in the CPU data"; + + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + } + ConvTestCase conv_config; + miopen::ConvolutionDescriptor conv_desc; + tensor input; + tensor weights; + tensor output; + tensor ref_weights; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr wei_dev; + miopen::Allocator::ManageDataPtr out_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + size_t workspace_size; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; + bool test_skipped = false; +}; diff --git a/test/gtest/solver_wrw_f8.cpp b/test/gtest/solver_wrw_f8.cpp new file mode 100644 index 0000000000..76c608b622 --- /dev/null +++ b/test/gtest/solver_wrw_f8.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "solver_wrw.hpp" +struct ConvWrwFp8Naive : ConvWrwSolverTest +{ +}; + +TEST_P(ConvWrwFp8Naive, DISABLED_Wrw) +{ + miopen::solver::ConvDirectNaiveConvWrw solv{}; + SolverWrw(solv); +} +// Since NaiveConv is verified against the CPU, we are conservative in the number and type +// of test cases we instantiate +INSTANTIATE_TEST_SUITE_P(ConvWrwTest, + ConvWrwFp8Naive, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()))); diff --git a/test/gtest/tensor_api.cpp b/test/gtest/tensor_api.cpp index dddd10d5d4..eb38a78fb1 100644 --- a/test/gtest/tensor_api.cpp +++ b/test/gtest/tensor_api.cpp @@ -285,7 +285,7 @@ void RunWrongTestConfigs(const TestConfig& valid_config, { #if USE_OUT_OF_RANGE_ENUM const auto wrong_datatypes = {static_cast(miopenHalf - 1), - static_cast(miopenDouble + 1)}; + static_cast(miopenBFloat8 + 1)}; const auto wrong_layouts = {static_cast(miopenTensorNCHW - 1), static_cast(miopenTensorNDHWC + 1)}; #endif diff --git a/test/perf_models/resnet50_v1.5.sh b/test/perf_models/resnet50_v1.5.sh old mode 100755 new mode 100644 index 7ea94de7db..9d3055ab0d --- a/test/perf_models/resnet50_v1.5.sh +++ b/test/perf_models/resnet50_v1.5.sh @@ -89,4 +89,4 @@ echo resnet50_v1.5.sh CMD:${cmd} BS:${bs} CAST:${incast},${weicast} ./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 1 --pad_w 1 --pad_d 0 --conv_stride_h 1 --conv_stride_w 1 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 28 --in_w 28 --fil_d 1 --fil_h 3 --fil_w 3 --in_channels 128 --out_channels 128 --forw 4 ${wrwcast} -V ${verif} -t 1 ./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 1 --conv_stride_w 1 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 28 --in_w 28 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 512 --out_channels 128 --forw 2 ${bwdcast} -V ${verif} -t 1 ./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 1 --conv_stride_w 1 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 28 --in_w 28 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 512 --out_channels 128 --forw 4 ${wrwcast} -V ${verif} -t 1 -./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 2 --conv_stride_w 2 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 56 --in_w 56 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 256 --out_channels 512 --forw 2 ${bwdcast} -V ${verif} -t 1 \ No newline at end of file +./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 2 --conv_stride_w 2 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 56 --in_w 56 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 256 
--out_channels 512 --forw 2 ${bwdcast} -V ${verif} -t 1 diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp index 70a844f17b..0b05a1e5e8 100644 --- a/test/tensor_holder.hpp +++ b/test/tensor_holder.hpp @@ -42,6 +42,12 @@ #else #include #endif +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +using float8 = miopen_f8::hip_f8; +using bfloat8 = miopen_f8::hip_f8; + #include #include @@ -110,6 +116,16 @@ struct miopen_type : std::integral_constant { }; +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + template struct tensor { @@ -403,6 +419,14 @@ tensor make_tensor(std::initializer_list dims, G g) return tensor{miopen::TensorDescriptor{miopen_type{}, dims}}.generate(g); } +// This is needed since there is no TensorDescriptor(miopenDataType_t t, const size_t* plens, int +// size) constructor +template +tensor make_tensor(const std::vector& dims) +{ + return tensor{miopen::TensorDescriptor{miopen_type{}, dims}}; +}; + template tensor make_tensor(const std::vector& dims) { diff --git a/test/verify.hpp b/test/verify.hpp index 8bf8b80684..984237a48c 100644 --- a/test/verify.hpp +++ b/test/verify.hpp @@ -33,6 +33,11 @@ #include #include #include +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +#include "tensor_holder.hpp" namespace miopen { @@ -105,7 +110,7 @@ struct square_diff_fn template double operator()(T x, U y) const { - return (x - y) * (x - y); + return static_cast((x - y) * (x - y)); } }; static constexpr square_diff_fn square_diff{}; @@ -119,6 +124,27 @@ bool range_empty(R1&& r1) template auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); +template +bool f8_range_zero(R& r); + +template <> +inline bool f8_range_zero>(tensor& r1) +{ + return std::all_of(r1.data.begin(), r1.data.end(), [&](float8 x) { return x.is_zero(); }); +} + +template <> +inline bool f8_range_zero>(tensor& r1) +{ + return std::all_of(r1.data.begin(), r1.data.end(), [&](bfloat8 x) { return x.is_zero(); }); +} + +template <> +inline bool f8_range_zero>(tensor& r1) +{ + return std::all_of(r1.data.begin(), r1.data.end(), [](float x) { return x == 0.0; }); +} + template bool range_zero(R1&& r1) { @@ -172,14 +198,14 @@ double rms_range(R1&& r1, R2&& r2) if(n == range_distance(r2) && n != 0) { double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); - double mag1 = *std::max_element(r1.begin(), r1.end(), compare_mag); - double mag2 = *std::max_element(r2.begin(), r2.end(), compare_mag); + double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); + double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); return std::sqrt(square_difference) / (std::sqrt(n) * mag); } else - return std::numeric_limits>::max(); + return double(std::numeric_limits>::max()); } } // namespace miopen #endif From 4aa64c1f07bbcfb1249902aeeffc83719114ed6b Mon Sep 17 00:00:00 2001 From: JD Date: Tue, 19 Sep 2023 12:57:59 -0500 Subject: [PATCH 02/36] [CI][Jenkins] remove reboot, Vega, and Navi21 stages (#2395) * remove reboot from MIOpen CI, remove Vega and Navi21 stages * Update Docker ROCm to official 5.7 --------- Co-authored-by: Jun Liu --- Dockerfile | 6 +++--- Jenkinsfile | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index ede050ddc2..31a0334eeb 100755 --- 
a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN wget https://repo.radeon.com/amdgpu-install/.5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate +RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ ./amdgpu-install_5.7.50700-1_all.deb @@ -26,8 +26,8 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ # Add rocm repository RUN export ROCM_APT_VER=5.7;\ echo $ROCM_APT_VER &&\ -sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/.$ROCM_APT_VER/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list' &&\ -sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/.apt_$ROCM_APT_VER focal main > /etc/apt/sources.list.d/rocm.list' +sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCM_APT_VER/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list' &&\ +sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/$ROCM_APT_VER focal main > /etc/apt/sources.list.d/rocm.list' RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN amdgpu-install -y --usecase=rocm --no-dkms diff --git a/Jenkinsfile b/Jenkinsfile index e7f4ed8400..9d10064b59 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -316,7 +316,7 @@ def buildHipClangJobAndReboot(Map conf=[:]){ throw e } finally{ - if (conf.get("needs_gpu", true)) { + if (conf.get("needs_reboot", false)) { reboot() } } @@ -453,11 +453,11 @@ pipeline { description: "") booleanParam( name: "TARGET_VEGA10", - defaultValue: true, + defaultValue: false, description: "") booleanParam( name: "TARGET_VEGA20", - defaultValue: true, + defaultValue: false, description: "") booleanParam( name: "TARGET_GFX908", @@ -469,7 +469,7 @@ pipeline { description: "") booleanParam( name: "TARGET_NAVI21", - defaultValue: true, + defaultValue: false, description: "") booleanParam( name: "DATATYPE_NA", From 7315546fc5a12b37184e278e70009dee45ad9386 Mon Sep 17 00:00:00 2001 From: Artem Tamazov Date: Tue, 19 Sep 2023 21:00:58 +0300 Subject: [PATCH 03/36] [pooling][backward][2D] Support wide pooling window. Fix FP16 correctness issues of Average pooling. 
(#2372) --- driver/pool_driver.hpp | 62 ++-- src/kernels/MIOpenPoolingBwd.cl | 13 +- src/kernels/MIOpenPoolingBwdND.cl | 14 +- src/kernels/float_types.h | 102 ++++-- src/ocl/pooling_ocl.cpp | 36 +- src/solver/batchnorm/forward_inference_ck.cpp | 2 +- src/solver/pooling/backwardNd.cpp | 292 +++++++++------- src/solver/pooling/forwardNaive.cpp | 27 +- src/solver/pooling/forwardNd.cpp | 23 +- test/CMakeLists.txt | 40 +-- test/pooling2d.cpp | 48 +-- test/pooling_common.hpp | 318 +++++++++++------- 12 files changed, 582 insertions(+), 395 deletions(-) diff --git a/driver/pool_driver.hpp b/driver/pool_driver.hpp index c82f6442dd..bb3fa161e1 100644 --- a/driver/pool_driver.hpp +++ b/driver/pool_driver.hpp @@ -392,7 +392,7 @@ int PoolDriver_impl::AllocateBuffersAndCopy() maskhost = std::vector(out_sz, static_cast(0)); outhost = std::vector(out_sz, static_cast(0)); - din = std::vector(in_sz, static_cast(0)); + din = std::vector(in_sz, static_cast(1.0)); dout = std::vector(out_sz, static_cast(0)); dinhost = std::vector(in_sz, static_cast(0)); @@ -456,25 +456,27 @@ int PoolDriver_impl::RunForwardGPU() Timer t; START_TIME + int rc = 0; for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenPoolingForward(GetHandle(), - poolDesc, - &alpha, - inputTensor, - in_dev->GetMem(), - &beta, - outputTensor, - out_dev->GetMem(), - do_backward, - mask_dev->GetMem(), - 0); + rc |= miopenPoolingForward(GetHandle(), + poolDesc, + &alpha, + inputTensor, + in_dev->GetMem(), + &beta, + outputTensor, + out_dev->GetMem(), + do_backward, + mask_dev->GetMem(), + 0); } if(inflags.GetValueInt("time") == 1) { float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); + if(rc == 0) + miopenGetKernelTime(GetHandle(), &time); STOP_TIME if(WALL_CLOCK) @@ -494,7 +496,7 @@ int PoolDriver_impl::RunForwardGPU() dumpBufferToFile((dump_root + "/dump_mask.bin").c_str(), mask.data(), out_sz); } - return miopenStatusSuccess; + return rc; } template @@ -524,27 +526,29 @@ int PoolDriver_impl::RunBackwardGPU() Timer t; START_TIME + int rc = 0; for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenPoolingBackward(GetHandle(), - poolDesc, - &alpha, - outputTensor, - out_dev->GetMem(), - dOutputTensor, - dout_dev->GetMem(), - inputTensor, - in_dev->GetMem(), - &beta, - dInputTensor, - din_dev->GetMem(), - mask_dev->GetMem()); + rc |= miopenPoolingBackward(GetHandle(), + poolDesc, + &alpha, + outputTensor, + out_dev->GetMem(), + dOutputTensor, + dout_dev->GetMem(), + inputTensor, + in_dev->GetMem(), + &beta, + dInputTensor, + din_dev->GetMem(), + mask_dev->GetMem()); } if(inflags.GetValueInt("time") == 1) { float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); + if(rc == 0) + miopenGetKernelTime(GetHandle(), &time); STOP_TIME if(WALL_CLOCK) @@ -561,7 +565,7 @@ int PoolDriver_impl::RunBackwardGPU() dumpBufferToFile((dump_root + "/dump_din.bin").c_str(), din.data(), in_sz); } - return miopenStatusSuccess; + return rc; } template diff --git a/src/kernels/MIOpenPoolingBwd.cl b/src/kernels/MIOpenPoolingBwd.cl index 9e3afd8d82..6c88bebadf 100644 --- a/src/kernels/MIOpenPoolingBwd.cl +++ b/src/kernels/MIOpenPoolingBwd.cl @@ -95,12 +95,12 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff, : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; int top_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; - _FLOAT res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + _FLOAT_ACCUM res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) { 
for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) { - res[k][l] = 0; + res[k][l] = (_FLOAT_ACCUM)0; } } @@ -183,9 +183,10 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff, pool_size = (pool_size == 0) ? 1 : pool_size; int lcl_top_h = top_h - top_y; int lcl_top_w = top_w - top_x; - _FLOAT add_val = - (lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w] / - (_FLOAT)pool_size); + _FLOAT_ACCUM add_val = + CVT_FLOAT2ACCUM( + lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w]) / + CVT_INTEGRAL2ACCUM(pool_size); res[k][l] += add_val; #if 0 if (bot_x+l==6&&bot_y+k==0&&o==3&&b==0) @@ -206,7 +207,7 @@ { if(bot_y + k < mlo_bot_height && bot_x + l < mlo_bot_width) { - bot_diff[bot_off + k * mlo_botdf_str + l] = res[k][l]; + bot_diff[bot_off + k * mlo_botdf_str + l] = CVT_ACCUM2FLOAT(res[k][l]); #if 0 if (lcl_id0==0&&lcl_id1==0&&o==0&&b==0) { diff --git a/src/kernels/MIOpenPoolingBwdND.cl b/src/kernels/MIOpenPoolingBwdND.cl index bfa45a61b5..7daacd24ab 100644 --- a/src/kernels/MIOpenPoolingBwdND.cl +++ b/src/kernels/MIOpenPoolingBwdND.cl @@ -27,6 +27,8 @@ #include "float_types.h" #include "pooling_functions.h" +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX + #ifndef MLO_POOLING_INDEX_MAX #error "MLO_POOLING_INDEX_MAX not defined" #endif @@ -145,6 +147,8 @@ mloPoolingNDMaxBwd(const __global _FLOAT* top_df, } } +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE || MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + __attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void mloPoolingNDAveBwd(const __global _FLOAT* top_df, __global _FLOAT* bot_df, @@ -202,7 +206,7 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, top_h_end = min(top_h_end, (int)top_h); top_w_end = min(top_w_end, (int)top_w); - _FLOAT bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + _FLOAT_ACCUM bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; for(int h = top_d_start; h < top_d_end; ++h) { @@ -232,8 +236,9 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, uint top_gbl_off = b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; - _FLOAT add_val = b_id < batch ? top_df[top_gbl_off] : 0; - add_val /= (_FLOAT)pool_size; + _FLOAT_ACCUM add_val = + b_id < batch ? CVT_FLOAT2ACCUM(top_df[top_gbl_off]) : CVT_FP32_2ACCUM(0.0f); + add_val /= CVT_INTEGRAL2ACCUM(pool_size); for(int m = dstart; m < dend; ++m) { @@ -269,10 +274,11 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, { uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; - bot_df[bot_idx] = bot_data[m][k][l]; + bot_df[bot_idx] = CVT_ACCUM2FLOAT(bot_data[m][k][l]); } } } } } } +#endif diff --git a/src/kernels/float_types.h b/src/kernels/float_types.h index 897e95ca67..5406ba85ec 100644 --- a/src/kernels/float_types.h +++ b/src/kernels/float_types.h @@ -85,14 +85,46 @@ #define _FLOAT8 PPCAT(_FLOAT, EIGHT) #endif -#if MIOPEN_USE_FP16 == 1
+/// If MIOPEN_USE_DOUBLE_ACCUM is defined as 1 when "float_types.h" is included,
+/// then all the ACCUM macros (which represent operations and types) will use FP64
+/// instead of FP32. In other words, the computations will be
+/// performed in double precision wherever the ACCUM macros are used.
+/// This functionality is intended mostly for debugging.
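+///
+/// Usage sketch (illustrative only; `in`, `out`, and `n` are hypothetical): a
+/// reduction written against the ACCUM macros switches to FP64 accumulation
+/// when built with -DMIOPEN_USE_DOUBLE_ACCUM=1, with no other source changes:
+///
+///     _FLOAT_ACCUM sum = CVT_FP32_2ACCUM(0.0f);
+///     for(uint i = 0; i < n; ++i)
+///         sum += CVT_FLOAT2ACCUM(in[i]);
+///     out[0] = CVT_ACCUM2FLOAT(sum);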
+#ifdef MIOPEN_USE_DOUBLE_ACCUM +#if !(MIOPEN_USE_DOUBLE_ACCUM == 0 || MIOPEN_USE_DOUBLE_ACCUM == 1) +#error "Invalid value of MIOPEN_USE_DOUBLE_ACCUM" +#endif +#else +#define MIOPEN_USE_DOUBLE_ACCUM 0 +#endif + +#if MIOPEN_USE_DOUBLE_ACCUM +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT_ACCUM double +#else +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _FLOAT_ACCUM double +#endif // __HIP_PLATFORM_HCC__ +#define MAX_VAL_ACCUM DBL_MAX +#else // MIOPEN_USE_DOUBLE_ACCUM #ifdef __HIP_PLATFORM_HCC__ -#define FLOAT _Float16 #define FLOAT_ACCUM float #else +#define _FLOAT_ACCUM float +#endif // __HIP_PLATFORM_HCC__ +#ifndef FLT_MAX +#define MAX_VAL_ACCUM 3.402823466e+38F +#else +#define MAX_VAL_ACCUM FLT_MAX +#endif +#endif // MIOPEN_USE_DOUBLE_ACCUM + +#if MIOPEN_USE_FP16 == 1 +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT _Float16 +#else // __HIP_PLATFORM_HCC__ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _FLOAT half -#define _FLOAT_ACCUM float #endif // __HIP_PLATFORM_HCC__ #define SIZEOF_FLOAT 2 // Max value for the main datatype @@ -101,21 +133,13 @@ #else #define MAX_VAL HALF_MAX #endif -// Max value for accumulator -#ifndef FLT_MAX -#define MAX_VAL_ACCUM 3.402823466e+38F -#else -#define MAX_VAL_ACCUM FLT_MAX -#endif #endif // MIOPEN_USE_FP16 #if MIOPEN_USE_FP32 == 1 #ifdef __HIP_PLATFORM_HCC__ #define FLOAT float -#define FLOAT_ACCUM float #else #define _FLOAT float -#define _FLOAT_ACCUM float #endif // __HIP_PLATFORM_HCC__ #define SIZEOF_FLOAT 4 // Max value for the main datatype @@ -124,36 +148,28 @@ #else #define MAX_VAL FLT_MAX #endif -// Max value for accumulator -#define MAX_VAL_ACCUM MAX_VAL #endif // MIOPEN_USE_FP32 #if MIOPEN_USE_BFP16 == 1 #ifdef __HIP_PLATFORM_HCC__ #define FLOAT ushort -#define FLOAT_ACCUM float #else #define _FLOAT ushort -#define _FLOAT_ACCUM float #endif // #define SIZEOF_FLOAT 2 // Max value for the main datatype #define MAX_VAL 0x7F7F -// Max value for accumulator -#ifndef FLT_MAX -#define MAX_VAL_ACCUM 3.402823466e+38F -#else -#define MAX_VAL_ACCUM FLT_MAX -#endif #endif // MIOPEN_USE_BFP16 #if MIOPEN_USE_FP16 == 1 #ifdef __HIP_PLATFORM_HCC__ #define CVT_FLOAT2ACCUM(x) (static_cast(x)) #define CVT_ACCUM2FLOAT(x) (static_cast(x)) +#define CVT_INTEGRAL2ACCUM(x) (static_cast(x)) #else #define CVT_FLOAT2ACCUM(x) ((_FLOAT_ACCUM)(x)) #define CVT_ACCUM2FLOAT(x) ((_FLOAT)(x)) +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) #endif // These two are required to uniformly initialize // variables with non-zero literal constants of FP32 type @@ -165,29 +181,48 @@ #endif // MIOPEN_USE_FP16 #if MIOPEN_USE_FP32 == 1 +/// \todo Basically, conversions from float to accum and vice versa +/// should be removed because FLOAT_ACCUM and FLOAT are identical. +/// However this may lead to problems if these macros are used in +/// inappropriate contexts (e.g. with integral types), so this +/// refactoring should be considered as nontrivial and requires +/// a separate PR. Let's keep this historical stuff for now. 
+/// --atamazov 30.08.2023 #ifdef __HIP_PLATFORM_HCC__ #define CVT_FLOAT2ACCUM(x) (static_cast(x)) #define CVT_ACCUM2FLOAT(x) (static_cast(x)) +#define CVT_INTEGRAL2ACCUM(x) (static_cast(x)) #else #define CVT_FLOAT2ACCUM(x) ((_FLOAT_ACCUM)(x)) #define CVT_ACCUM2FLOAT(x) ((_FLOAT)(x)) +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) #endif #define CVT_FP32_2FLOAT(x) (CVT_ACCUM2FLOAT(x)) #define CVT_FP32_2ACCUM(x) (x) #endif // MIOPEN_USE_FP32 #if MIOPEN_USE_BFP16 == 1 -#define CVT_FLOAT2ACCUM(x) bfloat16_to_float(x) -#define CVT_ACCUM2FLOAT(x) float_to_bfloat16(x) +#ifdef __HIP_PLATFORM_HCC__ +#define CVT_FLOAT2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_ACCUM2FLOAT(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_INTEGRAL2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_FP32_2FLOAT(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_FP32_2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#else +#define CVT_FLOAT2ACCUM(x) (bfloat16_to_float(x)) +#define CVT_ACCUM2FLOAT(x) (float_to_bfloat16(x)) +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) #define CVT_FP32_2FLOAT(x) (CVT_ACCUM2FLOAT(x)) #define CVT_FP32_2ACCUM(x) (x) #endif +#endif /// If MIOPEN_USE_NATIVE_DATATYPE_ACCUM is defined as 1 when "float_types.h" is included, /// then all the ACCUM macros (the represent operations and types) will use the native /// datatype (BF16 or FP16) instead of FP32. In other words, the computations will be /// performed using the native datatype even if ACCUM macros are used. This allows for /// building both mixed-precision and "pure" kernels from the single source. +/// Note: This macro has higher priority than MIOPEN_USE_DOUBLE_ACCUM. #ifdef MIOPEN_USE_NATIVE_DATATYPE_ACCUM #if !(MIOPEN_USE_NATIVE_DATATYPE_ACCUM == 0 || MIOPEN_USE_NATIVE_DATATYPE_ACCUM == 1) #error "Invalid value of MIOPEN_USE_NATIVE_DATATYPE_ACCUM" @@ -197,8 +232,14 @@ #endif #if MIOPEN_USE_NATIVE_DATATYPE_ACCUM +#ifdef __HIP_PLATFORM_HCC__ +#undef FLOAT_ACCUM +#define FLOAT_ACCUM MIOPEN_ERROR_NOT_IMLEMENTED +#else #undef _FLOAT_ACCUM #define _FLOAT_ACCUM _FLOAT +#endif + #undef MAX_VAL_ACCUM #define MAX_VAL_ACCUM MAX_VAL #undef CVT_FLOAT2ACCUM @@ -207,6 +248,19 @@ #define CVT_ACCUM2FLOAT(x) (x) #undef CVT_FP32_2ACCUM #define CVT_FP32_2ACCUM(x) (CVT_FP32_2FLOAT(x)) -#endif // !(AVERAGE_OPS && MIOPEN_USE_FP16) + +#undef CVT_INTEGRAL2ACCUM +#ifdef __HIP_PLATFORM_HCC__ +#define CVT_INTEGRAL2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#else +#if MIOPEN_USE_BFP16 == 1 +// No direct conversion from integral types to BF16 is available. +// WARNING: Precision loss when integral type is wider than 16 bits. +#define CVT_INTEGRAL2ACCUM(x) (float_to_bfloat16(static_cast(x))) +#else +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT)(x)) +#endif +#endif +#endif // MIOPEN_USE_NATIVE_DATATYPE_ACCUM #endif // GUARD_FLOAT_TYPES_H diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 94aac5d31f..86fca9004b 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -88,26 +88,23 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, auto index_max = get_index_max(GetIndexType()); - // for kernel implementation max pooling backward pass, - // "index_max" means ghost, and thus should not be reached + /// \anchor max_pooling_index_max_restriction + /// For kernel implementation max pooling backward pass, + /// "index_max" means ghost, and thus should not be reached. 
    if(mode == miopenPoolingMax && save_index)
    {
-        if((workspaceIndexMode == miopenPoolingWorkspaceIndexMask &&
-            !(index_max >= std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<int>()))) ||
-           (workspaceIndexMode == miopenPoolingWorkspaceIndexImage &&
-            !(index_max >= std::accumulate(xDesc.GetLengths().begin() + 2,
-                                           xDesc.GetLengths().end(),
-                                           1,
-                                           std::multiplies<int>()))))
+        if((workspaceIndexMode == miopenPoolingWorkspaceIndexMask //
+            && index_max <= std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<int>())) //
+           ||                                                                                     //
+           (workspaceIndexMode == miopenPoolingWorkspaceIndexImage //
+            && index_max <= std::accumulate(xDesc.GetLengths().begin() + 2,
+                                            xDesc.GetLengths().end(),
+                                            1,
+                                            std::multiplies<int>())))
        {
            MIOPEN_THROW("Index range not enough for max pooling bwd");
        }
-        if(workspaceIndexMode == miopenPoolingWorkspaceIndexMask && pool_dim == 5)
-        {
-            MIOPEN_THROW("3D pooling doesn't support workspace index mask mode");
-        }
-
        if(workSpace == nullptr)
        {
            throw std::invalid_argument("workSpace cannot be NULL in Forward Pooling MAX mode when "
@@ -115,9 +112,9 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
        }
    }
-    const auto algo_name =
-        AlgorithmName{pool_dim == 5 ? "miopenPoolingNdForward" : "miopenPooling2dForward"};
-    const auto problem = pooling::ProblemDescription{*this, xDesc, yDesc, save_index};
+    // So far, all pooling solvers implement the Direct (trivial) computation algorithm.
+    const auto algo_name = AlgorithmName{"miopenPoolingForwardDirect"};
+    const auto problem   = pooling::ProblemDescription{*this, xDesc, yDesc, save_index};
    const auto invoke_params = [&]() {
        auto tmp = pooling::FwdInvokeParams{};
@@ -180,9 +177,8 @@ miopenStatus_t PoolingDescriptor::Backward(Handle& handle,
        MIOPEN_THROW("Unsupported pooling dimension");
    }
-    const auto problem = pooling::ProblemDescription{*this, xDesc, yDesc, dxDesc, dyDesc};
-    const auto algo_name =
-        AlgorithmName{pool_dim == 5 ?
"miopenPoolingNdBackward" : "miopenPooling2dBackward"}; + const auto problem = pooling::ProblemDescription{*this, xDesc, yDesc, dxDesc, dyDesc}; + const auto algo_name = AlgorithmName{"miopenPoolingBackwardDirect"}; const auto invoke_params = [&]() { auto tmp = pooling::BwdInvokeParams{}; diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 186bc28ff2..5a7918cc64 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -180,7 +180,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context, { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL std::ignore = context; - std::ignore = fdesc_problem; + std::ignore = bn_problem; return false; #else if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_INFER{})) diff --git a/src/solver/pooling/backwardNd.cpp b/src/solver/pooling/backwardNd.cpp index 25c5df3297..77dc917d2a 100644 --- a/src/solver/pooling/backwardNd.cpp +++ b/src/solver/pooling/backwardNd.cpp @@ -31,6 +31,8 @@ #include #include +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCmSoftwarePlatform/MIFin/issues/80 + namespace miopen { namespace solver { @@ -40,12 +42,25 @@ namespace pooling { bool PoolingBackwardNd::IsApplicable(const ExecutionContext&, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Backward && - (problem.GetPooling().GetMode() == miopenPoolingMax || - problem.GetPooling().GetMode() == miopenPoolingAverage || - problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) && - problem.GetXDesc().GetSize() == 5 && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" && - problem.GetYDesc().GetLayout("NCDHW") == "NCDHW"; + return problem.GetDirection() == miopen::pooling::Direction::Backward // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && ( // + (problem.GetXDesc().GetSize() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // + || // + (problem.GetXDesc().GetSize() == 4 // + && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // + && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // + ) // + /// \todo This solver does not support workspace index mask mode yet. 
+ && !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask); } ConvSolution @@ -54,165 +69,186 @@ PoolingBackwardNd::GetSolution(const ExecutionContext&, { auto result = ConvSolution{miopenStatusSuccess}; + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenPoolingBwdND.cl"; + kernel.kernel_name = "mloPoolingND"; + + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name += "MaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) { - auto kernel = KernelInfo{}; + kernel.kernel_name += "AveBwd"; + } - kernel.kernel_file = "MIOpenPoolingBwdND.cl"; - kernel.kernel_name = "mloPoolingND"; + const auto& bot = problem.GetXDesc(); + const auto& top = problem.GetYDesc(); - if(problem.GetPooling().GetMode() == miopenPoolingMax) - { - kernel.kernel_name += "MaxBwd"; - } - else if(problem.GetPooling().GetMode() == miopenPoolingAverage || - problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) - { - kernel.kernel_name += "AveBwd"; - } + std::size_t batch_sz, n_inputs, in_height, in_width; + std::tie(batch_sz, n_inputs, in_height, in_width) = miopen::tien<4>(bot.GetLengths(), 1); - std::size_t batch_sz, n_inputs, in_height, in_width; - std::tie(batch_sz, n_inputs, in_height, in_width) = - miopen::tien<4>(problem.GetXDesc().GetLengths(), 1); - - const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) - ? MLO_POOLING_OP_MAX - : ((problem.GetPooling().GetMode() == miopenPoolingAverage) - ? MLO_POOLING_OP_AVE - : MLO_POOLING_OP_AVE_INCLUSIVE); - - int pix_w_per_work = 1; - int pix_h_per_work = 4; - int pix_d_per_work = 2; - - int batch = problem.GetDYDesc().GetLengths()[0]; - int chal = problem.GetDYDesc().GetLengths()[1]; - - int bot_d = *(problem.GetDXDesc().GetLengths().rbegin() + 2); - int bot_h = *(problem.GetDXDesc().GetLengths().rbegin() + 1); - int bot_w = *(problem.GetDXDesc().GetLengths().rbegin()); - - int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1); - int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1); - int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1); - - int max_activ_workitem = 65536; - int total_work = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d; - int activ_work = std::min(total_work, max_activ_workitem); - - size_t lcl_work = 64; - size_t grp_num = (activ_work + lcl_work - 1) / lcl_work; - - bool territory_overlap = false; - for(std::size_t i = 0; i < problem.GetPooling().strides.size(); i++) - territory_overlap |= (problem.GetPooling().strides[i] < problem.GetPooling().lens[i]); - - const auto build_params = - KernelBuildParameters{ - {"MLO_POOLING_OP_ID", static_cast(pooling_method)}, - {"MAX_ACTIV_WORKITEM", static_cast(max_activ_workitem)}, - {"MLO_POOLING_GROUP_SZ0", static_cast(lcl_work)}, - {"MLO_POOLING_GROUP_SZ1", 1}, - {"MLO_POOLING_GROUP_SZ2", 1}, - {"PIX_W_PER_WORK", static_cast(pix_w_per_work)}, - {"PIX_H_PER_WORK", static_cast(pix_h_per_work)}, - {"PIX_D_PER_WORK", static_cast(pix_d_per_work)}, - {"KERNEL_SZ_D", static_cast(problem.GetPooling().lens[0])}, - {"KERNEL_SZ_H", static_cast(problem.GetPooling().lens[1])}, - {"KERNEL_SZ_W", static_cast(problem.GetPooling().lens[2])}, - {"STRIDE_D", static_cast(problem.GetPooling().strides[0])}, - {"STRIDE_H", static_cast(problem.GetPooling().strides[1])}, - {"STRIDE_W", 
static_cast(problem.GetPooling().strides[2])}, - {"TERRITORY_OVERLAP", static_cast(territory_overlap)}, - {"MLO_POOLING_INDEX_TYPE", - get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, - {"MLO_POOLING_INDEX_MAX", - get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, - } - << GetDataTypeKBP(problem.GetDYDesc().GetType()); + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + int pix_w_per_work = 1; + int pix_h_per_work = 4; + int pix_d_per_work = 2; + + int batch = top.GetLengths()[0]; + int chal = top.GetLengths()[1]; + + const bool is2d = (bot.GetSize() == 4); + + int bot_d = is2d ? 1 : *(bot.GetLengths().rbegin() + 2); + int bot_h = *(bot.GetLengths().rbegin() + 1); + int bot_w = *(bot.GetLengths().rbegin()); - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1); + int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1); + int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1); - kernel.l_wk = {lcl_work, 1, 1}; - kernel.g_wk = {lcl_work * grp_num, 1, 1}; + int max_activ_workitem = 65536; + int total_work = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d; + int activ_work = std::min(total_work, max_activ_workitem); - result.construction_params.push_back(kernel); +#if WORKAROUND_ISSUE_MIFIN_80 + const std::size_t wavesize = 64; +#else + const std::size_t wavesize = context.GetStream().GetWavefrontWidth(); +#endif + size_t grp_num = (activ_work + wavesize - 1) / wavesize; + + auto strides = problem.GetPooling().strides; + auto lens = problem.GetPooling().lens; + auto pads = problem.GetPooling().pads; + + if(is2d) + { + strides.push_back(strides[1]); + strides[1] = strides[0]; + lens.push_back(lens[1]); + lens[1] = lens[0]; + lens[0] = 1; + pads.push_back(pads[1]); + pads[1] = pads[0]; + pads[0] = 0; } - result.invoker_factory = [](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + bool territory_overlap = false; + for(std::size_t i = 0; i < strides.size(); i++) + territory_overlap |= (strides[i] < lens[i]); + + const auto build_params = + KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MAX_ACTIV_WORKITEM", max_activ_workitem}, + {"MLO_POOLING_GROUP_SZ0", wavesize}, + {"MLO_POOLING_GROUP_SZ1", 1}, + {"MLO_POOLING_GROUP_SZ2", 1}, + {"PIX_W_PER_WORK", pix_w_per_work}, + {"PIX_H_PER_WORK", pix_h_per_work}, + {"PIX_D_PER_WORK", pix_d_per_work}, + {"KERNEL_SZ_D", lens[0]}, + {"KERNEL_SZ_H", lens[1]}, + {"KERNEL_SZ_W", lens[2]}, + {"STRIDE_D", strides[0]}, + {"STRIDE_H", strides[1]}, + {"STRIDE_W", strides[2]}, + {"TERRITORY_OVERLAP", static_cast(territory_overlap)}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + } + << GetDataTypeKBP(problem.GetDYDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - const auto top_d = *(params.dyDesc.GetLengths().rbegin() + 2); - const auto top_h = *(params.dyDesc.GetLengths().rbegin() + 1); - const auto top_w = *(params.dyDesc.GetLengths().rbegin()); + kernel.l_wk = 
{wavesize, 1, 1}; + kernel.g_wk = {wavesize * grp_num, 1, 1}; - int pix_w_per_work = 1; - int pix_h_per_work = 4; - int pix_d_per_work = 2; + result.construction_params.push_back(kernel); - int batch = params.dyDesc.GetLengths()[0]; - int chal = params.dyDesc.GetLengths()[1]; + const auto top_d = is2d ? 1 : *(top.GetLengths().rbegin() + 2); + const auto top_h = *(top.GetLengths().rbegin() + 1); + const auto top_w = *(top.GetLengths().rbegin()); - int bot_d = *(params.dxDesc.GetLengths().rbegin() + 2); - int bot_h = *(params.dxDesc.GetLengths().rbegin() + 1); - int bot_w = *(params.dxDesc.GetLengths().rbegin()); + auto unpackStrides = [is2d](const auto& strides) { + return std::make_tuple(strides[0], // N stride + strides[1], // C stride + strides[2], // D stride. Same as H_stride in 3D converted from 2D. + is2d // + ? strides[2] // 2D H stride + : strides[3] // 3D H stride + ); + }; - int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1); - int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1); - int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1); + std::size_t bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride; + std::size_t top_n_stride, top_c_stride, top_d_stride, top_h_stride; + std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride) = + unpackStrides(bot.GetStrides()); + std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride) = + unpackStrides(top.GetStrides()); - int total_work = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d; + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); if(params.pooling.GetMode() == miopenPoolingMax) { kernel(params.dy, params.dx, params.workspace, - static_cast(params.pooling.pads[0]), - static_cast(params.pooling.pads[1]), - static_cast(params.pooling.pads[2]), + static_cast(pads[0]), + static_cast(pads[1]), + static_cast(pads[2]), static_cast(batch), static_cast(chal), - static_cast(params.dxDesc.GetLengths()[2]), - static_cast(params.dxDesc.GetLengths()[3]), - static_cast(params.dxDesc.GetLengths()[4]), + static_cast(bot_d), + static_cast(bot_h), + static_cast(bot_w), static_cast(top_d), static_cast(top_h), static_cast(top_w), - static_cast(params.dxDesc.GetStrides()[0]), - static_cast(params.dxDesc.GetStrides()[1]), - static_cast(params.dxDesc.GetStrides()[2]), - static_cast(params.dxDesc.GetStrides()[3]), - static_cast(params.dyDesc.GetStrides()[0]), - static_cast(params.dyDesc.GetStrides()[1]), - static_cast(params.dyDesc.GetStrides()[2]), - static_cast(params.dyDesc.GetStrides()[3]), + static_cast(bot_n_stride), + static_cast(bot_c_stride), + static_cast(bot_d_stride), + static_cast(bot_h_stride), + static_cast(top_n_stride), + static_cast(top_c_stride), + static_cast(top_d_stride), + static_cast(top_h_stride), static_cast(total_work)); } else { kernel(params.dy, params.dx, - static_cast(params.pooling.pads[0]), - static_cast(params.pooling.pads[1]), - static_cast(params.pooling.pads[2]), + static_cast(pads[0]), + static_cast(pads[1]), + static_cast(pads[2]), static_cast(batch), static_cast(chal), - static_cast(params.dxDesc.GetLengths()[2]), - static_cast(params.dxDesc.GetLengths()[3]), - static_cast(params.dxDesc.GetLengths()[4]), + static_cast(bot_d), + static_cast(bot_h), + static_cast(bot_w), static_cast(top_d), static_cast(top_h), static_cast(top_w), - 
static_cast(params.dxDesc.GetStrides()[0]), - static_cast(params.dxDesc.GetStrides()[1]), - static_cast(params.dxDesc.GetStrides()[2]), - static_cast(params.dxDesc.GetStrides()[3]), - static_cast(params.dyDesc.GetStrides()[0]), - static_cast(params.dyDesc.GetStrides()[1]), - static_cast(params.dyDesc.GetStrides()[2]), - static_cast(params.dyDesc.GetStrides()[3]), + static_cast(bot_n_stride), + static_cast(bot_c_stride), + static_cast(bot_d_stride), + static_cast(bot_h_stride), + static_cast(top_n_stride), + static_cast(top_c_stride), + static_cast(top_d_stride), + static_cast(top_h_stride), static_cast(total_work)); } }; diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index d8a13a330f..c0d0ccb5b2 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -68,18 +68,21 @@ inline uint32_t RoundUpNearestPower2Positive(uint32_t v) bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward // - && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // - && (problem.GetXDesc().GetType() == miopenFloat // - || problem.GetXDesc().GetType() == miopenHalf) // - && ( // - (problem.GetXDesc().GetSize() == 5 // - && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // - && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // - || // - (problem.GetXDesc().GetSize() == 4 // - && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // - && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // + return problem.GetDirection() == miopen::pooling::Direction::Forward // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && ( // + (problem.GetXDesc().GetSize() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // + || // + (problem.GetXDesc().GetSize() == 4 // + && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // + && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // ); } diff --git a/src/solver/pooling/forwardNd.cpp b/src/solver/pooling/forwardNd.cpp index 7663b1cdfe..183e8b3705 100644 --- a/src/solver/pooling/forwardNd.cpp +++ b/src/solver/pooling/forwardNd.cpp @@ -106,11 +106,24 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro bool PoolingForwardNd::IsApplicable(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward && - problem.GetXDesc().GetSize() == 5 && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" && - problem.GetYDesc().GetLayout("NCDHW") == "NCDHW" && - sizeof_private_memory(problem) <= - TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); + + return problem.GetDirection() == miopen::pooling::Direction::Forward // + && problem.GetXDesc().GetSize() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) 
// + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() // + / context.GetStream().GetWavefrontWidth() // + /// \todo This solver does not support workspace index mask mode yet. + && + !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask // + && problem.SaveIndex() == true); } ConvSolution PoolingForwardNd::GetSolution(const ExecutionContext&, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ad0a7c8a1f..9f6432ec6d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -252,13 +252,16 @@ elseif(MIOPEN_TEST_BFLOAT16) test_deepbench_conv test_conv_igemm_dynamic_xdlops_nhwc_wrw_bf16 test_conv_igemm_dynamic_xdlops_nhwc_fwd_bf16 test_conv_igemm_dynamic_xdlops_nhwc_bwd_bf16) - endif() - if(${CODECOV_TEST}) - list(APPEND SKIP_TESTS test_conv3d test_conv3d_find2 test_immed_conv3d test_immed_conv2d test_pooling2d test_pooling2d_asymmetric) - # replaced by smaller tests with suffix _codecov - endif() +endif() + +if(${CODECOV_TEST}) + list(APPEND SKIP_TESTS + test_conv3d test_conv3d_find2 test_immed_conv3d test_immed_conv2d test_pooling2d test_pooling2d_asymmetric + test_pooling2d_wide) + # replaced by smaller tests with suffix _codecov +endif() -if (MIOPEN_NO_GPU) +if(MIOPEN_NO_GPU) set(SKIP_ALL_EXCEPT_TESTS test_include_inliner test_kernel_build_params test_test_errors test_type_name test_tensor_test test_sqlite_perfdb test_sequences test_pooling3d test_perfdb) @@ -714,27 +717,14 @@ if(${MIOPEN_TEST_WITH_MIOPENDRIVER}) add_custom_test(test_miopendriver_regression_half SKIP_UNLESS_ALL GFX94X_ENABLED GFX103X_ENABLED GFX110X_ENABLED FLOAT_DISABLED HALF_ENABLED # Regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/1576 COMMAND MIOPEN_FIND_MODE=1 MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvDirectNaiveConvBwd $ ${MIOPENDRIVER_MODE_CONV} --forw 2 --in_layout NCHW --out_layout NCHW --fil_layout NCHW -n 256 -c 1024 -H 14 -W 14 -k 256 -y 1 -x 1 -p 0 -q 0 -u 1 -v 1 -l 1 -j 1 -m conv -g 1 -t 1 - # WORKAROUND_ISSUE_2110_2: tests for 2109, 2110 and 2160 shall be added to "test_pooling2d/3d --all" but this is - # impossible until backward pooling limitation (issue #2110 (2)) is fixed. - # Regression tests for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2109 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x255x255,195075x65025x255x1 -y 255 -x 255 -p 0 -q 0 -v 1 -u 1 -m avg -F 1 -t 1 -i 1 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x227x227,154587x51529x227x1 -y 100 -x 100 -p 0 -q 0 -v 1 -u 1 -m avg -F 0 -t 1 -i 1 - # Regression tests for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2160 + # WORKAROUND_ISSUE_2110_2: tests for 2110 and 2160 shall be added to "test_pooling3d --all" but this is + # impossible until backward pooling limitation (issue #2110 (2)) is fully fixed. + # Partial (3D only) regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2160. 
COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x64x41x40x70 -y 41 -x 40 -Z 70 -m avg -F 1 -t 1 -i 1 - # Regression tests for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2110 (1) - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x64x410x400 -y 410 -x 400 -m avg -F 1 -t 1 -i 1 + # Partial (3D only) regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2110 (1). COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x64x41x40x100 -y 4 -x 4 -Z 100 -m max -F 1 -t 1 -i 1 ) - add_custom_test(test_miopendriver_regression_float SKIP_UNLESS_ALL GFX103X_ENABLED GFX110X_ENABLED - # WORKAROUND_ISSUE_2110_2: tests for 2109 shall be added to "test_pooling2d/3d --all" but this is - # impossible until backward pooling limitation (issue #2110 (2)) is fixed. - # Regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2109 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x255x255,195075x65025x255x1 -y 255 -x 255 -p 0 -q 0 -v 1 -u 1 -m avg -F 1 -t 1 -i 1 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x227x227,154587x51529x227x1 -y 100 -x 100 -p 0 -q 0 -v 1 -u 1 -m avg -F 0 -t 1 -i 1 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x227x63,42903x14301x63x1 -y 30 -x 30 -p 0 -q 0 -v 1 -u 1 -m avg -F 0 -t 1 -i 1 - ) - add_custom_test(test_miopendriver_regression_int8 SKIP_UNLESS_ALL GFX94X_ENABLED GFX103X_ENABLED GFX110X_ENABLED FLOAT_DISABLED INT8_ENABLED COMMAND MIOPEN_FIND_MODE=1 MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvDirectNaiveConvFwd $ ${MIOPENDRIVER_MODE_CONV} --forw 1 --in_layout NCHW --out_layout NCHW --fil_layout NCHW -n 256 -c 1024 -H 14 -W 14 -k 256 -y 1 -x 1 -p 0 -q 0 -u 1 -v 1 -l 1 -j 1 -m conv -g 1 -t 1 ) @@ -788,6 +778,10 @@ add_custom_test(test_pooling2d_asymmetric SKIP_UNLESS_ALL HALF_ENABLED GFX94X_EN COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --all --dataset 1 --limit 0 ${MIOPEN_TEST_FLAGS_ARGS} ) +add_custom_test(test_pooling2d_wide SKIP_UNLESS_ALL HALF_ENABLED GFX94X_ENABLED GFX103X_ENABLED GFX110X_ENABLED + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --all --dataset 2 --limit 0 ${MIOPEN_TEST_FLAGS_ARGS} +) + set(IMPLICITGEMM_MLIR_ENV_F_XDLOPS ${IMPLICITGEMM_MLIR_ENV_BASE} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvMlirIgemmFwdXdlops) set(IMPLICITGEMM_MLIR_ENV_B_XDLOPS ${IMPLICITGEMM_MLIR_ENV_BASE} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvMlirIgemmBwdXdlops) set(IMPLICITGEMM_MLIR_ENV_W_XDLOPS ${IMPLICITGEMM_MLIR_ENV_BASE} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvMlirIgemmWrWXdlops) diff --git a/test/pooling2d.cpp b/test/pooling2d.cpp index 571f44cd54..c0df88c481 100644 --- a/test/pooling2d.cpp +++ b/test/pooling2d.cpp @@ -59,6 +59,12 @@ struct pooling2d_driver : pooling_driver // Dataset 1 is intended for testing of asymmetric configs. std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; } + // Dataset 2 is intended for testing of configs with wide window. 
+ std::vector get_2d_pooling_input_shapes_wide() + { + return {{1, 3, 255, 255}, {2, 3, 227, 227}, {1, 7, 127, 127}, {1, 1, 410, 400}}; + } + public: pooling2d_driver() : pooling_driver() { @@ -67,33 +73,35 @@ struct pooling2d_driver : pooling_driver std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8})); #else - this->add(this->in_shape, - "input", - this->template generate_multi_data_limited( - {get_2d_pooling_input_shapes(), get_2d_pooling_input_shapes_minimal()}, 9)); -#endif this->add( - this->lens, - "lens", - this->template generate_multi_data({{{2, 2}, {3, 3}}, {{2, 2}, {1, 2}, {2, 1}}})); + this->in_shape, + "input", + this->template generate_multi_data_limited({get_2d_pooling_input_shapes(), + get_2d_pooling_input_shapes_minimal(), + get_2d_pooling_input_shapes_wide()}, + 9)); +#endif + this->add(this->lens, + "lens", + this->template generate_multi_data( + {{{2, 2}, {3, 3}}, // + {{2, 2}, {1, 2}, {2, 1}}, // + {{35, 35}, {100, 100}, {255, 255}, {410, 400}}})); this->add(this->strides, "strides", - this->template generate_multi_data( - {{{2, 2}, {1, 1}}, {{1, 1}, {2, 1}, {1, 2}, {2, 2}}})); + this->template generate_multi_data({{{2, 2}, {1, 1}}, // + {{1, 1}, {2, 1}, {1, 2}, {2, 2}}, // + {{1, 1}}})); + // clang-format off this->add(this->pads, "pads", this->template generate_multi_data({ - {{0, 0}, {1, 1}}, + {{0, 0}, {1, 1}}, // #if WORKAROUND_ISSUE_1670 - { - { - 0, 0 - } - } + {{0, 0}}, // #else - { - {0, 0}, {0, 1}, {1, 0}, { 1, 1 } - } + {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // #endif - })); + {{0, 0}}})); + // clang-format on this->add(this->wsidx, "wsidx", this->generate_data({0, 1})); } }; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 873c203aaf..9058d34321 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -59,6 +59,24 @@ static int num_uint64_case = 0; // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) static int num_uint64_case_imgidx = 0; +static inline void print(const miopen::PoolingDescriptor& filter) +{ + std::cout << "Pooling: "; + if(filter.GetMode() == miopenPoolingAverage) + std::cout << "Average"; + else if(filter.GetMode() == miopenPoolingAverageInclusive) + std::cout << "AverageInclusive"; + else + std::cout << "Max"; + std::cout << std::endl; + std::cout << "Lengths: "; + miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; + std::cout << "Pads: "; + miopen::LogRange(std::cout, filter.GetPads(), ", ") << std::endl; + std::cout << "Strides: "; + miopen::LogRange(std::cout, filter.GetStrides(), ", ") << std::endl; +} + template tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) { @@ -208,20 +226,8 @@ struct verify_forward_pooling const miopen::PoolingDescriptor& filter, const std::vector&) const { - std::cout << "Forward pooling: "; - if(filter.GetMode() == miopenPoolingAverage) - std::cout << "Average"; - else if(filter.GetMode() == miopenPoolingAverageInclusive) - std::cout << "AverageInclusive"; - else - std::cout << "Max"; - std::cout << std::endl; - std::cout << "Lengths: "; - miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; - std::cout << "Pads: "; - miopen::LogRange(std::cout, filter.GetPads(), ", ") << std::endl; - std::cout << "Strides: "; - miopen::LogRange(std::cout, filter.GetStrides(), ", ") << std::endl; + std::cout << "Forward "; + print(filter); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; std::cout 
<< "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString() << std::endl; @@ -241,7 +247,7 @@ struct verify_backward_pooling bool verify_index) const { auto dinput = input; - std::vector din_vec(input.desc.GetElementSpace(), T(0)); + std::vector din_vec(input.desc.GetElementSpace(), 0.0); CHECK(dout.desc == out.desc); std::array in_dim{}; std::copy_n(input.desc.GetLengths().begin(), SptDim + 2, in_dim.begin()); @@ -360,7 +366,8 @@ struct verify_backward_pooling din_idx += in_idx[i] * in_str[i]; } - din_vec.at(din_idx) += dout(o, w, out_spatial_id_pack...) / pool_size; + din_vec.at(din_idx) += + static_cast(dout(o, w, out_spatial_id_pack...)) / pool_size; } }); }); @@ -432,22 +439,10 @@ struct verify_backward_pooling bool, bool) const { - std::cout << "Backward pooling: "; - if(filter.GetMode() == miopenPoolingAverage) - std::cout << "Average"; - else if(filter.GetMode() == miopenPoolingAverageInclusive) - std::cout << "AverageInclusive"; - else - std::cout << "Max"; - std::cout << std::endl; - std::cout << "Lengths: "; - miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; - std::cout << "Pads: "; - miopen::LogRange(std::cout, filter.GetPads(), ", ") << std::endl; - std::cout << "Strides: "; - miopen::LogRange(std::cout, filter.GetStrides(), ", ") << std::endl; - std::cout << "Output tensor: " << out.desc.ToString() << std::endl; + std::cout << "Backward "; + print(filter); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; + std::cout << "Output tensor: " << out.desc.ToString() << std::endl; } }; @@ -491,10 +486,14 @@ struct pooling_driver : test_driver { add(index_type, "index_type", - generate_data({"miopenIndexUint8", - "miopenIndexUint16", - "miopenIndexUint32", - "miopenIndexUint64"})); + generate_multi_data( // + {{"miopenIndexUint8", + "miopenIndexUint16", + "miopenIndexUint32", + "miopenIndexUint64"}, // + {"miopenIndexUint8", "miopenIndexUint32"}, // + {"miopenIndexUint32"}} // + )); add(mode, "mode", generate_data( @@ -530,29 +529,95 @@ struct pooling_driver : test_driver auto idx_sz = sizeof(uint8_t); int spt_dim = in_shape.size() - 2; const bool skip_many_configs_with_non_int8_index = - (dataset_id == 0); // Otherwise the default dataset takes too much time. + (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time. + const bool wide_dataset = (dataset_id == 2) && full_set; + + filter = miopen::PoolingDescriptor + { + mode_lookup.at(miopen::ToUpper(mode)), +#if TEST_PADDING_MODE == 1 + pmode_lookup.at(miopen::ToUpper(pmode)), +#else + miopenPaddingDefault, +#endif + lens, strides, pads + }; + + filter.SetIndexType(idx_typ); + filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); + + if(wsidx == 0 && spt_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) + { + show_command(); + std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 3D max pooling solvers." + << std::endl; + return; + } + + if(wsidx == 0 && spt_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + { + show_command(); + std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 2D max backward solvers that support wide pooling window." + << std::endl; + return; + } + + if(wsidx == 0 && + (filter.GetMode() == miopenPoolingAverage || + filter.GetMode() == miopenPoolingAverageInclusive) && + full_set) + { + show_command(); + std::cout << "Warning: Config skipped. 
Workspace index modes are irrelevant for "
+                         "Average pooling. "
+                         "In order to optimize performance of full tests, we "
+                         "skip average pooling configs when (wsidx == 0). "
+                         "Please make sure that dataset includes counterparts with (wsidx == 1)."
+                      << std::endl;
+            return;
+        }
+
        switch(idx_typ)
        {
+        /// The "index is too small" limitation is an approximation
+        /// of the real limitation, and therefore applied only when
+        /// the "full test" is ran. See:
+        /// \ref max_pooling_index_max_restriction
        case miopenIndexUint8: {
-            // index size too small for 3D image
-            if(spt_dim == 3 || (spt_dim == 2 && wsidx == 1))
+            if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set &&
+               filter.GetMode() == miopenPoolingMax)
            {
+                show_command();
+                std::cout << "Warning: Config skipped: uint8 index is too small "
+                             "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) "
+                             "&& filter.GetMode() == miopenPoolingMax"
+                          << std::endl;
                return;
            }
            break;
        }
        case miopenIndexUint16: {
-            // index size too small for 3D image
-            if(spt_dim == 3 || (spt_dim == 2 && wsidx == 1))
+            if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set &&
+               filter.GetMode() == miopenPoolingMax)
            {
+                show_command();
+                std::cout << "Warning: Config skipped: uint16 index is too small "
+                             "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) "
+                             "&& filter.GetMode() == miopenPoolingMax"
+                          << std::endl;
                return;
            }
-
            if(skip_many_configs_with_non_int8_index)
            {
                // test_pooling_test --all only test 5 uint16 cases
                if(num_uint16_case > 5)
                {
+                    show_command();
+                    std::cout << "Warning: Config skipped for the default dataset to speed "
+                                 "up testing (num_uint16_case > 5)"
+                              << std::endl;
                    return;
                }
                ++num_uint16_case;
@@ -566,20 +631,29 @@
            // test_pooling_test --all only test 5 uint32 cases
            if(wsidx == 0)
            {
-                if(num_uint32_case > 5 || spt_dim == 3)
+                if(num_uint32_case > 5)
+                {
+                    show_command();
+                    std::cout << "Warning: Config skipped for the default dataset to speed up "
+                                 "testing (wsidx == 0 && num_uint32_case > 5)"
+                              << std::endl;
                    return;
-
+                }
                ++num_uint32_case;
            }
            else
            {
                if(num_uint32_case_imgidx > 5)
+                {
+                    show_command();
+                    std::cout << "Warning: Config skipped for the default dataset to speed up "
+                                 "testing (wsidx != 0 && num_uint32_case_imgidx > 5)"
+                              << std::endl;
                    return;
-
+                }
                ++num_uint32_case_imgidx;
            }
        }
-
        idx_sz = sizeof(uint32_t);
        break;
    }
@@ -588,20 +662,30 @@
    {
        if(wsidx == 0)
        {
-            if(num_uint64_case > 5 || spt_dim == 3)
+            if(num_uint64_case > 5)
+            {
+                show_command();
+                std::cout << "Warning: Config skipped for the default dataset to speed up "
+                             "testing (wsidx == 0) && (num_uint64_case > 5)"
+                          << std::endl;
                return;
-
+            }
            ++num_uint64_case;
        }
        else
        {
            if(num_uint64_case_imgidx > 5 && spt_dim == 2)
+            {
+                show_command();
+                std::cout << "Warning: Config skipped to speed up testing of the "
+                             "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 "
+                             "&& spt_dim == 2)"
+                          << std::endl;
                return;
-
+            }
            ++num_uint64_case_imgidx;
        }
    }
-
    idx_sz = sizeof(uint64_t);
    break;
}
@@ -611,42 +695,39 @@
    if(spt_dim != 2 && spt_dim != 3)
    {
+        show_command();
+        std::cout << "Warning: Config skipped because it is not supported " //
+                     "(spt_dim != 2 && spt_dim != 3)"
+                  << std::endl;
        return;
    }
-    filter = miopen::PoolingDescriptor
-    {
-        mode_lookup.at(miopen::ToUpper(mode)),
-#if TEST_PADDING_MODE == 1
-        pmode_lookup.at(miopen::ToUpper(pmode))
-#else
-        miopenPaddingDefault
-#endif
-        ,
-        lens, strides, pads
-    };
-
-    filter.SetIndexType(idx_typ);
-    filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx));
-
    for(int i = 0; i < spt_dim; i++)
-        if(lens[i] >= (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i]))
+        if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i]))
        {
+            show_command();
+            std::cout << "Warning: Config skipped because it is invalid "
+                         "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))"
+                      << std::endl;
            return;
        }
-
-    auto output_desc = filter.GetForwardOutputTensor(input_desc);
-    size_t total_mem = 3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() +
-                       idx_sz * output_desc.GetElementSize(); // estimate based on backward pass
-
-    size_t device_mem = get_handle().GetGlobalMemorySize();
-    if(total_mem >= device_mem)
+    if(full_set)
    {
-        show_command();
-        std::cout << "Config requires " << total_mem
-                  << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem
-                  << " Bytes of memory." << std::endl;
-        return;
+        auto output_desc = filter.GetForwardOutputTensor(input_desc);
+        size_t total_mem =
+            3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() +
+            idx_sz * output_desc.GetElementSize(); // estimate based on backward pass
+
+        size_t device_mem = get_handle().GetGlobalMemorySize();
+        if(total_mem >= device_mem)
+        {
+            show_command();
+            std::cout << "Config skipped because it requires " << total_mem
+                      << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem
+                      << " Bytes of memory." << std::endl;
+            return;
+        }
    }
    std::vector in_dim(input_desc.GetLengths().begin() + 2, input_desc.GetLengths().end());
@@ -691,61 +772,52 @@
            return;
        }
#endif
-        std::vector<int> check_dim(spt_dim);
-        for(int i = 0; i < spt_dim; i++)
+        switch(filter.GetIndexType())
        {
-            check_dim[i] = in_dim[i] + 2 * filter.GetPads()[i] - ker_dim[i];
+        case miopenIndexUint8: {
+            if(spt_dim == 3)
+            {
+                run_impl<uint8_t, 3>();
+            }
+            else
+            {
+                run_impl<uint8_t, 2>();
+            }
+            break;
        }
-
-        if(std::all_of(check_dim.begin(), check_dim.end(), [](int i) { return i > 0; }))
-        {
-            switch(filter.GetIndexType())
+        case miopenIndexUint16: {
+            if(spt_dim == 3)
            {
-            case miopenIndexUint8: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint8_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint8_t, 2>();
-                }
-                break;
+                run_impl<uint16_t, 3>();
            }
-            case miopenIndexUint16: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint16_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint16_t, 2>();
-                }
-                break;
+            else
+            {
+                run_impl<uint16_t, 2>();
            }
-            case miopenIndexUint32: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint32_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint32_t, 2>();
-                }
-                break;
+            break;
+        }
+        case miopenIndexUint32: {
+            if(spt_dim == 3)
+            {
+                run_impl<uint32_t, 3>();
            }
-            case miopenIndexUint64: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint64_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint64_t, 2>();
-                }
-                break;
+            else
+            {
+                run_impl<uint32_t, 2>();
+            }
+            break;
+        }
+        case miopenIndexUint64: {
+            if(spt_dim == 3)
+            {
+                run_impl<uint64_t, 3>();
            }
+            else
+            {
+                run_impl<uint64_t, 2>();
            }
+            break;
+        }
        }
    }
};

From e9697654bf154cee714f3cb06aaa4f949f2bc19f Mon Sep 17 00:00:00 2001
From: Sam Wu
Date: Tue, 19 Sep 2023 12:06:36 -0600
Subject: [PATCH 04/36] [Doc] Update read the docs yaml configuration with
 build.os (#2398)

---
 .readthedocs.yaml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 43a0890c96..2c1c2064c5 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,6 +9,10 @@ sphinx:
 formats: [htmlzip]
 python:
-  version: "3.8"
   install:
     - requirements: docs/.sphinx/requirements.txt
+
+build:
+  os: ubuntu-20.04
+  tools:
+    python: "3.8"

From 91ccb03aec691eb5f915b2c381b77af09b31e2a5 Mon Sep 17 00:00:00 2001
From: Seungman Han <120356720+seungmanhan@users.noreply.github.com>
Date: Wed,
20 Sep 2023 03:13:51 +0900 Subject: [PATCH 05/36] Add support for layernorm primitive (#2320) --- driver/driver.hpp | 12 +- driver/layernorm_driver.hpp | 429 +++++++++++++++++++++++++++++++ driver/main.cpp | 17 ++ driver/mloLayerNormHost.hpp | 91 +++++++ driver/tensor_driver.hpp | 12 +- include/miopen/miopen.h | 63 ++++- src/CMakeLists.txt | 3 + src/include/miopen/layernorm.hpp | 57 ++++ src/kernels/MIOpenLayerNorm.cpp | 119 +++++++++ src/layer_norm.cpp | 135 ++++++++++ src/layernorm_api.cpp | 137 ++++++++++ test/cpu_layernorm.hpp | 83 ++++++ test/gtest/layernorm_test.cpp | 38 +++ test/gtest/layernorm_test.hpp | 247 ++++++++++++++++++ 14 files changed, 1430 insertions(+), 13 deletions(-) create mode 100644 driver/layernorm_driver.hpp create mode 100644 driver/mloLayerNormHost.hpp create mode 100644 src/include/miopen/layernorm.hpp create mode 100644 src/kernels/MIOpenLayerNorm.cpp create mode 100644 src/layer_norm.cpp create mode 100644 src/layernorm_api.cpp create mode 100644 test/cpu_layernorm.hpp create mode 100644 test/gtest/layernorm_test.cpp create mode 100644 test/gtest/layernorm_test.hpp diff --git a/driver/driver.hpp b/driver/driver.hpp index 8e15894705..0862652cd5 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -150,7 +150,11 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16,fp64]\n"); + "tensorop[fp16], reduce[fp16,fp64]" +#ifdef MIOPEN_BETA_API + ", layernorm[bf16, fp16, fp32]" +#endif + "\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -171,7 +175,11 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && - arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "--version") + arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && +#ifdef MIOPEN_BETA_API + arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && +#endif + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/layernorm_driver.hpp b/driver/layernorm_driver.hpp new file mode 100644 index 0000000000..8251472625 --- /dev/null +++ b/driver/layernorm_driver.hpp @@ -0,0 +1,429 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include
+#ifdef MIOPEN_BETA_API
+#ifndef GUARD_MIOPEN_LAYERNORM_DRIVER_HPP
+#define GUARD_MIOPEN_LAYERNORM_DRIVER_HPP
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloLayerNormHost.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include <../test/verify.hpp>
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <../test/tensor_holder.hpp>
+#include "random.hpp"
+
+template <typename Tgpu, typename Tref>
+class LayerNormDriver : public Driver
+{
+public:
+    LayerNormDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&weightDesc);
+        miopenCreateTensorDescriptor(&biasDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&meanDesc);
+        miopenCreateTensorDescriptor(&rstdDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+    std::vector<int> GetInputTensorLengthsFromCmdLine();
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~LayerNormDriver() override
+    {
+
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(weightDesc);
+        miopenDestroyTensorDescriptor(biasDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(meanDesc);
+        miopenDestroyTensorDescriptor(rstdDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+    int dim_size;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t weightDesc;
+    miopenTensorDescriptor_t biasDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t meanDesc;
+    miopenTensorDescriptor_t rstdDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> weight_dev;
+    std::unique_ptr<GPUMem> bias_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> mean_dev;
+    std::unique_ptr<GPUMem> rstd_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> weight;
+    std::vector<Tgpu> bias;
+    std::vector<Tgpu> out;
+    std::vector<Tgpu> mean;
+    std::vector<Tgpu> rstd;
+    std::vector<Tref> outhost;
+    std::vector<Tref> meanhost;
+    std::vector<Tref> rstdhost;
+
+    float eps;
+    int dim;
+    miopenLayerNormMode_t mode;
+};
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::GetandSetData()
+{
+    std::vector<int> in_len = GetInputTensorLengthsFromCmdLine();
+
+    dim = static_cast<int>(inflags.GetValueDouble("normalized_dim"));
+
+    std::vector<int> inner_len;
+    if(dim == in_len.size())
+        inner_len = {1};
+    else
+        inner_len = {in_len.begin() + dim, in_len.end()};
+
+    std::vector<int> outer_len;
+    if(dim == 0)
+        outer_len = {1};
+    else
+        outer_len = {in_len.begin(), in_len.end() - (in_len.size() - dim)};
+
+    SetTensorNd(inputDesc, in_len, data_type);
+    SetTensorNd(weightDesc, inner_len, data_type);
+    SetTensorNd(biasDesc, inner_len, data_type);
+    SetTensorNd(outputDesc, in_len, data_type);
+    SetTensorNd(meanDesc, outer_len, data_type);
+    SetTensorNd(rstdDesc, outer_len, data_type);
+
+    eps  = static_cast<float>(inflags.GetValueDouble("eps"));
+    mode = miopenLayerNormMode_t(inflags.GetValueInt("mode"));
+
+    return (0);
+}
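[To make the normalized_dim bookkeeping above concrete (illustrative values, not part of the patch): the flag splits the input lengths at position dim, the prefix becoming the per-slice statistics shape and the suffix the normalized shape:

    std::vector<int> in_len{100, 3, 32, 32}; // NCHW input, e.g. -n 100 -c 3 -H 32 -W 32
    int dim = 2;                             // normalized_dim
    std::vector<int> outer_len(in_len.begin(), in_len.begin() + dim); // {100, 3}: mean/rstd shape
    std::vector<int> inner_len(in_len.begin() + dim, in_len.end());   // {32, 32}: weight/bias shape
]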
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag("forw", 'F', "1", "Run only Forward LayerNorm (Default=1)", "int");
+    inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int");
+    inflags.AddInputFlag("in_channels", 'c', "3", "Number of Input Channels (Default=3)", "int");
+    inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int");
+    inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int");
+    inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int");
+
+    inflags.AddInputFlag("eps", 'e', "0.00001", "Epsilon (Default=0.00001)", "double");
+    inflags.AddInputFlag("normalized_dim", 'o', "3", "Normalized Dim (Default=3)", "int");
+    inflags.AddInputFlag(
+        "mode", 'm', "0", "elementwise affine mode (0), weight and bias mode (1) (Default=0)", "int");
+
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int> LayerNormDriver<Tgpu, Tref>::GetInputTensorLengthsFromCmdLine()
+{
+    int in_n = inflags.GetValueInt("batchsize");
+    int in_c = inflags.GetValueInt("in_channels");
+    int in_w = inflags.GetValueInt("in_w");
+    int in_h = inflags.GetValueInt("in_h");
+    int in_d = inflags.GetValueInt("in_d");
+
+    if(in_h != 0)
+    {
+        if(in_d != 0)
+        {
+            dim_size = 5;
+            return std::vector<int>({in_n, in_c, in_d, in_h, in_w});
+        }
+        else
+        {
+            dim_size = 4;
+            return std::vector<int>({in_n, in_c, in_h, in_w});
+        }
+    }
+    else
+    {
+        dim_size = 3;
+        return std::vector<int>({in_n, in_c, in_w});
+    }
+}
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t in_sz     = GetTensorSize(inputDesc);
+    size_t weight_sz = GetTensorSize(weightDesc);
+    size_t bias_sz   = GetTensorSize(biasDesc);
+    size_t out_sz    = GetTensorSize(outputDesc);
+    size_t mean_sz   = GetTensorSize(meanDesc);
+    size_t rstd_sz   = GetTensorSize(rstdDesc);
+
+    // MIOPEN_BACKEND_HIP
+    uint32_t ctx = 0;
+
+    in_dev     = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
+    weight_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, weight_sz, sizeof(Tgpu)));
+    bias_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, bias_sz, sizeof(Tgpu)));
+    out_dev    = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(Tgpu)));
+    mean_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, mean_sz, sizeof(Tgpu)));
+    rstd_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, rstd_sz, sizeof(Tgpu)));
+
+    in       = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
+    weight   = std::vector<Tgpu>(weight_sz, static_cast<Tgpu>(0));
+    bias     = std::vector<Tgpu>(bias_sz, static_cast<Tgpu>(0));
+    out      = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
+    mean     = std::vector<Tgpu>(mean_sz, static_cast<Tgpu>(0));
+    rstd     = std::vector<Tgpu>(rstd_sz, static_cast<Tgpu>(0));
+    outhost  = std::vector<Tref>(out_sz, static_cast<Tref>(0));
+    meanhost = std::vector<Tref>(mean_sz, static_cast<Tref>(0));
+    rstdhost = std::vector<Tref>(rstd_sz, static_cast<Tref>(0));
+
+    // MIOPEN_BACKEND_HIP
+    int status;
+
+    for(int i = 0; i < in_sz; i++)
+    {
+        in[i] = RAN_GEN<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    }
+    status = in_dev->ToGPU(q, in.data());
+
+    for(int i = 0; i < weight_sz; i++)
+    {
+        weight[i] = RAN_GEN<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    }
+
status = weight_dev->ToGPU(q, weight.data()); + + for(int i = 0; i < bias_sz; i++) + { + bias[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + } + status = bias_dev->ToGPU(q, bias.data()); + + status |= out_dev->ToGPU(q, out.data()); + status |= mean_dev->ToGPU(q, mean.data()); + status |= rstd_dev->ToGPU(q, rstd.data()); + + if(status != CL_SUCCESS) + printf("Error copying data to GPU\n"); + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenLayerNormForward(GetHandle(), + mode, + inputDesc, + in_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + biasDesc, + bias_dev->GetMem(), + eps, + dim, + outputDesc, + out_dev->GetMem(), + meanDesc, + mean_dev->GetMem(), + rstdDesc, + rstd_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward LayerNorm Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward LayerNorm Elapsed: %f ms\n", kernel_average_time); + } + + out_dev->FromGPU(GetStream(), out.data()); + mean_dev->FromGPU(GetStream(), mean.data()); + rstd_dev->FromGPU(GetStream(), rstd.data()); + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::RunForwardCPU() +{ + mloLayerNormForwardRunHost(inputDesc, + in.data(), + weight.data(), + bias.data(), + outhost.data(), + meanhost.data(), + rstdhost.data(), + eps, + dim, + mode); + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::RunBackwardGPU() +{ + return miopenStatusSuccess; +} + +template +Tref LayerNormDriver::GetTolerance() +{ + if(data_type == miopenHalf) + { + return 1e-3; + } + else if(data_type == miopenFloat) + { + return 5e-5; + } + else if(data_type == miopenDouble) + { + return 1e-10; + } + else if(data_type == miopenBFloat16) + { + return 5e-3; + } + return 0; +} + +template +int LayerNormDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(outhost, out); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward LayerNorm FAILED: " << error << std::endl; + } + else + { + printf("Forward LayerNorm Verifies on CPU and GPU (err=%f)\n", error); + } + + auto meanerror = miopen::rms_range(meanhost, mean); + if(!std::isfinite(meanerror) || meanerror > tolerance) + { + std::cout << "Forward LayerNorm mean FAILED: " << meanerror << std::endl; + } + else + { + printf("Forward LayerNorm mean Verifies on CPU and GPU (err=%f)\n", meanerror); + } + + auto rstderror = miopen::rms_range(rstdhost, rstd); + if(!std::isfinite(rstderror) || rstderror > tolerance) + { + std::cout << "Forward LayerNorm rstd FAILED: " << rstderror << std::endl; + } + else + { + printf("Forward LayerNorm rstd Verifies on CPU and GPU (err=%f)\n", rstderror); + } + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::VerifyBackward() +{ + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_SOFTMAX_DRIVER_HPP +#endif diff --git a/driver/main.cpp b/driver/main.cpp index abdefc34a3..79e52e5e38 100644 --- a/driver/main.cpp +++ b/driver/main.cpp @@ -43,6 +43,9 
@@ #include "reduce_driver.hpp" #include #include +#ifdef MIOPEN_BETA_API +#include "layernorm_driver.hpp" +#endif int main(int argc, char* argv[]) { @@ -196,6 +199,20 @@ int main(int argc, char* argv[]) { drv = new ReduceDriver(); } +#ifdef MIOPEN_BETA_API + else if(base_arg == "layernorm") + { + drv = new LayerNormDriver(); + } + else if(base_arg == "layernormfp16") + { + drv = new LayerNormDriver(); + } + else if(base_arg == "layernormbfp16") + { + drv = new LayerNormDriver(); + } +#endif else { printf("Incorrect BaseArg\n"); diff --git a/driver/mloLayerNormHost.hpp b/driver/mloLayerNormHost.hpp new file mode 100644 index 0000000000..5c504f8068 --- /dev/null +++ b/driver/mloLayerNormHost.hpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifdef MIOPEN_BETA_API +#ifndef MLO_LAYERNORMHOST_H_ +#define MLO_LAYERNORMHOST_H_ + +//////////////////////////////////////////////////////////// +// +/////////////////////////////////////////////////////////// + +template +int32_t mloLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc, + Tgpu* input, + Tgpu* weight, + Tgpu* bias, + Tcheck* outputhost, + Tcheck* meanhost, + Tcheck* rstdhost, + float eps, + int32_t normalized_dim, + miopenLayerNormMode_t mode) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + size_t i = 0; + for(; i < normalized_dim; i++) + { + outer_size *= dims[i]; + } + + for(; i < dims.size(); i++) + { + inner_size *= dims[i]; + } + + int32_t ret = 0; + + for(int32_t o = 0; o < outer_size; o++) + { + Tcheck pmean = 0.0f; + Tcheck pvar = 0.0f; + for(i = 0; i < inner_size; i++) + { + Tcheck tmp = static_cast(input[o * inner_size + i]); + pmean += tmp; + pvar += tmp * tmp; + } + + pmean = pmean / inner_size; + pvar = pvar / inner_size - pmean * pmean; + Tcheck prstd = 1.0f / sqrt(pvar + eps); + + meanhost[o] = pmean; + rstdhost[o] = prstd; + + for(i = 0; i < inner_size; i++) + { + Tcheck pweight = mode ? 1 : static_cast(weight[i]); + Tcheck pbias = mode ? 
0 : static_cast(bias[i]); + outputhost[o * inner_size + i] = + (static_cast(input[o * inner_size + i]) - pmean) * prstd * pweight + pbias; + } + } + return ret; +} +#endif +#endif diff --git a/driver/tensor_driver.hpp b/driver/tensor_driver.hpp index 077cac4003..cb3139bf48 100644 --- a/driver/tensor_driver.hpp +++ b/driver/tensor_driver.hpp @@ -74,16 +74,8 @@ inline std::size_t GetTensorVectorLength(const miopenTensorDescriptor_t& tensor) int size = 0; miopenGetTensorDescriptorSize(tensor, &size); - if(size == 4 || size == 5) - { - miopenGetNdTensorDescriptorVectorLength(tensor, &vectorLength); - return vectorLength; - } - else - { - MIOPEN_THROW("We only support 4D layout in vector format"); - } - return 0; + miopenGetNdTensorDescriptorVectorLength(tensor, &vectorLength); + return vectorLength; } inline std::vector GetTensorLengths(const miopenTensorDescriptor_t& tensor) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index fcbc60a0b2..f0c0ce1aa6 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -53,6 +53,7 @@ * @defgroup convolutions * @defgroup pooling * @defgroup handle + * @defgroup layernorm * @defgroup LRN * @defgroup batchnorm * @defgroup activation @@ -455,7 +456,18 @@ typedef enum miopenLRNWithinChannel = 0, /*!< Channel independent */ miopenLRNCrossChannel = 1, /*!< Cross Channel */ } miopenLRNMode_t; - +#ifdef MIOPEN_BETA_API +/*! @ingroup layernorm + * @enum miopenLayerNormAlgorithm_t + * LayerNorm implementation algorithms + */ +typedef enum +{ + MIOPEN_ELEMENTWISE_AFFINE = 0, /*!< initialized to ones for weights and zeros for biases */ + MIOPEN_WEIGHT_BIAS = + 1, /*!< learnable weights and biases of the module of shape normalized_shape */ +} miopenLayerNormMode_t; +#endif /*! @ingroup batchnorm * @enum miopenBatchNormMode_t * Batch Normalization layer mode @@ -2453,6 +2465,55 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyLRNDescriptor(miopenLRNDescriptor_t lr /** @} */ // CLOSEOUT LRN DOXYGEN GROUP +#ifdef MIOPEN_BETA_API +// LayerNorm APIs +/** @addtogroup layernorm + * + * @{ + */ +/*! @brief Execute a layernorm forward layer + * + * This API only implements the LAYERNORM_MODE_CHANNEL in LAYERNORM_ACCURATE path. 
@@ -2453,6 +2465,55 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyLRNDescriptor(miopenLRNDescriptor_t lr
 /** @} */
 // CLOSEOUT LRN DOXYGEN GROUP
 
+#ifdef MIOPEN_BETA_API
+// LayerNorm APIs
+/** @addtogroup layernorm
+ *
+ *  @{
+ */
+/*! @brief Execute a layernorm forward layer
+ *
+ * This API only implements the LAYERNORM_MODE_CHANNEL in LAYERNORM_ACCURATE path.
+ *
+ * @param handle         MIOpen handle (input)
+ * @param mode           LayerNorm mode (input)
+ * @param xDesc          Tensor descriptor for data input tensor x (input)
+ * @param x              Data tensor x (input)
+ * @param weightDesc     Tensor descriptor for data input tensor weight (input)
+ * @param weight         Data tensor weight (input)
+ * @param biasDesc       Tensor descriptor for data input tensor bias (input)
+ * @param bias           Data tensor bias (input)
+ * @param epsilon        Value to stabilize inverse variance calculation (input)
+ * @param normalized_dim Normalized dimensions in the input array (input)
+ * @param yDesc          Tensor descriptor for output data tensor y (input)
+ * @param y              Data tensor y (output)
+ * @param meanDesc       Tensor descriptor for output data tensor mean (input)
+ * @param mean           Data tensor mean (output)
+ * @param rstdDesc       Tensor descriptor for output data tensor rstd (input)
+ * @param rstd           Data tensor rstd (output)
+ * @return               miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenLayerNormForward(miopenHandle_t handle,
+                                                    miopenLayerNormMode_t mode,
+                                                    const miopenTensorDescriptor_t xDesc,
+                                                    const void* x,
+                                                    const miopenTensorDescriptor_t weightDesc,
+                                                    const void* weight,
+                                                    const miopenTensorDescriptor_t biasDesc,
+                                                    const void* bias,
+                                                    const float epsilon,
+                                                    const int32_t normalized_dim,
+                                                    const miopenTensorDescriptor_t yDesc,
+                                                    void* y,
+                                                    const miopenTensorDescriptor_t meanDesc,
+                                                    void* mean,
+                                                    const miopenTensorDescriptor_t rstdDesc,
+                                                    void* rstd);
+
+/** @} */
+// CLOSEOUT LAYERNORM DOXYGEN GROUP
+#endif
+
 // Batch-Normalization APIs
 /** @addtogroup batchnorm
  *
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 58ff101c33..ef7ba9558a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -127,6 +127,7 @@ set( MIOpen_Source
     load_file.cpp
     lock_file.cpp
     logger.cpp
+    layernorm_api.cpp
     lrn_api.cpp
     op_args.cpp
     operator.cpp
@@ -413,6 +414,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenConvDirUni.cl
         kernels/MIOpenConvDirBatchNormActiv.cl
         kernels/MIOpenConvDirGenFwd.cl
+        kernels/MIOpenLayerNorm.cpp
         kernels/MIOpenLRNBwd.cl
         kernels/MIOpenLRNFwd.cl
         kernels/MIOpenNeuron.cl
@@ -552,6 +554,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         hip/hip_build_utils.cpp
         hip/batched_transpose_sol.cpp
         hip/general_tensor_reorder_sol.cpp
+        layer_norm.cpp
         pooling.cpp
         ocl/fusionopconvocl.cpp
         ocl/fusionopbiasbnactivocl.cpp
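
With the declaration and build wiring above in place, a caller can drive the new entry
point as follows. This is a hedged usage sketch: the wrapper name, epsilon, and
normalized_dim values are illustrative, descriptor creation and device allocation are
left to the caller, and the build must define MIOPEN_BETA_API.

    #include <miopen/miopen.h>

    // Hedged usage sketch for miopenLayerNormForward. All descriptors and
    // device buffers are assumed to be created/filled by the caller.
    miopenStatus_t RunLayerNormFwd(miopenHandle_t handle,
                                   miopenTensorDescriptor_t xDesc, const void* x,
                                   miopenTensorDescriptor_t wDesc, const void* w,
                                   miopenTensorDescriptor_t bDesc, const void* b,
                                   miopenTensorDescriptor_t yDesc, void* y,
                                   miopenTensorDescriptor_t meanDesc, void* mean,
                                   miopenTensorDescriptor_t rstdDesc, void* rstd)
    {
        return miopenLayerNormForward(handle,
                                      MIOPEN_ELEMENTWISE_AFFINE,
                                      xDesc, x,
                                      wDesc, w,
                                      bDesc, b,
                                      /*epsilon=*/1e-5f,
                                      /*normalized_dim=*/1,
                                      yDesc, y,
                                      meanDesc, mean,
                                      rstdDesc, rstd);
    }
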
diff --git a/src/include/miopen/layernorm.hpp b/src/include/miopen/layernorm.hpp
new file mode 100644
index 0000000000..8ec2d96055
--- /dev/null
+++ b/src/include/miopen/layernorm.hpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include
+#ifdef MIOPEN_BETA_API
+#ifndef MIOPEN_LAYERNORM_HPP_
+#define MIOPEN_LAYERNORM_HPP_
+
+#include
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+miopenStatus_t LayerNormForward(const Handle& handle,
+                                const TensorDescriptor& xDesc,
+                                ConstData_t x,
+                                const TensorDescriptor& weightDesc,
+                                ConstData_t weight,
+                                const TensorDescriptor& biasDesc,
+                                ConstData_t bias,
+                                const TensorDescriptor& yDesc,
+                                Data_t y,
+                                const TensorDescriptor& meanDesc,
+                                Data_t mean,
+                                const TensorDescriptor& rstdDesc,
+                                Data_t rstd,
+                                miopenLayerNormMode_t mode,
+                                const float epsilon,
+                                const int32_t normalized_dim);
+
+} // namespace miopen
+#endif // MIOPEN_LAYERNORM_HPP_
+#endif
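
Both the host reference and the kernel partition the tensor lengths at normalized_dim
into an outer (batch-like) extent and an inner (normalized) extent. A small standalone
helper showing the same split (the helper itself is not part of the patch):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Split the tensor lengths at normalized_dim: everything before it is
    // folded into outer_size, everything from it onward into inner_size,
    // matching the loops in mloLayerNormForwardRunHost and LayerNormForward.
    inline std::pair<std::size_t, std::size_t>
    SplitAtNormalizedDim(const std::vector<std::size_t>& dims, std::size_t normalized_dim)
    {
        std::size_t outer_size = 1;
        std::size_t inner_size = 1;
        for(std::size_t i = 0; i < dims.size(); ++i)
            (i < normalized_dim ? outer_size : inner_size) *= dims[i];
        return {outer_size, inner_size};
    }
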
diff --git a/src/kernels/MIOpenLayerNorm.cpp b/src/kernels/MIOpenLayerNorm.cpp
new file mode 100644
index 0000000000..58891d6538
--- /dev/null
+++ b/src/kernels/MIOpenLayerNorm.cpp
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifdef MIOPEN_BETA_API
+
+#include "float_types.h"
+
+//#if MIOPEN_USE_BFP16 == 1
+//#undef FLOAT
+//#define FLOAT hip_bfloat16
+//#endif
+
+extern "C" __global__ void LayernormFwdContiguous(const FLOAT* __restrict__ x,
+                                                  FLOAT* __restrict__ y,
+                                                  const FLOAT* __restrict__ weight,
+                                                  const FLOAT* __restrict__ bias,
+                                                  FLOAT* __restrict__ mean,
+                                                  FLOAT* __restrict__ rstd,
+                                                  float eps,
+                                                  uint64_t inner_size,
+                                                  bool mode)
+{
+    /*
+     * Each group works on a single channel.
+     * Example)
+     *  x dim = {N, C, L}, normalized shape = {C, L}
+     *  outer_size = N, inner_size = C * L
+     *
+     * Example2)
+     *  x dim = {N, C, L}, normalized shape = {L}
+     *  outer_size = N * C, inner_size = L
+     *
+     * => gws = {outer_size * LOCAL_SIZE}, lws = {LOCAL_SIZE}
+     */
+
+    /*
+     * Reduction to calculate mean and rstd
+     */
+
+    const uint64_t gid = blockIdx.x;
+    const uint64_t lid = threadIdx.x;
+
+    FLOAT_ACCUM pmean = CVT_FLOAT2ACCUM(0);
+    FLOAT_ACCUM pvar  = CVT_FLOAT2ACCUM(0);
+    __shared__ FLOAT_ACCUM ltmp1[LOCAL_SIZE];
+    __shared__ FLOAT_ACCUM ltmp2[LOCAL_SIZE];
+
+    // reduce sum for mean and var
+    for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE)
+    {
+        uint64_t x_idx = gid * inner_size + i;
+
+        FLOAT_ACCUM tmp = CVT_FLOAT2ACCUM(x[x_idx]);
+        pmean += tmp;
+        pvar += tmp * tmp;
+    }
+
+    ltmp1[lid] = pmean;
+    ltmp2[lid] = pvar;
+    __syncthreads();
+    for(uint64_t i = LOCAL_SIZE >> 1; i > 0; i >>= 1)
+    {
+        if(lid < i)
+        {
+            ltmp1[lid] += ltmp1[lid + i];
+            ltmp2[lid] += ltmp2[lid + i];
+        }
+        __syncthreads();
+    }
+    pmean             = ltmp1[0] / inner_size;
+    pvar              = ltmp2[0] / inner_size - pmean * pmean;
+    FLOAT_ACCUM prstd = rsqrt(pvar + FLOAT_ACCUM(eps));
+
+    if(lid == 0)
+    {
+        if(mean)
+            mean[gid] = CVT_ACCUM2FLOAT(pmean);
+        if(rstd)
+            rstd[gid] = CVT_ACCUM2FLOAT(prstd);
+    }
+
+    // forward calculation
+    for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE)
+    {
+        uint64_t idx = gid * inner_size + i;
+
+        FLOAT_ACCUM pweight;
+        FLOAT_ACCUM pbias;
+
+        pweight = mode ? CVT_FLOAT2ACCUM(1) : CVT_FLOAT2ACCUM(weight[i]);
+        pbias   = mode ? CVT_FLOAT2ACCUM(0) : CVT_FLOAT2ACCUM(bias[i]);
+
+        FLOAT_ACCUM val = (CVT_FLOAT2ACCUM(x[idx]) - pmean) * prstd * pweight + pbias;
+        y[idx]          = CVT_ACCUM2FLOAT(val);
+    }
+}
+#endif
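
The kernel above obtains the variance in a single pass via Var[x] = E[x^2] - E[x]^2 and
then combines per-thread partial sums with a power-of-two tree reduction in shared
memory. The same arithmetic on the host, reduced to one outer slice (a sketch for
reference, not part of the patch):

    #include <cmath>
    #include <cstddef>

    // Single-pass mean/rstd over one outer slice, mirroring the kernel's math.
    void mean_rstd(const float* x, std::size_t inner_size, float eps,
                   float& mean, float& rstd)
    {
        float sum = 0.f, sumsq = 0.f;
        for(std::size_t i = 0; i < inner_size; ++i)
        {
            sum += x[i];
            sumsq += x[i] * x[i];
        }
        mean            = sum / inner_size;
        const float var = sumsq / inner_size - mean * mean; // E[x^2] - E[x]^2
        rstd            = 1.f / std::sqrt(var + eps);
    }

The single-pass form trades a second read of x for the E[x^2] - E[x]^2 identity, which
can lose precision when the mean is large relative to the spread; accumulating in
FLOAT_ACCUM, as the kernel does, limits that loss.
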
diff --git a/src/layer_norm.cpp b/src/layer_norm.cpp
new file mode 100644
index 0000000000..3d52bc771f
--- /dev/null
+++ b/src/layer_norm.cpp
@@ -0,0 +1,135 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include
+#ifdef MIOPEN_BETA_API
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+miopenStatus_t LayerNormForward(const Handle& handle,
+                                const TensorDescriptor& xDesc,
+                                ConstData_t x,
+                                const TensorDescriptor& weightDesc,
+                                ConstData_t weight,
+                                const TensorDescriptor& biasDesc,
+                                ConstData_t bias,
+                                const TensorDescriptor& yDesc,
+                                Data_t y,
+                                const TensorDescriptor& meanDesc,
+                                Data_t mean,
+                                const TensorDescriptor& rstdDesc,
+                                Data_t rstd,
+                                miopenLayerNormMode_t mode,
+                                const float epsilon,
+                                const int32_t normalized_dim)
+{
+    if(x == nullptr || y == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor.");
+    }
+
+    if(xDesc.GetType() != yDesc.GetType())
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match.");
+    }
+
+    if(xDesc.GetLengths() != yDesc.GetLengths())
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match.");
+    }
+
+    bool is_all_packed = xDesc.IsPacked() && weightDesc.IsPacked() && biasDesc.IsPacked() &&
+                         yDesc.IsPacked() && meanDesc.IsPacked() && rstdDesc.IsPacked();
+
+    if(!is_all_packed)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "All tensors must be packed.");
+    }
+
+    auto dims         = xDesc.GetLengths();
+    size_t grid_size  = 1;
+    size_t outer_size = 1;
+    size_t inner_size = 1;
+    size_t i          = 0;
+    for(; i < normalized_dim; i++)
+    {
+        outer_size *= dims[i];
+        grid_size *= dims[i];
+    }
+
+    for(; i < dims.size(); i++)
+    {
+        inner_size *= dims[i];
+        grid_size *= dims[i];
+    }
+
+    auto dtype = xDesc.GetType();
+
+    const std::vector<size_t> vld{LOCAL_SIZE, 1, 1};
+    const std::vector<size_t> vgd{outer_size * vld[0], 1, 1};
+
+    std::string algo_name = "LayerNormForward";
+    std::string network_config =
+        "lnfwd-dtype" + std::to_string(static_cast<int32_t>(dtype)) + "g" + std::to_string(vgd[0]) +
+        "l" + std::to_string(vld[0]) + "normalized_dim" + std::to_string(normalized_dim) + "grid" +
+        std::to_string(grid_size) + "outer_size" + std::to_string(outer_size) + "inner_size" +
+        std::to_string(inner_size) + "mode" + std::to_string(static_cast<int32_t>(mode)) + "eps" +
+        std::to_string(static_cast<float>(epsilon));
+
+    std::string program_name = "MIOpenLayerNorm.cpp";
+    std::string kernel_name  = "LayernormFwdContiguous";
+
+    // compile parameters
+    std::string parms =
+        " -DMIOPEN_USE_FP16=" + std::to_string(static_cast<int>(dtype == miopenHalf)) +
+        " -DMIOPEN_USE_FP32=" + std::to_string(static_cast<int>(dtype == miopenFloat)) +
+        " -DMIOPEN_USE_FP64=" + std::to_string(static_cast<int>(dtype == miopenDouble)) +
+        " -DMIOPEN_USE_BFP16=" + std::to_string(static_cast<int>(dtype == miopenBFloat16));
+
+    parms += " -DMIOPEN_BETA_API=1";
+    parms += " -DLOCAL_SIZE=" + std::to_string(LOCAL_SIZE);
+
+    auto&& kernels = handle.GetKernels(algo_name, network_config);
+    if(!kernels.empty())
+    {
+        kernels.front()(x, y, weight, bias, mean, rstd, epsilon, inner_size, mode);
+    }
+    else
+    {
+        handle.AddKernel(algo_name, network_config, program_name, kernel_name, vld, vgd, parms)(
+            x, y, weight, bias, mean, rstd, epsilon, inner_size, mode);
+    }
+
+    return miopenStatusSuccess;
+}
+
+} // namespace miopen
+#endif
diff --git a/src/layernorm_api.cpp b/src/layernorm_api.cpp
new file mode 100644
index 0000000000..1c8f8d0cca
--- /dev/null
+++ b/src/layernorm_api.cpp
@@ -0,0 +1,137 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#ifdef MIOPEN_BETA_API +#include +#include +#include +#include + +static void +LogCmdLayerNorm(const miopenTensorDescriptor_t xDesc, const miopenLayerNormMode_t mode, bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "layernormfp16"; + } + else if(dtype == miopenFloat) + { + ss << "layernormfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "layernormbf16"; + } + else if(dtype == miopenDouble) + { + ss << "layernormfp64"; + } + + int32_t size = {0}; + miopenGetTensorDescriptorSize(xDesc, &size); + ss << " -n " << miopen::deref(xDesc).GetLengths()[0] << " -c " + << miopen::deref(xDesc).GetLengths()[1]; + if(size == 5) + { + ss << " -D " << miopen::deref(xDesc).GetLengths()[2] << " -H " + << miopen::deref(xDesc).GetLengths()[3] << " -W " + << miopen::deref(xDesc).GetLengths()[4]; + } + else if(size == 4) + { + ss << " -H " << miopen::deref(xDesc).GetLengths()[2] << " -W " + << miopen::deref(xDesc).GetLengths()[3]; + } + else if(size == 3) + { + ss << " -W " << miopen::deref(xDesc).GetLengths()[2]; + } + + ss << " -F " << ((is_fwd) ? 
"1" : "2") << " -m " << mode; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenLayerNormForward(miopenHandle_t handle, + miopenLayerNormMode_t mode, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t biasDesc, + const void* bias, + const float epsilon, + const int32_t normalized_dim, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t meanDesc, + void* mean, + const miopenTensorDescriptor_t rstdDesc, + void* rstd) +{ + MIOPEN_LOG_FUNCTION(handle, + mode, + xDesc, + x, + weightDesc, + weight, + biasDesc, + bias, + epsilon, + normalized_dim, + yDesc, + y, + meanDesc, + mean, + rstdDesc, + rstd); + + LogCmdLayerNorm(xDesc, mode, true); + return miopen::try_([&] { + miopen::LayerNormForward(miopen::deref(handle), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(biasDesc), + DataCast(bias), + miopen::deref(yDesc), + DataCast(y), + miopen::deref(meanDesc), + DataCast(mean), + miopen::deref(rstdDesc), + DataCast(rstd), + mode, + epsilon, + normalized_dim); + }); +} +#endif diff --git a/test/cpu_layernorm.hpp b/test/cpu_layernorm.hpp new file mode 100644 index 0000000000..08cf44368e --- /dev/null +++ b/test/cpu_layernorm.hpp @@ -0,0 +1,83 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifdef MIOPEN_BETA_API +#ifndef GUARD_CPU_LAYERNORM_HPP +#define GUARD_CPU_LAYERNORM_HPP + +#include "tensor_holder.hpp" + +template +void cpu_layernorm_forward(tensor input, + tensor weight, + tensor bias, + tensor& ref_output, + tensor& ref_mean, + tensor& ref_rstd, + float eps, + int32_t dim, + miopenLayerNormMode_t mode) +{ + auto dims = input.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + size_t i = 0; + for(; i < dim; i++) + { + outer_size *= dims[i]; + } + + for(; i < dims.size(); i++) + { + inner_size *= dims[i]; + } + + par_ford(outer_size)([&](int32_t o) { + T mean_v = 0.0f; + T var_v = 0.0f; + + ford(inner_size)([&](int32_t i) { + T tmp = input[o * inner_size + i]; + mean_v += tmp; + var_v += tmp * tmp; + }); + + mean_v = mean_v / inner_size; + var_v = var_v / inner_size - mean_v * mean_v; + T rstd_v = 1.0f / sqrt(var_v + eps); + + ref_mean[o] = mean_v; + ref_rstd[o] = rstd_v; + + ford(inner_size)([&](int32_t i) { + T weight_v = mode ? 1 : weight[i]; + T bias_v = mode ? 0 : bias[i]; + ref_output[o * inner_size + i] = + (input[o * inner_size + i] - mean_v) * rstd_v * weight_v + bias_v; + }); + }); +} +#endif +#endif diff --git a/test/gtest/layernorm_test.cpp b/test/gtest/layernorm_test.cpp new file mode 100644 index 0000000000..d60bfe963c --- /dev/null +++ b/test/gtest/layernorm_test.cpp @@ -0,0 +1,38 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "layernorm_test.hpp" +#ifdef MIOPEN_BETA_API + +struct LayerNormSolverTestFloat : LayerNormSolverTest +{ +}; + +TEST_P(LayerNormSolverTestFloat, LayerNormTestFw){}; + +INSTANTIATE_TEST_SUITE_P(LayerNormTestSet, + LayerNormSolverTestFloat, + testing::ValuesIn(LayerNormTestConfigs())); +#endif diff --git a/test/gtest/layernorm_test.hpp b/test/gtest/layernorm_test.hpp new file mode 100644 index 0000000000..740108a887 --- /dev/null +++ b/test/gtest/layernorm_test.hpp @@ -0,0 +1,247 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#ifdef MIOPEN_BETA_API +#include +#include + +#include "tensor_holder.hpp" +#include "cpu_layernorm.hpp" +#include "get_handle.hpp" +#include "../driver/tensor_driver.hpp" +#include "verify.hpp" +#include + +struct LayerNormTestCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + size_t nomalized_dim; + float eps; + miopenLayerNormMode_t ln_mode; + friend std::ostream& operator<<(std::ostream& os, const LayerNormTestCase& tc) + { + return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H + << " W:" << tc.W << " dim:" << tc.nomalized_dim << " eps:" << tc.eps + << " LayerNorm_mode:" << tc.ln_mode; + } + + std::vector GetInput() { return {N, C, D, H, W}; } +}; + +std::vector LayerNormTestConfigs() +{ // n c h d w nomalized_dim eps ln_mode + // clang-format off + return { + { 32, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 2, 32, 57, 125 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 59 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 10, 27 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 4, 6, 11 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 2, 2, 3 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 32, 28, 62 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 4, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 4, 2, 2 ,4 , 1e-5, 
MIOPEN_ELEMENTWISE_AFFINE},
+        { 16,  32,   6,  50,  50, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // Multi-view 3D convnet
+        {  1,   3,   8, 240, 320, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,  16, 240, 320, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,   8, 128, 171, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,  16, 128, 171, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,   8, 112, 112, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,  16, 112, 112, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}    // 3D convnet on video
+    };
+    // clang-format on
+}
+
+inline int32_t SetTensorLayout(miopen::TensorDescriptor& desc)
+{
+    std::vector<std::size_t> lens = desc.GetLengths();
+    std::vector<int> int32_t_lens(lens.begin(), lens.end());
+
+    // set the strides for the tensor
+    return SetTensorNd(&desc, int32_t_lens, desc.GetType());
+}
+
+template <class T>
+struct LayerNormSolverTest : public ::testing::TestWithParam<LayerNormTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle    = get_handle();
+        test_skipped     = false;
+        layernorm_config = GetParam();
+        std::mt19937 gen(0);
+        std::uniform_real_distribution<> d{-3, 3};
+        auto gen_value = [&](auto...) { return d(gen); };
+
+        nomalized_dim = layernorm_config.nomalized_dim;
+        eps           = layernorm_config.eps;
+        ln_mode       = layernorm_config.ln_mode;
+
+        auto in_dim = layernorm_config.GetInput();
+
+        input = tensor<T>{in_dim}.generate(gen_value);
+
+        if(ln_mode == MIOPEN_ELEMENTWISE_AFFINE)
+        {
+            std::vector<size_t> inner_dim;
+            if(nomalized_dim == in_dim.size())
+                inner_dim = {1};
+            else
+                inner_dim = {in_dim.begin() + nomalized_dim, in_dim.end()};
+            weight = tensor<T>{inner_dim}.generate(gen_value);
+            bias   = tensor<T>{inner_dim}.generate(gen_value);
+            SetTensorLayout(weight.desc);
+            SetTensorLayout(bias.desc);
+        }
+
+        std::vector<size_t> outer_dim;
+        if(nomalized_dim == 0)
+            outer_dim = {1};
+        else
+            outer_dim = {in_dim.begin(), in_dim.end() - (in_dim.size() - nomalized_dim)};
+
+        SetTensorLayout(input.desc);
+
+        output = tensor<T>{in_dim};
+        mean   = tensor<T>{outer_dim};
+        rstd   = tensor<T>{outer_dim};
+        SetTensorLayout(output.desc);
+        SetTensorLayout(mean.desc);
+        SetTensorLayout(rstd.desc);
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(mean.begin(), mean.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(rstd.begin(), rstd.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{in_dim};
+        ref_mean   = tensor<T>{outer_dim};
+        ref_rstd   = tensor<T>{outer_dim};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(ref_mean.begin(), ref_mean.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(ref_rstd.begin(), ref_rstd.end(), std::numeric_limits<T>::quiet_NaN());
+
+        input_dev  = handle.Write(input.data);
+        weight_dev = handle.Write(weight.data);
+        bias_dev   = handle.Write(bias.data);
+        output_dev = handle.Write(output.data);
+        mean_dev   = handle.Write(mean.data);
+        rstd_dev   = handle.Write(rstd.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        cpu_layernorm_forward(
+            input, weight, bias, ref_output, ref_mean, ref_rstd, eps, nomalized_dim, ln_mode);
+        miopenStatus_t status;
+
+        status = miopen::LayerNormForward(handle,
+                                          input.desc,
+                                          input_dev.get(),
+                                          weight.desc,
+                                          weight_dev.get(),
+                                          bias.desc,
+                                          bias_dev.get(),
+                                          output.desc,
+                                          output_dev.get(),
+                                          mean.desc,
+                                          mean_dev.get(),
+                                          rstd.desc,
+                                          rstd_dev.get(),
+                                          ln_mode,
+                                          eps,
+                                          nomalized_dim);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
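
The tolerances in the test above scale machine epsilon by empirically chosen factors
and compare ranges through miopen::rms_range. A simplified stand-in for that metric,
shown only to make the thresholds concrete (an assumption about the metric's intent,
not its exact implementation):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Simplified RMS relative error, analogous in spirit to miopen::rms_range.
    double rms_error(const std::vector<float>& ref, const std::vector<float>& out)
    {
        double num = 0.0, den = 0.0;
        for(std::size_t i = 0; i < ref.size(); ++i)
        {
            const double d = static_cast<double>(ref[i]) - out[i];
            num += d * d;
            den += static_cast<double>(ref[i]) * ref[i];
        }
        return std::sqrt(num / (den + 1e-38)); // guard against an all-zero reference
    }
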
+        output.data = handle.Read<T>(output_dev, output.data.size());
+        mean.data   = handle.Read<T>(mean_dev, mean.data.size());
+        rstd.data   = handle.Read<T>(rstd_dev, rstd.data.size());
+
+        double threshold = std::numeric_limits<T>::epsilon();
+        auto error       = miopen::rms_range(ref_output, output);
+
+        EXPECT_TRUE(miopen::range_distance(ref_output) == miopen::range_distance(output));
+        EXPECT_TRUE(error < threshold * 1000) << "Error output beyond tolerance Error:" << error
+                                              << ", Thresholdx1000: " << threshold * 1000;
+
+        error = miopen::rms_range(ref_mean, mean);
+        EXPECT_TRUE(miopen::range_distance(ref_mean) == miopen::range_distance(mean));
+        EXPECT_TRUE(error < threshold)
+            << "Error mean beyond tolerance Error:" << error << ", Threshold: " << threshold;
+
+        error = miopen::rms_range(ref_rstd, rstd);
+        EXPECT_TRUE(miopen::range_distance(ref_rstd) == miopen::range_distance(rstd));
+        EXPECT_TRUE(error < threshold * 2000) << "Error rstd beyond tolerance Error:" << error
+                                              << ", Thresholdx2000: " << threshold * 2000;
+    }
+    LayerNormTestCase layernorm_config;
+
+    tensor<T> input;
+    tensor<T> weight;
+    tensor<T> bias;
+    tensor<T> output;
+    tensor<T> mean;
+    tensor<T> rstd;
+
+    tensor<T> ref_output;
+    tensor<T> ref_mean;
+    tensor<T> ref_rstd;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr weight_dev;
+    miopen::Allocator::ManageDataPtr bias_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr mean_dev;
+    miopen::Allocator::ManageDataPtr rstd_dev;
+
+    size_t nomalized_dim;
+    float eps;
+    miopenLayerNormMode_t ln_mode;
+
+    bool test_skipped = false;
+};
+#endif

From 71f159cbcf300ed88b3c28566c3d0b76d54a1eda Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Tue, 19 Sep 2023 19:06:03 -0500
Subject: [PATCH 06/36] [SWDEV-414487] Enable 2d conv + bias + activ CK kernel
 for MI300 (#2399)

* SWDEV-414487 : enable 2d conv + bias + activ CK kernel for MI300

* SWDEV-414487: fix compilation error
---
 src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
index ed975285ee..9520a79a90 100644
--- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
+++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
@@ -421,7 +421,8 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx,
     if(!conv_problem.Is2d())
         return false;
     const std::string arch = ctx.GetStream().GetDeviceName();
-    if(arch != "gfx908" && arch != "gfx90a")
+    if(arch != "gfx908" && arch != "gfx90a" && arch != "gfx940" && arch != "gfx941" &&
+       arch != "gfx942")
         return false;
     if(!conv_problem.IsLayoutNHWC())
         return false;

From c09dac85714da92fbb2672c3db6e8faa7b70b878 Mon Sep 17 00:00:00 2001
From: Artur Wojcik
Date: Thu, 21 Sep 2023 01:23:17 +0200
Subject: [PATCH 07/36] [Windows] roctracer: disable on Windows (not supported)
 (#2404)

Co-authored-by: Artur Wojcik
---
 src/CMakeLists.txt            | 6 ++++--
 src/include/miopen/logger.hpp | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ef7ba9558a..4ffed2b4c8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -850,8 +850,10 @@ if(NOT WIN32 AND NOT APPLE)
         target_internal_library(MIOpen ${LIBRT})
     endif()
 endif()
-############################################################
-target_link_libraries(MIOpen PRIVATE "-lroctx64")
+
+if(NOT WIN32)
+    target_link_libraries(MIOpen PRIVATE roctx64)
+endif()
############################################################ # Installation diff --git a/src/include/miopen/logger.hpp b/src/include/miopen/logger.hpp index 659906ddb6..dc18eae5c0 100644 --- a/src/include/miopen/logger.hpp +++ b/src/include/miopen/logger.hpp @@ -38,7 +38,9 @@ #include #include +#ifndef _WIN32 #include +#endif // See https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms #define MIOPEN_PP_CAT(x, y) MIOPEN_PP_PRIMITIVE_CAT(x, y) @@ -410,6 +412,7 @@ class LogScopeTime #define MIOPEN_LOG_SCOPE_TIME #endif +#ifndef _WIN32 class LogScopeRoctx { public: @@ -434,6 +437,7 @@ class LogScopeRoctx private: bool m_active{false}; }; +#endif } // namespace miopen From 3825849cd5f63cdb6378c6e3c9cf559a5d2917a8 Mon Sep 17 00:00:00 2001 From: Chris Erb Date: Mon, 25 Sep 2023 09:52:06 -0500 Subject: [PATCH 08/36] [MI200] Refresh kdb using db_sync (#2411) --- src/kernels/gfx90a.kdb.bz2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/gfx90a.kdb.bz2 b/src/kernels/gfx90a.kdb.bz2 index d2c8722d2e..613df18f7a 100644 --- a/src/kernels/gfx90a.kdb.bz2 +++ b/src/kernels/gfx90a.kdb.bz2 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91eaa7412acf3e9a6f23cd70b386037434318a63e1d7be7212979a9ee50fe617 -size 592987974 +oid sha256:3d76d7c53648f4864a5cfe9267e8cb9171abab81de9d1732a9f94bafb0816b61 +size 250548882 From 115686c1f75c115308d0099d79bfa7cc87a965c4 Mon Sep 17 00:00:00 2001 From: Vasilii Filippov Date: Mon, 25 Sep 2023 20:14:06 +0200 Subject: [PATCH 09/36] Removal of convolution context (#2402) --- src/conv/heuristics/ai_heuristics.cpp | 8 +- src/conv/invokers/impl_gemm_dynamic.cpp | 4 +- src/conv/solver_finders.cpp | 22 +- src/convolution.cpp | 43 +- src/fusion.cpp | 2 +- src/include/miopen/any_solver.hpp | 81 +- .../miopen/conv/compiled_in_parameters.hpp | 3 +- src/include/miopen/conv/context.hpp | 24 +- .../miopen/conv/heuristics/ai_heuristics.hpp | 3 +- .../miopen/conv/invokers/impl_gemm.hpp | 2 +- .../conv/invokers/impl_gemm_dynamic.hpp | 5 +- .../miopen/conv/invokers/mlir_impl_gemm.hpp | 2 +- src/include/miopen/conv/solver_finders.hpp | 11 +- src/include/miopen/convolution.hpp | 14 +- src/include/miopen/execution_context.hpp | 5 + src/include/miopen/fusion/context.hpp | 13 +- src/include/miopen/fusion/solvers.hpp | 7 +- src/include/miopen/generic_search.hpp | 4 +- src/include/miopen/mlo_internal.hpp | 36 +- src/include/miopen/solver.hpp | 1147 +++++++---------- .../miopen/solver/conv_direct_naive_conv.hpp | 7 +- .../miopen/solver/implicitgemm_util.hpp | 12 +- src/include/miopen/solver/mlir_common.hpp | 7 +- src/mlo_dir_conv.cpp | 33 +- src/ocl/convolutionocl.cpp | 33 +- src/problem.cpp | 2 +- src/solution.cpp | 2 +- src/solver.cpp | 2 +- src/solver/conv_MP_bidirectional_winograd.cpp | 24 +- src/solver/conv_asm_1x1u.cpp | 19 +- src/solver/conv_asm_1x1u_stride2.cpp | 10 +- src/solver/conv_asm_3x3u.cpp | 11 +- src/solver/conv_asm_dir_BwdWrW1x1.cpp | 16 +- src/solver/conv_asm_dir_BwdWrW3x3.cpp | 14 +- .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 14 +- .../conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp | 12 +- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 14 +- ...conv_asm_implicit_gemm_gtc_perf_config.cpp | 4 +- .../conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp | 14 +- .../conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp | 12 +- src/solver/conv_direct_naive_conv.cpp | 4 +- src/solver/conv_direct_naive_conv_bwd.cpp | 4 +- src/solver/conv_direct_naive_conv_fwd.cpp | 4 +- src/solver/conv_direct_naive_conv_wrw.cpp | 4 +- 
...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 10 +- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 10 +- ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 10 +- ...conv_hip_implicit_gemm_bwd_data_xdlops.cpp | 10 +- .../conv_hip_implicit_gemm_bwd_v1r1.cpp | 26 +- ...conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp | 18 +- .../conv_hip_implicit_gemm_bwd_v4r1.cpp | 12 +- ...conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp | 16 +- .../conv_hip_implicit_gemm_fwd_v4r1.cpp | 20 +- .../conv_hip_implicit_gemm_fwd_v4r4.cpp | 12 +- ...conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp | 16 +- ...licit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp | 16 +- ...conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp | 16 +- .../conv_hip_implicit_gemm_fwd_xdlops.cpp | 10 +- ...v_hip_implicit_gemm_grouped_fwd_xdlops.cpp | 10 +- ...onv_hip_implicit_gemm_nonxdlops_common.cpp | 6 +- .../conv_hip_implicit_gemm_wrw_v4r4.cpp | 12 +- ...conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp | 24 +- ...licit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp | 24 +- src/solver/conv_mlir_igemm_bwd.cpp | 10 +- src/solver/conv_mlir_igemm_bwd_xdlops.cpp | 10 +- src/solver/conv_mlir_igemm_fwd.cpp | 12 +- src/solver/conv_mlir_igemm_fwd_xdlops.cpp | 12 +- src/solver/conv_mlir_igemm_wrw.cpp | 10 +- src/solver/conv_mlir_igemm_wrw_xdlops.cpp | 12 +- src/solver/conv_ocl_dir2D11x11.cpp | 4 +- src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp | 6 +- src/solver/conv_ocl_dir2D_bwdWrW_2.cpp | 20 +- src/solver/conv_ocl_dir2D_bwdWrW_53.cpp | 6 +- src/solver/conv_ocl_dir2Dfwd.cpp | 8 +- src/solver/conv_ocl_dir2Dfwd1x1.cpp | 4 +- .../conv_ocl_dir2Dfwd_exhaustive_search.cpp | 8 +- src/solver/conv_ocl_dir2Dfwdgen.cpp | 4 +- src/solver/conv_winoRxS.cpp | 28 +- src/solver/conv_wino_fury_RxS.cpp | 6 +- src/solver/mlir_common.cpp | 2 +- test/conv_common.hpp | 6 +- test/embed_sqlite.cpp | 4 +- test/gpu_conv.hpp | 6 +- test/gtest/db_sync.cpp | 6 +- test/gtest/group_conv3d_bwd.cpp | 2 +- test/gtest/group_conv3d_fwd.cpp | 2 +- test/gtest/group_conv3d_wrw.cpp | 2 +- test/gtest/group_conv_fwd.cpp | 2 +- test/gtest/kernel_tuning_net.cpp | 2 +- test/gtest/solver_bwd.hpp | 6 +- test/gtest/solver_fwd.hpp | 4 +- test/gtest/solver_wrw.hpp | 4 +- test/gtest/tuna_net.cpp | 2 +- test/solver.cpp | 30 +- 94 files changed, 1007 insertions(+), 1225 deletions(-) diff --git a/src/conv/heuristics/ai_heuristics.cpp b/src/conv/heuristics/ai_heuristics.cpp index 50cd495de6..ffe4b0c591 100644 --- a/src/conv/heuristics/ai_heuristics.cpp +++ b/src/conv/heuristics/ai_heuristics.cpp @@ -119,9 +119,9 @@ class Model offset(metadata.num_outputs - metadata.num_solvers) { } - virtual ~Model() = default; + virtual ~Model() = default; virtual bool IsProblemSupported(const ProblemDescription& problem, - const ConvolutionContext& ctx) const = 0; + const ExecutionContext& ctx) const = 0; std::vector Forward(const ProblemDescription& problem) const { std::vector features = ToFeatures(problem); @@ -150,7 +150,7 @@ class Gfx908Model : public Model public: Gfx908Model() : Model("gfx908") {} bool IsProblemSupported(const ProblemDescription& problem, - const ConvolutionContext& ctx) const override + const ExecutionContext& ctx) const override { // check if problem is of the kind TunaNet was trained to handle if(!problem.Is2d()) @@ -258,7 +258,7 @@ class Gfx908Model : public Model std::unique_ptr GetModel(const std::string&) { return std::make_unique(); } std::vector PredictSolver(const ProblemDescription& problem, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const std::string& device) { const static std::unique_ptr model = GetModel(device); 
diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 01e931dd69..2416217ea2 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -438,7 +438,7 @@ MakeImplGemmDynamicBackwardDataInvokerFactory FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -79,14 +79,14 @@ class ImplicitGemmSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& /*problem*/, bool use_winograd_only) const override { return !use_winograd_only && !IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -107,7 +107,7 @@ class FftSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& problem, bool use_winograd_only) const override { @@ -115,7 +115,7 @@ class FftSolverFinder : public SolversFinder !IsDisabled(MIOPEN_DEBUG_CONV_FFT{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -134,14 +134,14 @@ class GemmSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& /*problem*/, bool use_winograd_only) const override { return !use_winograd_only && !IsDisabled(MIOPEN_DEBUG_CONV_GEMM{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -160,14 +160,14 @@ class WinogradSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& /*problem*/, bool /*use_winograd_only*/) const override { return !IsDisabled(MIOPEN_DEBUG_CONV_WINOGRAD{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool use_winograd_only) const override @@ -277,7 +277,7 @@ static void EvaluateInvokers(Handle& handle, void ConvFindCore(const AnyInvokeParams& invoke_ctx, DbRecord& record, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, bool use_winograd_only, const std::vector>& finders) diff --git a/src/convolution.cpp b/src/convolution.cpp index 5f7539f70d..403ff777cd 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -75,7 +75,7 @@ std::size_t GetMaxWorkSpaceSize(const std::vector= 1); + assert(!deref(fusePlanDesc).op_map.empty()); std::string str; if(deref(fusePlanDesc).data_type == miopenBFloat16) diff --git a/src/include/miopen/any_solver.hpp b/src/include/miopen/any_solver.hpp index 8de0e8a759..b2f177b6ea 100644 --- a/src/include/miopen/any_solver.hpp +++ 
b/src/include/miopen/any_solver.hpp @@ -46,7 +46,7 @@ struct AnySolver AnySolver() : ptr_value(nullptr){}; template AnySolver(U src) : ptr_value(new AnySolver_tmpl(std::forward(src))){}; - bool IsApplicable(const ConvolutionContext& ctx, const ProblemDescription& problem) const + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); return ptr_value->IsApplicable(ctx, problem); @@ -56,14 +56,14 @@ struct AnySolver assert(ptr_value != nullptr); return ptr_value->IsTunable(); }; - bool TestPerfCfgParams(const ConvolutionContext& ctx, + bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params) const { assert(ptr_value != nullptr); return ptr_value->TestPerfCfgParams(ctx, problem, params); }; - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); @@ -74,7 +74,7 @@ struct AnySolver assert(ptr_value != nullptr); return ptr_value->IsDynamic(); }; - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); return ptr_value->GetWti(ctx, problem); @@ -85,7 +85,7 @@ struct AnySolver return ptr_value->Type(); }; bool IsEmpty() const { return ptr_value == nullptr; }; - ConvSolution FindSolution(const ConvolutionContext& ctx, + ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, const miopen::AnyInvokeParams& invoke_ctx, @@ -94,7 +94,7 @@ struct AnySolver assert(ptr_value != nullptr); return ptr_value->FindSolution(ctx, problem, db, invoke_ctx, perf_cfg); }; - std::string GetPerfCfgParams(const ConvolutionContext& ctx, + std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db) const { @@ -107,7 +107,7 @@ struct AnySolver return ptr_value->GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, const ProblemDescription& problem) const + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); return ptr_value->GetWorkspaceSize(ctx, problem); @@ -125,30 +125,30 @@ struct AnySolver using ptr = std::shared_ptr; virtual ~AnySolver_base(){}; - virtual bool IsApplicable(const ConvolutionContext& ctx, + virtual bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const = 0; virtual bool IsTunable() const = 0; - virtual bool TestPerfCfgParams(const ConvolutionContext& ctx, + virtual bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params) const = 0; virtual std::vector - GetAllSolutions(const ConvolutionContext& ctx, const ProblemDescription& problem) const = 0; - virtual bool IsDynamic() const = 0; - virtual float GetWti(const ConvolutionContext& ctx, - const ProblemDescription& problem) const = 0; - virtual const std::type_info& Type() const = 0; - virtual std::string GetSolverDbId() const = 0; - virtual ConvSolution FindSolution(const ConvolutionContext& ctx, + GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem) const = 0; + virtual bool IsDynamic() const = 0; + virtual float GetWti(const ExecutionContext& ctx, + const ProblemDescription& problem) const = 0; + virtual const std::type_info& Type() const = 
0; + virtual std::string GetSolverDbId() const = 0; + virtual ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, const miopen::AnyInvokeParams& invoke_ctx, - const std::string& perf_cfg) const = 0; - virtual std::string GetPerfCfgParams(const ConvolutionContext& ctx, + const std::string& perf_cfg) const = 0; + virtual std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, - PerformanceDb& db) const = 0; - virtual size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const = 0; - virtual bool MayNeedWorkspace() const = 0; + PerformanceDb& db) const = 0; + virtual size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const = 0; + virtual bool MayNeedWorkspace() const = 0; }; // templated derived class @@ -160,7 +160,7 @@ struct AnySolver template static constexpr auto Test(U*) -> typename std::is_class().GetDefaultPerformanceConfig( - std::declval(), + std::declval(), std::declval()))>::type; template @@ -176,7 +176,7 @@ struct AnySolver static constexpr auto Test(U*) -> typename std::is_same().GetDefaultPerformanceConfig( - std::declval(), + std::declval(), std::declval()))>::type; template @@ -186,13 +186,13 @@ struct AnySolver static constexpr bool Is = type::value; }; - bool TestPerfCfgParams(const ConvolutionContext& ctx, + bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params, std::true_type) const { using PerformanceConfig = decltype(value.GetDefaultPerformanceConfig( - std::declval(), + std::declval(), std::declval())); PerformanceConfig config{}; @@ -208,7 +208,7 @@ struct AnySolver return success; } - bool TestPerfCfgParams(const ConvolutionContext&, + bool TestPerfCfgParams(const ExecutionContext&, const ProblemDescription&, const std::string&, std::false_type) const @@ -216,7 +216,7 @@ struct AnySolver return false; } - bool TestPerfCfgParams(const ConvolutionContext& ctx, + bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params) const override { @@ -225,7 +225,7 @@ struct AnySolver } // tunable legacy solver - std::vector GetAllSolutions(const ConvolutionContext&, + std::vector GetAllSolutions(const ExecutionContext&, const ProblemDescription&, std::true_type, std::true_type) const @@ -234,7 +234,7 @@ struct AnySolver } // tunable solver, not legacy - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, std::true_type, std::false_type) const @@ -243,7 +243,7 @@ struct AnySolver } // non tunable solver - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, std::false_type, std::true_type) const @@ -252,7 +252,7 @@ struct AnySolver solutions.push_back(value.GetSolution(ctx, problem)); return solutions; } - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, std::false_type, std::false_type) const @@ -262,7 +262,7 @@ struct AnySolver return solutions; } - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return GetAllSolutions(ctx, @@ -273,20 +273,19 @@ struct AnySolver AnySolver_tmpl(T 
obj) : value(std::move(obj)){}; - bool IsApplicable(const ConvolutionContext& ctx, + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return value.IsApplicable(ctx, problem); } bool IsTunable() const override { return TunableSolver::Is; } bool IsDynamic() const override { return value.IsDynamic(); } - float GetWti(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return value.GetWti(ctx, problem); } - ConvSolution FindSolution(const ConvolutionContext& ctx, + ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, const miopen::AnyInvokeParams& invoke_ctx, @@ -295,7 +294,7 @@ struct AnySolver return miopen::solver::FindSolution(value, ctx, problem, db, invoke_ctx, perf_cfg); }; - std::string GetPerfCfgParams(const ConvolutionContext& ctx, + std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, std::true_type) const @@ -327,7 +326,7 @@ struct AnySolver config = value.GetDefaultPerformanceConfig(ctx, problem); return config.ToString(); } - std::string GetPerfCfgParams(const ConvolutionContext&, + std::string GetPerfCfgParams(const ExecutionContext&, const ProblemDescription&, const PerformanceDb&, std::false_type) const @@ -336,7 +335,7 @@ struct AnySolver return ""; } - std::string GetPerfCfgParams(const ConvolutionContext& ctx, + std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db) const override { @@ -344,7 +343,7 @@ struct AnySolver ctx, problem, db, std::integral_constant()); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return value.GetWorkspaceSize(ctx, problem); diff --git a/src/include/miopen/conv/compiled_in_parameters.hpp b/src/include/miopen/conv/compiled_in_parameters.hpp index 28def48761..a72ccf7d47 100644 --- a/src/include/miopen/conv/compiled_in_parameters.hpp +++ b/src/include/miopen/conv/compiled_in_parameters.hpp @@ -26,7 +26,8 @@ #pragma once -#include +#include +#include #include #include diff --git a/src/include/miopen/conv/context.hpp b/src/include/miopen/conv/context.hpp index 63a1469f5d..8974b684a5 100644 --- a/src/include/miopen/conv/context.hpp +++ b/src/include/miopen/conv/context.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2019 Advanced Micro Devices, Inc. + * Copyright (c) 2023 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,25 +24,9 @@ * *******************************************************************************/ +// Todo: this is a temporary header for fin compatibility +// It would be removed in a separate PR after changes to fin would be merged + #pragma once #include -#include - -#include - -namespace miopen { -/// A leftover of the legacy design, houses -/// environmental context (e.g. HW/SW platform) and solver-specific state. -/// -/// TODO: These two entities should be made separate. 
-struct ConvolutionContext : ExecutionContext -{ - ConvolutionContext() = default; - explicit ConvolutionContext(const ExecutionContext& ctx) : ExecutionContext(ctx) {} - -public: - bool is_for_generic_search = false; -}; - -} // namespace miopen diff --git a/src/include/miopen/conv/heuristics/ai_heuristics.hpp b/src/include/miopen/conv/heuristics/ai_heuristics.hpp index dca891515d..7da9497070 100644 --- a/src/include/miopen/conv/heuristics/ai_heuristics.hpp +++ b/src/include/miopen/conv/heuristics/ai_heuristics.hpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -72,7 +71,7 @@ struct Metadata }; class Model; std::vector PredictSolver(const ProblemDescription& problem, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const std::string& device); } // namespace immed_mode diff --git a/src/include/miopen/conv/invokers/impl_gemm.hpp b/src/include/miopen/conv/invokers/impl_gemm.hpp index fcc4666fd1..23c5afddc5 100644 --- a/src/include/miopen/conv/invokers/impl_gemm.hpp +++ b/src/include/miopen/conv/invokers/impl_gemm.hpp @@ -28,7 +28,7 @@ #include #include -#include +#include #include diff --git a/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp b/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp index e2d329b0a9..b1a0e426a0 100644 --- a/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp +++ b/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -223,11 +222,11 @@ MakeImplGemmDynamicBackwardDataInvokerFactory -#include +#include namespace miopen { namespace conv { diff --git a/src/include/miopen/conv/solver_finders.hpp b/src/include/miopen/conv/solver_finders.hpp index 69425f09a7..4be112b0fb 100644 --- a/src/include/miopen/conv/solver_finders.hpp +++ b/src/include/miopen/conv/solver_finders.hpp @@ -27,8 +27,9 @@ #pragma once #include -#include +#include #include +#include #include #include @@ -44,7 +45,7 @@ class SolversFinder virtual AlgorithmName GetAlgorithmName(const conv::ProblemDescription& ptroblem) const = 0; - inline std::vector Find(const ConvolutionContext& ctx, + inline std::vector Find(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool use_winograd_only) const @@ -68,10 +69,10 @@ class SolversFinder } protected: - virtual bool IsEnabled(const ConvolutionContext& ctx, + virtual bool IsEnabled(const ExecutionContext& ctx, const conv::ProblemDescription& problem, bool use_winograd_only) const = 0; - virtual std::vector FindImpl(const ConvolutionContext& ctx, + virtual std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool use_winograd_only) const = 0; @@ -81,7 +82,7 @@ const std::vector>& GetConvSolverFinders(); void ConvFindCore(const AnyInvokeParams& invoke_ctx, DbRecord& record, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, bool use_winograd_only, const std::vector>& finders); diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp index 5e0507ddb6..bac0133106 100644 --- a/src/include/miopen/convolution.hpp +++ b/src/include/miopen/convolution.hpp @@ -64,7 +64,7 @@ struct ConvSolution; struct AnyInvokeParams; struct ExecutionContext; -struct ConvolutionContext; +struct ExecutionContext; struct Handle; struct TensorDescriptor; struct ProblemDescription; @@ -208,7 +208,7 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor 
const TensorDescriptor& wDesc, miopenDataType_t yType = miopenFloat) const; - bool IsWinograd3x3SupportedAndFast(const miopen::ConvolutionContext& ctx, + bool IsWinograd3x3SupportedAndFast(const miopen::ExecutionContext& ctx, const ProblemDescription& problem) const; std::size_t GetWorkSpaceSize(ExecutionContext ctx, @@ -229,15 +229,15 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor bool exhaustiveSearch) const; std::vector - FindWinogradSolutions(const ConvolutionContext& ctx, + FindWinogradSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const; std::vector - FindWinogradSolutions(const ConvolutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; + FindWinogradSolutions(const ExecutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; std::vector - FindDataGemmSolutions(const ConvolutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; + FindDataGemmSolutions(const ExecutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; std::vector FindDataImplicitGemmSolutions(Handle& handle, @@ -249,7 +249,7 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor const AnyInvokeParams& invoke_ctx) const; std::vector - FindFftSolutions(const ConvolutionContext& ctx, + FindFftSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const; @@ -395,7 +395,7 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor FindMode findMode; ConvolutionAttribute attribute; - std::vector GetSolutionsFallback(const ExecutionContext& exec_ctx, + std::vector GetSolutionsFallback(const ExecutionContext& ctx, const conv::ProblemDescription& problem, size_t maxSolutionCount) const; diff --git a/src/include/miopen/execution_context.hpp b/src/include/miopen/execution_context.hpp index 83b9abdc1a..d2195d6061 100644 --- a/src/include/miopen/execution_context.hpp +++ b/src/include/miopen/execution_context.hpp @@ -90,6 +90,7 @@ struct ExecutionContext // performance config. 
bool disable_perfdb_access = false; bool use_dynamic_solutions_only = false; + bool is_for_generic_search = false; inline Handle& GetStream() const { return *stream; } inline void SetStream(Handle* stream_) { stream = stream_; } @@ -283,6 +284,10 @@ struct ExecutionContext void DetectRocm(); }; +struct [[deprecated]] ConvolutionContext : ExecutionContext +{ +}; + bool IsHipKernelsEnabled(); } // namespace miopen diff --git a/src/include/miopen/fusion/context.hpp b/src/include/miopen/fusion/context.hpp index cfa38f36f5..43190e6807 100644 --- a/src/include/miopen/fusion/context.hpp +++ b/src/include/miopen/fusion/context.hpp @@ -26,20 +26,23 @@ #pragma once +#include +#include + namespace miopen { -struct FusionContext : miopen::ExecutionContext +struct Handle; + +struct FusionContext : ExecutionContext { explicit FusionContext(Handle& handle) : ExecutionContext(&handle) {} - ConvolutionContext GetConvContext(const miopen::ProblemDescription& conv_problem) const + ExecutionContext GetConvContext(const ProblemDescription& conv_problem) const { - auto ctx = ConvolutionContext{*this}; + auto ctx = ExecutionContext{*this}; conv_problem.SetupFloats(ctx); return ctx; } - - bool is_for_generic_search = false; }; } // namespace miopen diff --git a/src/include/miopen/fusion/solvers.hpp b/src/include/miopen/fusion/solvers.hpp index f5621e72dc..dd8f2df494 100644 --- a/src/include/miopen/fusion/solvers.hpp +++ b/src/include/miopen/fusion/solvers.hpp @@ -152,9 +152,10 @@ struct ConvBiasActivAsm1x1U : FusionTunableSolver #include -#include #include #include +#include #include #include #include @@ -255,7 +255,7 @@ using RunAndMeasure_t = std::declval(), std::declval(), std::declval(), - std::declval(), + std::declval(), std::declval(), std::declval())); diff --git a/src/include/miopen/mlo_internal.hpp b/src/include/miopen/mlo_internal.hpp index c3a00fc3ee..f5e7d2fb83 100644 --- a/src/include/miopen/mlo_internal.hpp +++ b/src/include/miopen/mlo_internal.hpp @@ -68,7 +68,7 @@ POSSIBILITY OF SUCH DAMAGE. 
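[Editor's note: an illustrative sketch, not part of the patch. The [[deprecated]] ConvolutionContext shim introduced in execution_context.hpp above keeps out-of-tree code that still spells ConvolutionContext compiling while the compiler steers it toward ExecutionContext; the function names below are hypothetical.]

#include <miopen/execution_context.hpp>

void legacy_caller(miopen::Handle& handle)
{
    miopen::ConvolutionContext ctx; // still compiles, but warns that 'ConvolutionContext' is deprecated
    ctx.SetStream(&handle);         // the inherited ExecutionContext interface is unchanged
}

void updated_caller(miopen::Handle& handle)
{
    miopen::ExecutionContext ctx{&handle}; // the spelling this patch migrates callers to
}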
#else #include #endif -#include +#include #include #include #include @@ -179,74 +179,74 @@ auto mloConstruct(T& x) -> decltype(x.mloConstruct(), void()) } std::vector -FindAllGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector> -AllGemmWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllGemmWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -AllDirectForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindAllImplicitGemmWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindAllWinogradWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllWinogradWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindWinogradWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindWinogradWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindImplicitGemmWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -AllDirectBwdWrW2DWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectBwdWrW2DWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -AllFFTForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllFFTForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector -FindAllDirectSolutions(const miopen::ConvolutionContext& ctx, +FindAllDirectSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllImplicitGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllWinogradSolutions(const miopen::ConvolutionContext& ctx, +FindAllWinogradSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindWinogradWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindWinogradWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindImplicitGemmWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllBwdWrW2DSolutions(const miopen::ConvolutionContext& ctx, +FindAllBwdWrW2DSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllFFTSolutions(const miopen::ConvolutionContext& ctx, +FindAllFFTSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const 
miopen::AnyInvokeParams& invoke_ctx); @@ -275,7 +275,7 @@ struct mlo_construct_base protected: miopen::ProblemDescriptionCompatTemporary _problem; - miopen::ConvolutionContext _ctx; + miopen::ExecutionContext _ctx; }; #define MLO_POOLING_OP_AVE 0 diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 1ed699bf32..ce40d6f081 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -114,7 +114,7 @@ struct SolverBase /// * @see https://github.com/ROCmSoftwarePlatform/MIOpen/issues/410 virtual float GetWti(const ExecutionContext& ctx, const boost::any& problem) const = 0; - // Returns the workspace size required by the solver for a given ConvolutionContext + // Returns the workspace size required by the solver for a given ExecutionContext virtual size_t GetWorkspaceSize(const ExecutionContext& ctx, const boost::any& problem) const = 0; @@ -181,10 +181,10 @@ struct NonTunableSolverBase : SolverMixin }; /// Typedef for convolution solvers -using ConvSolver = NonTunableSolverBase; +using ConvSolver = NonTunableSolverBase; /// Base class for tunable solvers -struct ConvTunableSolverBase : SolverMixin +struct ConvTunableSolverBase : SolverMixin { /// Initializes performance config to the default values. /// The function may involve some heuristic to guess the best solution @@ -195,13 +195,13 @@ struct ConvTunableSolverBase : SolverMixin{}, "PerformanceConfig must be derived of PerfConfig"); - virtual PerformanceConfig GetDefaultPerformanceConfig(const ConvolutionContext&, + virtual PerformanceConfig GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const = 0; - virtual bool IsValidPerformanceConfig(const ConvolutionContext&, + virtual bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfig&) const = 0; virtual PerformanceConfig - Search(const ConvolutionContext&, const ProblemDescription&, const AnyInvokeParams&) const = 0; - virtual ConvSolution GetSolution(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams&) const = 0; + virtual ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, - const PerformanceConfig&) const = 0; + const PerformanceConfig&) const = 0; - boost::any GetDefaultPerformanceConfig(const ConvolutionContext& ctx, + boost::any GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem, int) const final { return GetDefaultPerformanceConfig(ctx, problem); } - bool IsValidPerformanceConfig(const ConvolutionContext& ctx, + bool IsValidPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem, const PerfConfig& config) const final { @@ -253,7 +253,7 @@ struct ConvTunableSolver : ConvTunableSolverBase ctx, problem, dynamic_cast(config)); } - boost::any Search(const ConvolutionContext& ctx, + boost::any Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, int) const final @@ -261,7 +261,7 @@ struct ConvTunableSolver : ConvTunableSolverBase return Search(ctx, problem, invoke_ctx); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerfConfig& config) const final { @@ -290,7 +290,7 @@ struct PerformanceConfigConvAsm3x3U : PerfConfigBase { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const 
ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConfigConvAsm3x3U - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm3x3U&) const override; - PerformanceConfigConvAsm3x3U Search(const ConvolutionContext&, + PerformanceConfigConvAsm3x3U Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm3x3U&) const override; }; @@ -364,15 +363,15 @@ struct PerformanceConfigConvAsm1x1U : PerfConfigBase const std::string& SolverDbId() const override { return GetSolverDbId(); } PerformanceConfigConvAsm1x1U - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1U&) const override; - PerformanceConfigConvAsm1x1U Search(const ConvolutionContext&, + PerformanceConfigConvAsm1x1U Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1U&) const override; }; @@ -470,7 +468,7 @@ struct PerformanceConfigConvAsm1x1UV2 : PerfConfigBase const std::string& SolverDbId() const override { return GetSolverDbId(); } PerformanceConfigConvAsm1x1UV2 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1UV2&) const override; - PerformanceConfigConvAsm1x1UV2 Search(const ConvolutionContext&, + PerformanceConfigConvAsm1x1UV2 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1UV2&) const override; }; @@ -504,20 +501,10 @@ struct ConvAsm5x10u2v2f1 final : 
ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsm5x10u2v2b1 final : ConvSolver @@ -527,20 +514,10 @@ struct ConvAsm5x10u2v2b1 final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsm7x7c3h224w224k64u2v2p3q3f1 final : ConvSolver @@ -553,20 +530,10 @@ struct ConvAsm7x7c3h224w224k64u2v2p3q3f1 final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvOclDirectFwd11x11 final : ConvSolver @@ -576,16 +543,16 @@ struct ConvOclDirectFwd11x11 final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct ConvOclDirectFwdGen final : ConvSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const 
ExecutionContext&, const ProblemDescription&) const override; }; struct PerformanceImplicitGemm : PerfConfigBase @@ -646,10 +613,10 @@ struct PerformanceImplicitGemm : PerfConfigBase f(self.WeiBlockCopyClusterLengths_K, "WeiBlockCopyClusterLengths_K"); } - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool IsValidValue() const; bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool operator==(const PerformanceImplicitGemm& other) const; }; @@ -684,7 +651,7 @@ struct PerformanceImplicitGemmV4R1 : public PerformanceImplicitGemm PerformanceImplicitGemmV4R1(bool spare) : PerformanceImplicitGemm(spare) {} - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; }; struct PerformanceImplicitGemmV4R4Fwd : PerfConfigBase @@ -735,12 +702,12 @@ struct PerformanceImplicitGemmV4R4Fwd : PerfConfigBase CalculateLdsNumberOfByte(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } bool IsValid(const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -792,12 +759,12 @@ struct PerformanceImplicitGemmV4R4WrW : PerfConfigBase CalculateLdsNumberOfByte(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } bool IsValid(const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -841,22 +808,22 @@ struct PerformanceImplicitGemmBwdDataV1R1 : PerfConfigBase CalculateGridSize(const ConvolutionContext&, + std::tuple CalculateGridSize(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockGemmPerformanceParameters() const; std::tuple - CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext&, + CalculateGemmABlockCopyPerformanceParameters(const ExecutionContext&, const ProblemDescription&) const; std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext&, + CalculateGemmBBlockCopyPerformanceParameters(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateGemmCThreadCopyPerformanceParameters(const ProblemDescription&) const; - std::tuple CalculateLdsNumberOfByte(const ConvolutionContext&, + std::tuple CalculateLdsNumberOfByte(const ExecutionContext&, const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; 
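[Editor's note: an illustrative sketch, not part of the patch. Several perf-configs above, e.g. PerformanceImplicitGemmV4R4Fwd and PerformanceImplicitGemmV4R4WrW, keep a two-argument IsValid that ignores the ExecutionContext and forwards to the one-argument overload, so the generic tuning machinery can pass a context uniformly. PerformanceConfigExample and its tile_size field below are hypothetical.]

#include <miopen/execution_context.hpp>
#include <miopen/problem_description.hpp>

struct PerformanceConfigExample
{
    int tile_size = 64; // hypothetical tunable parameter

    // A real config would validate tile_size against the problem geometry.
    bool IsValid(const miopen::ProblemDescription& problem) const
    {
        (void)problem;
        return tile_size > 0;
    }

    // Context-taking overload required by the shared tuning interface; the
    // context is intentionally unused, matching the forwarding style above.
    bool IsValid(const miopen::ExecutionContext&, const miopen::ProblemDescription& problem) const
    {
        return IsValid(problem);
    }
};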
@@ -910,12 +877,12 @@ struct PerformanceImplicitGemmBwdDataV4R1 : PerfConfigBase CalculateLdsNumberOfByte(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } bool IsValid(const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -968,10 +935,10 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops std::tuple CalculateGemmBBlockCopyPerformanceParameters(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -983,16 +950,15 @@ struct ConvHipImplicitGemmV4R1Fwd final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmV4R4Fwd - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4Fwd&) const override; - PerformanceImplicitGemmV4R4Fwd Search(const ConvolutionContext&, + PerformanceImplicitGemmV4R4Fwd Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4Fwd&) const override; @@ -1066,7 +1031,7 @@ struct PerformanceConvMlirIgemm : PerfConfigBase f(self.GemmNPerThread, "GemmNPerThread"); } - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool SetNextValue(const ProblemDescription&); private: @@ -1077,16 +1042,16 @@ struct ConvMlirIgemmFwd final : ConvTunableSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; - PerformanceConvMlirIgemm Search(const 
ConvolutionContext&, + PerformanceConvMlirIgemm Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; }; @@ -1138,7 +1103,7 @@ struct PerformanceConvMlirIgemmXdlops : PerfConfigBase(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConvMlirIgemmXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; - PerformanceConvMlirIgemmXdlops Search(const ConvolutionContext&, + PerformanceConvMlirIgemmXdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; }; @@ -1174,17 +1138,16 @@ struct ConvHipImplicitGemmV4R4WrW final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmV4R4WrW - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4WrW&) const override; - PerformanceImplicitGemmV4R4WrW Search(const ConvolutionContext&, + PerformanceImplicitGemmV4R4WrW Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4WrW&) const override; @@ -1198,16 +1161,16 @@ struct ConvMlirIgemmWrW final : ConvTunableSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; - PerformanceConvMlirIgemm Search(const ConvolutionContext&, + PerformanceConvMlirIgemm Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const 
ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; }; @@ -1219,19 +1182,18 @@ struct ConvMlirIgemmWrWXdlops final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConvMlirIgemmXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; - PerformanceConvMlirIgemmXdlops Search(const ConvolutionContext&, + PerformanceConvMlirIgemmXdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; }; @@ -1269,12 +1231,12 @@ struct PerformanceImplicitGemmForwardV4R4Xdlops bool operator==(const PerformanceImplicitGemmForwardV4R4Xdlops& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1326,12 +1288,12 @@ struct PerformanceImplicitGemmForwardV4R5Xdlops bool operator==(const PerformanceImplicitGemmForwardV4R5Xdlops& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1385,12 +1347,12 @@ struct PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm bool operator==(const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; 
- bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1431,12 +1393,12 @@ struct PerformanceImplicitGemmBwdV1R1Xdlops : PerfConfigBase CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1456,17 +1418,16 @@ struct ConvHipImplicitGemmForwardV4R4Xdlops final } PerformanceImplicitGemmForwardV4R4Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; PerformanceImplicitGemmForwardV4R4Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; @@ -1485,19 +1446,18 @@ struct ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm final } PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm&) const override; PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; @@ -1517,17 +1477,16 @@ struct ConvHipImplicitGemmForwardV4R5Xdlops final } PerformanceImplicitGemmForwardV4R5Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R5Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool 
IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R5Xdlops&) const override; PerformanceImplicitGemmForwardV4R5Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; }; @@ -1540,16 +1499,15 @@ struct ConvHipImplicitGemmV4R1WrW final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmBwdDataV1R1 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV1R1&) const override; - PerformanceImplicitGemmBwdDataV1R1 Search(const ConvolutionContext&, + PerformanceImplicitGemmBwdDataV1R1 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV1R1&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } private: - static std::tuple CalculateGemmSize(const ConvolutionContext&, + static std::tuple CalculateGemmSize(const ExecutionContext&, const ProblemDescription&); friend struct PerformanceImplicitGemmBwdDataV1R1; @@ -1588,16 +1545,16 @@ struct ConvMlirIgemmBwd final : ConvTunableSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; - PerformanceConvMlirIgemm Search(const ConvolutionContext&, + PerformanceConvMlirIgemm Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; }; @@ -1609,17 +1566,16 @@ struct ConvMlirIgemmBwdXdlops final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConvMlirIgemmXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const 
ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; - PerformanceConvMlirIgemmXdlops Search(const ConvolutionContext&, + PerformanceConvMlirIgemmXdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; }; @@ -1631,17 +1587,16 @@ struct ConvHipImplicitGemmBwdDataV4R1 final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmBwdDataV4R1 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1&) const override; - PerformanceImplicitGemmBwdDataV4R1 Search(const ConvolutionContext&, + PerformanceImplicitGemmBwdDataV4R1 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1&) const override; @@ -1661,17 +1616,16 @@ struct ConvHipImplicitGemmBwdDataV4R1Xdlops final } PerformanceImplicitGemmBwdDataV4R1Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1Xdlops&) const override; PerformanceImplicitGemmBwdDataV4R1Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; @@ -1691,18 +1645,17 @@ struct ConvHipImplicitGemmBwdDataV1R1Xdlops final } PerformanceImplicitGemmBwdV1R1Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdV1R1Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) 
const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - PerformanceImplicitGemmBwdV1R1Xdlops Search(const ConvolutionContext&, + PerformanceImplicitGemmBwdV1R1Xdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdV1R1Xdlops&) const override; @@ -1722,23 +1675,13 @@ struct ConvAsmImplicitGemmV4R1DynamicFwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmV4R1DynamicFwd_1x1 final : ConvSolver @@ -1751,23 +1694,13 @@ struct ConvAsmImplicitGemmV4R1DynamicFwd_1x1 final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmV4R1DynamicWrw final : ConvSolver @@ -1781,32 +1714,18 @@ struct ConvAsmImplicitGemmV4R1DynamicWrw final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const 
ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmGTCDynamicWrwXdlops final : ConvSolver @@ -1820,32 +1739,18 @@ struct ConvAsmImplicitGemmGTCDynamicWrwXdlops final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmV4R1DynamicBwd final : ConvSolver @@ -1858,23 +1763,13 @@ struct ConvAsmImplicitGemmV4R1DynamicBwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmGTCDynamicFwdXdlops final : ConvSolver @@ -1887,23 +1782,13 @@ struct ConvAsmImplicitGemmGTCDynamicFwdXdlops final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const 
override; }; struct ConvAsmImplicitGemmGTCDynamicBwdXdlops final : ConvSolver @@ -1916,54 +1801,44 @@ struct ConvAsmImplicitGemmGTCDynamicBwdXdlops final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; /// Holds common member functions for the Solvers which share the same /// "legacy exhaustive search" machinery. struct ConvOclDirectFwdLegacyExhaustiveSearch : ConvTunableSolver { - LegacyPerformanceConfig GetDefaultPerformanceConfig(const ConvolutionContext&, + LegacyPerformanceConfig GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - LegacyPerformanceConfig Search(const ConvolutionContext&, + LegacyPerformanceConfig Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; private: template - LegacyPerformanceConfig SearchImpl(const ConvolutionContext&, + LegacyPerformanceConfig SearchImpl(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const; }; struct ConvOclDirectFwd : ConvOclDirectFwdLegacyExhaustiveSearch { - static ConvSolution BaseGetSolution(const ConvolutionContext& ctx, + static ConvSolution BaseGetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const LegacyPerformanceConfig& config); const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override; }; @@ -1972,12 +1847,12 @@ struct ConvOclDirectFwd1x1 final : ConvOclDirectFwdLegacyExhaustiveSearch { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override { @@ -1992,23 +1867,13 @@ struct ConvBinWinograd3x3U final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool 
IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvBinWinogradRxS final : ConvSolver @@ -2018,23 +1883,13 @@ struct ConvBinWinogradRxS final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct PerformanceConfigConvBinWinogradRxS : PerfConfigBase @@ -2052,14 +1907,14 @@ struct PerformanceConfigConvBinWinogradRxS : PerfConfigBase - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool IsValidValue() const; bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext& ctx, const ProblemDescription&) const + bool IsValid(const ExecutionContext& ctx, const ProblemDescription&) const { return IsValid(ctx); } - bool IsValid(const ConvolutionContext&) const; + bool IsValid(const ExecutionContext&) const; bool operator==(const PerformanceConfigConvBinWinogradRxS& other) const; }; @@ -2078,17 +1933,16 @@ struct ConvBinWinoRxS final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + float GetWti(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; template @@ -2135,11 +1989,11 @@ struct ConvMPBidirectWinograd final : ConvSolver ConvMPBidirectWinograd>(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + size_t 
GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; // kernel_file_name for solver identification static std::string GetSolverFileNames(int id) @@ -2191,7 +2045,7 @@ struct ConvMPBidirectWinograd_xdlops final ConvMPBidirectWinograd_xdlops>(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { @@ -2202,7 +2056,7 @@ struct ConvMPBidirectWinograd_xdlops final } PerformanceImplicitGemmForwardV4R4Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext& ctx, + GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const override { const auto xdlops_problem = GetTransformedProblem(problem); @@ -2213,7 +2067,7 @@ struct ConvMPBidirectWinograd_xdlops final } bool - IsValidPerformanceConfig(const ConvolutionContext& ctx, + IsValidPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops& config) const override { @@ -2224,7 +2078,7 @@ struct ConvMPBidirectWinograd_xdlops final xdlops_ctx, xdlops_problem, config); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { const auto xdlops_problem = GetTransformedProblem(problem); @@ -2238,17 +2092,16 @@ struct ConvMPBidirectWinograd_xdlops final bool MayNeedWorkspace() const override { return true; } PerformanceImplicitGemmForwardV4R4Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; private: - ConvolutionContext - GetTransformedConvContext(const ConvolutionContext& ctx, - const ProblemDescription& transformed_problem) const; + ExecutionContext GetTransformedConvContext(const ExecutionContext& ctx, + const ProblemDescription& transformed_problem) const; ProblemDescription GetTransformedProblem(const ProblemDescription& problem) const; // kernel_file_name for solver identification @@ -2302,27 +2155,18 @@ struct ConvWinograd3x3MultipassWrW final : ConvSolver ConvWinograd3x3MultipassWrW>(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } + ConvSolution 
GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; // kernel_file_name for solver identification static std::string GetSolverFileNames(int id) @@ -2356,10 +2200,6 @@ struct ConvWinograd3x3MultipassWrW final : ConvSolver } private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; - InvokerFactory PrepareInvokerFactory(const ExecutionContext&, const ProblemDescription&, std::size_t ws_sz) const; @@ -2424,10 +2264,10 @@ struct PerformanceConfigAsmDirect3x3WrW : PerfConfigBase(); } PerformanceConfigAsmDirect3x3WrW - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmDirect3x3WrW&) const override; - PerformanceConfigAsmDirect3x3WrW Search(const ConvolutionContext&, + PerformanceConfigAsmDirect3x3WrW Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmDirect3x3WrW& config) const override; }; @@ -2458,11 +2297,11 @@ struct ConvWinoFuryRxS final : ConvSolver return GetSolverDbId>(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override; + float GetWti(const ExecutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; static constexpr bool is2x3() { return Winodata == 2 && Winofilter == 3; } static constexpr bool is3x2() { return Winodata == 3 && Winofilter == 2; } @@ -2564,10 +2403,10 @@ struct PerformanceConfigConvAsmBwdWrW1x1 : PerfConfigBase(); } PerformanceConfigConvAsmBwdWrW1x1 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsmBwdWrW1x1&) const override; - PerformanceConfigConvAsmBwdWrW1x1 Search(const ConvolutionContext&, + PerformanceConfigConvAsmBwdWrW1x1 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, 
const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsmBwdWrW1x1&) const override; }; @@ -2648,7 +2486,7 @@ struct PerformanceConfigConvOclBwdWrw2 void HeuristicInit(const ProblemDescription&); bool IsValidValue() const; bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool operator==(const PerformanceConfigConvOclBwdWrw2& other) const; }; @@ -2661,25 +2499,24 @@ struct ConvOclBwdWrW2 : ConvTunableSolver PerformanceConfigConvOclBwdWrw2 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool - IsValidPerformanceConfig(const ConvolutionContext&, + IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvOclBwdWrw2&) const override; PerformanceConfigConvOclBwdWrw2 - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvOclBwdWrw2&) const override; protected: - bool IsApplicableBase(const ConvolutionContext&, const ProblemDescription&) const; + bool IsApplicableBase(const ExecutionContext&, const ProblemDescription&) const; }; // To suppress misleading clang warnings @@ -2715,8 +2552,8 @@ struct ConvOclBwdWrW2NonTunable final : ConvOclBwdWrW2<1> return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; private: // This function derived from ConvOclBwdWrW2 is declared private @@ -2729,20 +2566,20 @@ struct ConvOclBwdWrW53 final : ConvSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const
override; }; struct ConvOclBwdWrW1x1 final : ConvSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct fft final : ConvSolver @@ -2753,30 +2590,16 @@ struct fft final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct PerformanceImplicitGemmWrwV4R4Xdlops : PerfConfigBase @@ -2814,17 +2637,17 @@ struct PerformanceImplicitGemmWrwV4R4Xdlops : PerfConfigBase - CalculateGemmSizeAndGemmKBlock(const ConvolutionContext&, const ProblemDescription&) const; + CalculateGemmSizeAndGemmKBlock(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; - std::tuple CalculateGridSize(const ConvolutionContext&, + std::tuple CalculateGridSize(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateGemmABlockCopyPerformanceParameters(const ProblemDescription&) const; @@ -2842,18 +2665,17 @@ struct ConvHipImplicitGemmWrwV4R4Xdlops final } PerformanceImplicitGemmWrwV4R4Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const 
ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops&) const override; - PerformanceImplicitGemmWrwV4R4Xdlops Search(const ConvolutionContext&, + PerformanceImplicitGemmWrwV4R4Xdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; }; @@ -2899,17 +2721,17 @@ struct PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm bool operator==(const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; - bool IsReallyValid(const ConvolutionContext&, const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; + bool IsReallyValid(const ExecutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple - CalculateGemmSizeAndGemmKBlock(const ConvolutionContext&, const ProblemDescription&) const; + CalculateGemmSizeAndGemmKBlock(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; - std::tuple CalculateGridSize(const ConvolutionContext&, + std::tuple CalculateGridSize(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateGemmABlockCopyPerformanceParameters(const ProblemDescription&) const; @@ -2927,21 +2749,20 @@ struct ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm final } PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm&) const override; PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; }; @@ -2964,7 +2785,7 @@ struct PerformanceConvCkIgemmFwdV6r1DlopsNchw } bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -2982,20 +2803,19 @@ struct ConvCkIgemmFwdV6r1DlopsNchw final : 
ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } bool IsDynamic() const override { return false; } PerformanceConvCkIgemmFwdV6r1DlopsNchw - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvCkIgemmFwdV6r1DlopsNchw&) const override; - PerformanceConvCkIgemmFwdV6r1DlopsNchw Search(const ConvolutionContext&, + PerformanceConvCkIgemmFwdV6r1DlopsNchw Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvCkIgemmFwdV6r1DlopsNchw&) const override; }; @@ -3007,15 +2827,15 @@ struct ConvDirectNaiveConvFwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } /// Use very small fixed value enough to backup GEMM for cases when /// GEMM is disabled due to MIOpenGemm or OCL compiler issues. - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.01f; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct ConvDirectNaiveConvBwd final : ConvSolver @@ -3025,15 +2845,15 @@ struct ConvDirectNaiveConvBwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } /// Use very small fixed value enough to backup GEMM for cases when /// GEMM is disabled due to MIOpenGemm or OCL compiler issues. 
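Aside: the fixed GetWti() values in these naive solvers implement a crude priority, not a real time estimate. When nothing has been benchmarked, the applicable solver reporting the largest WTI wins the fallback ordering, so a constant 0.01f keeps the naive kernels ranked above "no solution" but below anything else. The sketch below illustrates only that selection rule; FallbackOption and PickFallback are invented stand-ins, not MIOpen API.

#include <algorithm>
#include <string>
#include <vector>

// Illustrative only: simplified stand-ins, not MIOpen's Solver interface.
struct FallbackOption
{
    std::string solver; // solver db id
    float wti;          // value returned by GetWti(); higher ranks first
};

// Pick the applicable solver with the largest WTI estimate.
inline const FallbackOption* PickFallback(const std::vector<FallbackOption>& options)
{
    const auto best = std::max_element(
        options.begin(), options.end(), [](const FallbackOption& a, const FallbackOption& b) {
            return a.wti < b.wti;
        });
    return best == options.end() ? nullptr : &*best;
}

// With the constants in this header, {"ConvDirectNaiveConvFwd", 0.01f} loses
// to any solver reporting 0.02f, so the naive kernels stay a last resort.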
- float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.01f; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct ConvDirectNaiveConvWrw final : ConvSolver @@ -3043,15 +2863,15 @@ struct ConvDirectNaiveConvWrw final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } /// Use very small fixed value enough to backup GEMM for cases when /// GEMM is disabled due to MIOpenGemm or OCL compiler issues. - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.01f; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct GemmFwdBase : ConvSolver @@ -3061,14 +2881,14 @@ struct GemmFwdBase : ConvSolver using ConvSolver::IsApplicable; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWti(static_cast(ctx), problem); + return GetWti(ctx, static_cast(problem)); } private: bool IsApplicable(const ExecutionContext&, const conv::ProblemDescription&) const; - float GetWti(const ExecutionContext& context, const conv::ProblemDescription& problem) const; + float GetWti(const ExecutionContext& ctx, const conv::ProblemDescription& problem) const; friend struct GemmFwd1x1_0_2; friend struct GemmFwd1x1_0_1_int8; @@ -3083,24 +2903,23 @@ struct GemmFwd1x1_0_2 final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3118,24 +2937,23 @@ struct GemmFwd1x1_0_1_int8 final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool 
MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3153,24 +2971,23 @@ struct GemmFwd1x1_0_1 final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3188,24 +3005,23 @@ struct GemmFwdRest final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3221,9 +3037,9 @@ struct GemmBwdBase : ConvSolver using ConvSolver::IsApplicable; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWti(static_cast(ctx), problem); + return GetWti(ctx, static_cast(problem)); } private: @@ -3243,24 +3059,23 @@ struct GemmBwd1x1_stride2 final : GemmBwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, 
static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3278,24 +3093,23 @@ struct GemmBwd1x1_stride1 final : GemmBwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3315,24 +3129,23 @@ struct GemmBwdRest final : GemmBwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3348,9 +3161,9 @@ struct GemmWrwBase : ConvSolver using ConvSolver::IsApplicable; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWti(static_cast(ctx), problem); + return GetWti(ctx, static_cast(problem)); } private: @@ -3368,16 +3181,15 @@ struct GemmWrw1x1_stride1 final : GemmWrwBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return 
IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3394,24 +3206,23 @@ struct GemmWrwUniversal final : GemmWrwBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3610,16 +3421,16 @@ struct PerformanceConfigAsmImplicitGemmGTC : PerfConfigBase(); } PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC&) const override; PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC&) const override; }; @@ -4600,7 +4407,7 @@ struct PerformanceConfigHipImplicitGemmFwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4623,18 +4430,17 @@ struct ConvHipImplicitGemmFwdXdlops final } PerformanceConfigHipImplicitGemmFwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; PerformanceConfigHipImplicitGemmFwdXdlops - Search(const ConvolutionContext&, + 
Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; /// \anchor igemm_get_wti_magic_number @@ -4648,7 +4454,7 @@ struct ConvHipImplicitGemmFwdXdlops final // Since we would like to use CK before naive, and use it instead (because // we do expect that CK is faster than Naive), therefore we use a // value bigger than 0.01f, e.g. 0.02f. - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4679,7 +4485,7 @@ struct PerformanceConfigHipImplicitGemmBwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4702,22 +4508,21 @@ struct ConvHipImplicitGemmBwdXdlops final } PerformanceConfigHipImplicitGemmBwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmBwdXdlops&) const override; PerformanceConfigHipImplicitGemmBwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmBwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4748,7 +4553,7 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4771,23 +4576,22 @@ struct ConvHipImplicitGemmGroupFwdXdlops final } PerformanceConfigHipImplicitGemmGroupFwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool - IsValidPerformanceConfig(const
ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmGroupFwdXdlops&) const override; PerformanceConfigHipImplicitGemmGroupFwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmGroupFwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4818,7 +4622,7 @@ struct PerformanceConfigHipImplicitGemm3DGroupFwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4841,24 +4645,23 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final } PerformanceConfigHipImplicitGemm3DGroupFwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; PerformanceConfigHipImplicitGemm3DGroupFwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4889,7 +4692,7 @@ struct PerformanceConfigHipImplicitGemm3DGroupWrwXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4917,24 +4720,23 @@ struct ConvHipImplicitGemm3DGroupWrwXdlops final } PerformanceConfigHipImplicitGemm3DGroupWrwXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const 
PerformanceConfigHipImplicitGemm3DGroupWrwXdlops&) const override; PerformanceConfigHipImplicitGemm3DGroupWrwXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4965,7 +4767,7 @@ struct PerformanceConfigHipImplicitGemm3DGroupBwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4993,24 +4795,23 @@ struct ConvHipImplicitGemm3DGroupBwdXdlops final } PerformanceConfigHipImplicitGemm3DGroupBwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops&) const override; PerformanceConfigHipImplicitGemm3DGroupBwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -5023,7 +4824,7 @@ struct ConvHipImplicitGemm3DGroupBwdXdlops final // Use struct as a syntactic sugar to make the intent as clear as possible. 
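Taken together, the solver.hpp hunks above are one mechanical transformation repeated per solver: the public virtual interface used to accept ConvolutionContext and forward to a private ExecutionContext overload through a static_cast; after the patch, ExecutionContext is the public parameter type, so both the wrapper and the private overload disappear. A condensed before/after sketch, using simplified stand-in types rather than the real MIOpen classes:

// Illustrative only: condensed stand-ins for the real MIOpen classes.
struct ExecutionContext { /* handle, build flags, ... */ };
struct ConvolutionContext : ExecutionContext { };
struct ProblemDescription { };

// Before: the public override took ConvolutionContext and forwarded to a
// private ExecutionContext overload through a static_cast.
struct SolverBefore
{
    bool IsApplicable(const ConvolutionContext& ctx, const ProblemDescription& problem) const
    {
        return IsApplicable(static_cast<const ExecutionContext&>(ctx), problem);
    }

private:
    bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const { return true; }
};

// After: ExecutionContext is the public parameter type, so the forwarding
// wrapper and the private overload are both gone.
struct SolverAfter
{
    bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const { return true; }
};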
struct ThisSolverIsDeprecatedStatic { - static bool IsDisabled(const ConvolutionContext& ctx); + static bool IsDisabled(const ExecutionContext& ctx); }; } // namespace solver diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index f05bbdf712..7bad52ff9e 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -26,7 +26,8 @@ #pragma once #include -#include +#include +#include namespace miopen { @@ -34,9 +35,9 @@ namespace solver { bool ConvDirectNaiveConvIsAssemblyKernel(const ExecutionContext&, const ProblemDescription&); std::string ConvDirectNaiveConvKernelName(const ProblemDescription&); -std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx, +std::string ConvDirectNaiveConvKernelFile(const ExecutionContext& ctx, const ProblemDescription& problem); -std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx, +std::string ConvDirectNaiveConvCompileOption(const ExecutionContext& ctx, const ProblemDescription& problem); bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext&, const ProblemDescription&); diff --git a/src/include/miopen/solver/implicitgemm_util.hpp b/src/include/miopen/solver/implicitgemm_util.hpp index 88262b4a32..d9aad50b98 100644 --- a/src/include/miopen/solver/implicitgemm_util.hpp +++ b/src/include/miopen/solver/implicitgemm_util.hpp @@ -419,7 +419,7 @@ static inline bool IsApplicableXdlops(const ExecutionContext& ctx, ///\todo remove template -inline static auto GetPerformanceConfigBase(const ConvolutionContext& ctx, +inline static auto GetPerformanceConfigBase(const ExecutionContext& ctx, const ProblemDescription& problem) { PerformanceImplicitGemm_t pp; @@ -459,7 +459,7 @@ static inline size_t ComputeLDSRequiredSize(const ProblemDescription& problem, return lds_size; } -static inline bool use_amd_inline_asm(const ConvolutionContext& ctx, +static inline bool use_amd_inline_asm(const ExecutionContext& ctx, const ProblemDescription& problem) { @@ -475,7 +475,7 @@ static inline bool use_amd_inline_asm(const ConvolutionContext& ctx, return !miopen::IsDisabled(MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM{}); } -static inline bool is_use_amd_buffer_load_store(const ConvolutionContext& ctx) +static inline bool is_use_amd_buffer_load_store(const ExecutionContext& ctx) { #if WORKAROUND_MIOPEN_ISSUE_557 const auto device_name = ctx.GetStream().GetDeviceName(); @@ -485,7 +485,7 @@ static inline bool is_use_amd_buffer_load_store(const ConvolutionContext& ctx) #endif } -static inline bool is_use_v_fmac_f32(const ConvolutionContext& ctx) +static inline bool is_use_v_fmac_f32(const ExecutionContext& ctx) { const auto device_name = ctx.GetStream().GetDeviceName(); return StartsWith(device_name, "gfx103"); @@ -570,7 +570,7 @@ int amd_lds_write_max_length() constexpr std::size_t get_lds_max_number_of_byte() { return 65536; } -static inline auto get_static_ck_common_compiler_flag(const ConvolutionContext& ctx) +static inline auto get_static_ck_common_compiler_flag(const ExecutionContext& ctx) { auto compiler_flag = std::string(" --std=c++14"); @@ -601,7 +601,7 @@ static inline auto get_static_ck_common_compiler_flag(const ConvolutionContext& return compiler_flag; } -static inline bool IsComposableKernelSupportedHardware(const ConvolutionContext& c) +static inline bool IsComposableKernelSupportedHardware(const ExecutionContext& c) { return (StartsWith(c.GetStream().GetDeviceName(), "gfx803") && 
c.GetStream().GetMaxComputeUnits() == 64) || diff --git a/src/include/miopen/solver/mlir_common.hpp b/src/include/miopen/solver/mlir_common.hpp index ed5e289117..d926277e4a 100644 --- a/src/include/miopen/solver/mlir_common.hpp +++ b/src/include/miopen/solver/mlir_common.hpp @@ -27,7 +27,8 @@ #ifndef GUARD_MLIR_COMMON_HPP_ #define GUARD_MLIR_COMMON_HPP_ -#include +#include +#include #include @@ -37,13 +38,13 @@ namespace mlir { std::string GetKernelName(const ProblemDescription& problem, bool is_xdlops, int kernel_id = 0); -std::string ConstructBuildOptions(const ConvolutionContext& ctx, +std::string ConstructBuildOptions(const ExecutionContext& ctx, const ProblemDescription& problem, bool is_xdlops, int kernel_id = 0); template -std::string ConstructBuildOptions(const ConvolutionContext& ctx, +std::string ConstructBuildOptions(const ExecutionContext& ctx, const ProblemDescription& problem, const T& perf_config, bool is_xdlops, diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 8de5e6c87c..522f5931b5 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -206,7 +206,7 @@ static auto GetBwdWrW2DSolvers() static auto GetFFTSolvers() { return miopen::solver::SolverContainer{}; } std::vector -FindAllGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -214,14 +214,13 @@ FindAllGemmSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllGemmWorkspaceSize(const miopen::ConvolutionContext& ctx, - const miopen::ProblemDescription& problem) +AllGemmWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetGemmSolvers().GetWorkspaceSizes(ctx, problem); } std::vector -FindAllDirectSolutions(const miopen::ConvolutionContext& ctx, +FindAllDirectSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -229,28 +228,28 @@ FindAllDirectSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllDirectForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetDirectSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindAllWinogradWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllWinogradWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetWindogradSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindWinogradWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindWinogradWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetWindogradWrWSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindAllImplicitGemmWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { #if WORKAROUND_SWDEV_227826 @@ -264,7 +263,7 @@ FindAllImplicitGemmWorkspaceSizes(const miopen::ConvolutionContext& ctx, } std::vector -FindAllImplicitGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -280,7 +279,7 @@ FindAllImplicitGemmSolutions(const 
miopen::ConvolutionContext& ctx, } std::vector -FindAllWinogradSolutions(const miopen::ConvolutionContext& ctx, +FindAllWinogradSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -288,7 +287,7 @@ FindAllWinogradSolutions(const miopen::ConvolutionContext& ctx, } std::vector -FindWinogradWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindWinogradWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -296,14 +295,14 @@ FindWinogradWrWAllSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllDirectBwdWrW2DWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectBwdWrW2DWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetBwdWrW2DSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindImplicitGemmWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { #if WORKAROUND_SWDEV_227826 @@ -317,7 +316,7 @@ FindImplicitGemmWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, } std::vector -FindImplicitGemmWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -334,7 +333,7 @@ FindImplicitGemmWrWAllSolutions(const miopen::ConvolutionContext& ctx, } std::vector -FindAllBwdWrW2DSolutions(const miopen::ConvolutionContext& ctx, +FindAllBwdWrW2DSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -342,7 +341,7 @@ FindAllBwdWrW2DSolutions(const miopen::ConvolutionContext& ctx, } std::vector -FindAllFFTSolutions(const miopen::ConvolutionContext& ctx, +FindAllFFTSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -350,7 +349,7 @@ FindAllFFTSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllFFTForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllFFTForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetFFTSolvers().GetWorkspaceSizes(ctx, problem); diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 07e6f28b19..8c042e3e7b 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -113,12 +113,11 @@ static Invoker PrepareInvoker(ExecutionContext ctx, problem.SetupFloats(ctx); ctx.do_search = false; - const auto legacy_ctx = ConvolutionContext{ctx}; const auto legacy_problem = ProblemDescription{problem}; const auto solver = solver_id.GetSolver(); auto db = GetDb(ctx); auto solution = - solver.FindSolution(legacy_ctx, legacy_problem, db, {}); // auto tune is not expected here + solver.FindSolution(ctx, legacy_problem, db, {}); // auto tune is not expected here auto& handle = ctx.GetStream(); auto invoker = handle.PrepareInvoker(*solution.invoker_factory, solution.construction_params); const auto algo = AlgorithmName{solver_id.GetAlgo(problem.GetDirection())}; @@ -198,15 +197,15 @@ static inline std::vector FindConvolution(const ExecutionContext& ctx else { results = UserFindDbRecord::TryLoad(ctx.GetStream(), problem, [&](DbRecord& record) { - auto conv_ctx = 
ConvolutionContext{ctx}; - conv_ctx.use_dynamic_solutions_only = findMode.IsDynamicHybrid(ctx); + auto ctx_copy = ctx; + ctx_copy.use_dynamic_solutions_only = findMode.IsDynamicHybrid(ctx); auto legacy_problem = ProblemDescription(problem); ConvFindCore(invoke_ctx, record, - conv_ctx, + ctx_copy, legacy_problem, - conv.IsWinograd3x3SupportedAndFast(conv_ctx, legacy_problem), + conv.IsWinograd3x3SupportedAndFast(ctx_copy, legacy_problem), GetConvSolverFinders()); }); } @@ -519,7 +518,7 @@ struct SolutionTimeComparator }; std::vector -ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, +ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& ctx, const conv::ProblemDescription& problem, const size_t maxSolutionCount) const { @@ -531,7 +530,6 @@ ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, /// \todo This is terrible. Should do away when we converge to /// single conv::ProblemDescription type. - const auto ctx = ConvolutionContext{exec_ctx}; const auto legacy_problem = ProblemDescription{problem}; const auto& inDesc = (problem.GetDirection() == conv::Direction::Forward) ? problem.GetIn() : problem.GetOut(); @@ -547,7 +545,7 @@ ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, #if MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK if(!miopen::IsDisabled(MIOPEN_DEBUG_ENABLE_AI_IMMED_MODE_FALLBACK{})) { - const static std::string arch = exec_ctx.GetStream().GetDeviceName(); + const static std::string arch = ctx.GetStream().GetDeviceName(); auto solvers = ai::immed_mode::PredictSolver(legacy_problem, ctx, arch); if(!solvers.empty()) { @@ -618,7 +616,7 @@ ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, return interim; } -std::vector GetSolutions(const ExecutionContext& exec_ctx, +std::vector GetSolutions(const ExecutionContext& ctx, const conv::ProblemDescription& problem, const size_t maxSolutionCount) { @@ -633,7 +631,7 @@ std::vector GetSolutions(const ExecutionContext& exec_ctx, break; } - const FindDbRecord fdb_record{exec_ctx.GetStream(), problem}; + const FindDbRecord fdb_record{ctx.GetStream(), problem}; if(fdb_record.empty()) return {}; @@ -641,13 +639,6 @@ std::vector GetSolutions(const ExecutionContext& exec_ctx, auto interim = std::vector{}; interim.reserve(20); // Heuristic for speed. - // Individual Solvers can be enabled/disabled by environment settings. - // Applicability is also affected by presence of external tools (e.g. assembler) - // ROCm version, specific features of GPU (like xnack) etc. - // All the above can be found by calling IsApplicable(). - // We need fully initialized context for this, see below. 
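The comment block being deleted here still describes what the surrounding loop does: records read back from the find-db may name solvers that are no longer usable on the current machine (environment switches, missing assembler, ROCm version, GPU features such as xnack), so each one is re-checked for applicability against a fully initialized context. A hedged sketch of that filtering idea; FindDbEntry and FilterApplicable are invented names, not MIOpen API.

#include <functional>
#include <string>
#include <vector>

// Illustrative only: stand-ins for find-db records, not MIOpen types.
struct FindDbEntry
{
    std::string solver_id; // solver recorded at Find() time
    float time;            // measured kernel time, milliseconds
};

// Keep only the records whose solver is still applicable on this machine.
inline std::vector<FindDbEntry>
FilterApplicable(const std::vector<FindDbEntry>& stored,
                 const std::function<bool(const std::string&)>& is_applicable_now)
{
    std::vector<FindDbEntry> usable;
    for(const auto& entry : stored)
        if(is_applicable_now(entry.solver_id))
            usable.push_back(entry);
    return usable;
}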
-    auto ctx = ConvolutionContext{exec_ctx};
-
     for(const auto& pair : fdb_record)
     {
         const auto algo = static_cast<miopenConvAlgorithm_t>(algo_resolver(pair.second.algorithm));
@@ -719,7 +710,7 @@ std::size_t ConvolutionDescriptor::GetForwardSolutionWorkspaceSize(Handle& handl
         return 0;
     const auto problem =
         conv::ProblemDescription{xDesc, wDesc, yDesc, *this, conv::Direction::Forward};
-    auto ctx = ConvolutionContext{};
+    auto ctx = ExecutionContext{};
     ctx.SetStream(&handle);
     if(sol.IsApplicable(ctx, problem))
         return sol.GetWorkspaceSize(ctx, problem);
@@ -928,7 +919,7 @@ std::size_t ConvolutionDescriptor::GetBackwardSolutionWorkspaceSize(Handle& hand
         return 0;
     const auto problem =
         conv::ProblemDescription{dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData};
-    auto ctx = ConvolutionContext{};
+    auto ctx = ExecutionContext{};
     ctx.SetStream(&handle);
     if(sol.IsApplicable(ctx, problem))
         return sol.GetWorkspaceSize(ctx, problem);
@@ -1126,7 +1117,7 @@ std::size_t ConvolutionDescriptor::GetWrwSolutionWorkspaceSize(Handle& handle,
         return 0;
     const auto problem =
         conv::ProblemDescription{dyDesc, dwDesc, xDesc, *this, conv::Direction::BackwardWeights};
-    auto ctx = ConvolutionContext{};
+    auto ctx = ExecutionContext{};
     ctx.SetStream(&handle);
     if(sol.IsApplicable(ctx, problem))
         return sol.GetWorkspaceSize(ctx, problem);
diff --git a/src/problem.cpp b/src/problem.cpp
index 35a85074fe..afc806b3a2 100644
--- a/src/problem.cpp
+++ b/src/problem.cpp
@@ -347,7 +347,7 @@ std::vector<Solution> Problem::FindSolutionsImpl(Handle& handle,
     const auto legacy_problem = ProblemDescription{conv_problem};
     const auto netcfg         = conv_problem.BuildConfKey();
-    auto conv_ctx             = ConvolutionContext{{&handle}};
+    auto conv_ctx             = ExecutionContext{&handle};
     conv_problem.SetupFloats(conv_ctx);
 
     decltype(auto) db = GetDb(conv_ctx);
diff --git a/src/solution.cpp b/src/solution.cpp
index 5f5fa18512..97d4420cb8 100644
--- a/src/solution.cpp
+++ b/src/solution.cpp
@@ -177,7 +177,7 @@ void Solution::RunImpl(Handle& handle,
     }
 
     const auto legacy_problem = ProblemDescription{conv_problem};
-    auto conv_ctx             = ConvolutionContext{{&handle}};
+    auto conv_ctx             = ExecutionContext{&handle};
     conv_problem.SetupFloats(conv_ctx);
 
     decltype(auto) db = GetDb(conv_ctx);
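The three workspace-size getters above converge on one shape: a default-constructed ExecutionContext with only a stream attached. A condensed sketch of that shared pattern, not part of the patch (the helper name is hypothetical; the solver type is templated so no concrete API is assumed):

    // Sketch only: the immediate-mode workspace query the hunks above settle on.
    template <class Solver>
    std::size_t QueryWorkspaceSketch(miopen::Handle& handle,
                                     const Solver& sol,
                                     const miopen::conv::ProblemDescription& problem)
    {
        auto ctx = miopen::ExecutionContext{};
        ctx.SetStream(&handle); // the only setup the context needs here
        return sol.IsApplicable(ctx, problem) ? sol.GetWorkspaceSize(ctx, problem)
                                              : std::size_t{0};
    }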
diff --git a/src/solver.cpp b/src/solver.cpp
index 0b623a5178..d83935e646 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -573,7 +573,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     // IMPORTANT: New solvers should be added to the end of the function!
 }
 
-bool ThisSolverIsDeprecatedStatic::IsDisabled(const ConvolutionContext& ctx)
+bool ThisSolverIsDeprecatedStatic::IsDisabled(const ExecutionContext& ctx)
 {
     static const bool device_is_allowed = [&]() {
         if(miopen::IsEnabled(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS{}))
diff --git a/src/solver/conv_MP_bidirectional_winograd.cpp b/src/solver/conv_MP_bidirectional_winograd.cpp
index 5d638469d5..a653157f58 100644
--- a/src/solver/conv_MP_bidirectional_winograd.cpp
+++ b/src/solver/conv_MP_bidirectional_winograd.cpp
@@ -177,7 +177,7 @@ static bool IsApplicableGEMM(const ProblemDescription& problem)
 }
 
 template <int WinoDataH, int WinoDataW>
-static bool IsApplicableTransform(const ConvolutionContext& ctx, const ProblemDescription& problem)
+static bool IsApplicableTransform(const ExecutionContext& ctx, const ProblemDescription& problem)
 {
 #if MIOPEN_BACKEND_HIP
     if(!ctx.use_asm_kernels)
@@ -319,7 +319,7 @@ static bool IsApplicableTransform(const ConvolutionContext& ctx, const ProblemDe
 
 template <int WinoDataH, int WinoDataW>
 bool ConvMPBidirectWinograd<WinoDataH, WinoDataW>::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     // HIP backend required for sending ptr (buffer + offset)
     // ROCBLAS for GEMM step
@@ -359,7 +359,7 @@ bool ConvMPBidirectWinograd::IsA
 
 template <int WinoDataH, int WinoDataW>
 size_t ConvMPBidirectWinograd<WinoDataH, WinoDataW>::GetWorkspaceSize(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     const miopenDataType_t transform_data_type =
         miopen::IsEnabled(MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM{}) ?
@@ -378,7 +378,7 @@ size_t ConvMPBidirectWinograd::G
 }
 
 template <int WinoDataH, int WinoDataW>
-static InvokerFactory MakeWinogradInvokerFactory(const ConvolutionContext& ctx,
+static InvokerFactory MakeWinogradInvokerFactory(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  InvokerFactory xdlops_factory = InvokerFactory(),
                                                  bool isXdlops                 = false)
@@ -640,7 +640,7 @@ static InvokerFactory MakeWinogradInvokerFactory(const ConvolutionContext& ctx,
 
 template <int WinoDataH, int WinoDataW>
 ConvSolution ConvMPBidirectWinograd<WinoDataH, WinoDataW>::GetSolution(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     ConvSolution result;
     result.workspace_sz = GetWorkspaceSize(ctx, problem);
@@ -724,14 +724,14 @@ template struct ConvMPBidirectWinograd<4, 3>;
 template struct ConvMPBidirectWinograd<5, 3>;
 template struct ConvMPBidirectWinograd<6, 3>;
 
-// ConvolutionContext and ProblemDescription transformation
+// ExecutionContext and ProblemDescription transformation
 // for winograd buffers calculation using xdlops_convolution
 template <int WinoDataH, int WinoDataW>
-ConvolutionContext ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::
-    GetTransformedConvContext(const ConvolutionContext& ctx,
+ExecutionContext ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::
+    GetTransformedConvContext(const ExecutionContext& ctx,
                               const ProblemDescription& transformed_problem) const
 {
-    auto transformed_ctx = ConvolutionContext{static_cast<const ExecutionContext&>(ctx)};
+    auto transformed_ctx = ExecutionContext{static_cast<const ExecutionContext&>(ctx)};
     transformed_problem.SetupFloats(transformed_ctx);
 
     return transformed_ctx;
@@ -846,7 +846,7 @@ static conv::DataInvokeParams GetTransformedInvokeContext(const ProblemDescripti
 
 template <int WinoDataH, int WinoDataW>
 bool ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     static const int wino_data_tile = std::max(WinoDataH, WinoDataW);
@@ -878,7 +878,7 @@ bool ConvMPBidirectWinograd_xdlops
 ConvSolution ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmForwardV4R4Xdlops& config) const
 {
@@ -918,7 +918,7 @@ ConvMPBidirectWinograd_xdlops::G
 
 template <int WinoDataH, int WinoDataW>
 PerformanceImplicitGemmForwardV4R4Xdlops
 ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::Search(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_1x1u.cpp b/src/solver/conv_asm_1x1u.cpp
index 4da4f6dc02..0664c32eb8 100644
--- a/src/solver/conv_asm_1x1u.cpp
+++ b/src/solver/conv_asm_1x1u.cpp
@@ -386,7 +386,7 @@ bool PerformanceConfigConvAsm1x1U::ModelApplyToken(int index,
     return this->IsPartiallyValid(problem, index + 1);
 }
 
-static bool IsModelApplicable(const ConvolutionContext& ctx, const ProblemDescription& problem)
+static bool IsModelApplicable(const ExecutionContext& ctx, const ProblemDescription& problem)
 {
     if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR{}))
         return false;
@@ -415,7 +415,7 @@ static std::vector<float> TransformFeatures(const ProblemDescription& problem, s
     return features;
 }
 
-void PerformanceConfigConvAsm1x1U::RunParmeterPredictionModel(const ConvolutionContext& ctx,
+void PerformanceConfigConvAsm1x1U::RunParmeterPredictionModel(const ExecutionContext& ctx,
                                                               const ProblemDescription& problem,
                                                               bool& valid)
 {
@@ -479,7 +479,7 @@ void PerformanceConfigConvAsm1x1U::StaticHeuristic(const ProblemDescription& pro
     }
 }
 
-void PerformanceConfigConvAsm1x1U::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceConfigConvAsm1x1U::HeuristicInit(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem)
 {
     if(problem.GetInDataType() == miopenDouble)
@@ -501,7 +501,7 @@ void PerformanceConfigConvAsm1x1U::HeuristicInit(const ConvolutionContext& ctx,
 }
 
 PerformanceConfigConvAsm1x1U
-ConvAsm1x1U::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvAsm1x1U::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                          const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsm1x1U pp;
@@ -510,15 +510,14 @@ ConvAsm1x1U::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
     return pp;
 }
 
-bool ConvAsm1x1U::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvAsm1x1U::IsValidPerformanceConfig(const ExecutionContext&,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigConvAsm1x1U& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
-bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx,
-                               const ProblemDescription& problem) const
+bool ConvAsm1x1U::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U{}))
         return false;
@@ -620,7 +619,7 @@ bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx,
     return ok;
 }
 
-size_t ConvAsm1x1U::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvAsm1x1U::GetWorkspaceSize(const ExecutionContext&,
                                      const ProblemDescription& problem) const
 {
     if(UseSubsample(problem) || UseUpsample(problem))
@@ -641,7 +640,7 @@ static int divide_round_plus_inf(const int x, const int y)
     return x / y;
 }
 
-ConvSolution ConvAsm1x1U::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsm1x1U::GetSolution(const ExecutionContext& ctx,
                                       const ProblemDescription& problem,
                                       const PerformanceConfigConvAsm1x1U& config) const
 {
@@ -914,7 +913,7 @@ ConvSolution ConvAsm1x1U::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsm1x1U ConvAsm1x1U::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsm1x1U ConvAsm1x1U::Search(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_1x1u_stride2.cpp b/src/solver/conv_asm_1x1u_stride2.cpp
index ba23b9d9dc..b9925ee30c 100644
--- a/src/solver/conv_asm_1x1u_stride2.cpp
+++ b/src/solver/conv_asm_1x1u_stride2.cpp
@@ -460,7 +460,7 @@ void PerformanceConfigConvAsm1x1UV2::HeuristicInit(const ProblemDescription& pro
 }
 
 PerformanceConfigConvAsm1x1UV2
-ConvAsm1x1UV2::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvAsm1x1UV2::GetDefaultPerformanceConfig(const ExecutionContext&,
                                            const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsm1x1UV2 pp;
@@ -469,14 +469,14 @@ ConvAsm1x1UV2::GetDefaultPerformanceConfig(const ConvolutionContext&,
     return pp;
 }
 
-bool ConvAsm1x1UV2::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvAsm1x1UV2::IsValidPerformanceConfig(const ExecutionContext&,
                                              const ProblemDescription& problem,
                                              const PerformanceConfigConvAsm1x1UV2& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
-bool ConvAsm1x1UV2::IsApplicable(const ConvolutionContext& ctx,
+bool ConvAsm1x1UV2::IsApplicable(const ExecutionContext& ctx,
                                  const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2{}))
@@ -594,7 +594,7 @@ bool ConvAsm1x1UV2::IsApplicable(const ConvolutionContext& ctx,
     return ok;
 }
 
-ConvSolution ConvAsm1x1UV2::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsm1x1UV2::GetSolution(const ExecutionContext& ctx,
                                         const ProblemDescription& problem,
                                         const PerformanceConfigConvAsm1x1UV2& config) const
 {
@@ -754,7 +754,7 @@ ConvSolution ConvAsm1x1UV2::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsm1x1UV2 ConvAsm1x1UV2::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsm1x1UV2 ConvAsm1x1UV2::Search(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem,
                                                      const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_3x3u.cpp b/src/solver/conv_asm_3x3u.cpp
index b185b959af..18f07b9630 100644
--- a/src/solver/conv_asm_3x3u.cpp
+++ b/src/solver/conv_asm_3x3u.cpp
@@ -150,7 +150,7 @@ void PerformanceConfigConvAsm3x3U::HeuristicInit(const ProblemDescription& probl
 }
 
 PerformanceConfigConvAsm3x3U
-ConvAsm3x3U::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvAsm3x3U::GetDefaultPerformanceConfig(const ExecutionContext&,
                                          const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsm3x3U pp;
@@ -159,15 +159,14 @@ ConvAsm3x3U::GetDefaultPerformanceConfig(const ConvolutionContext&,
     return pp;
 }
 
-bool ConvAsm3x3U::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvAsm3x3U::IsValidPerformanceConfig(const ExecutionContext&,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigConvAsm3x3U& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
-bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx,
-                               const ProblemDescription& problem) const
+bool ConvAsm3x3U::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U{}))
         return false;
@@ -242,7 +241,7 @@ bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx,
     // clang-format on
 }
 
-ConvSolution ConvAsm3x3U::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsm3x3U::GetSolution(const ExecutionContext& ctx,
                                       const ProblemDescription& problem,
                                       const PerformanceConfigConvAsm3x3U& config) const
 {
@@ -321,7 +320,7 @@ ConvSolution ConvAsm3x3U::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsm3x3U ConvAsm3x3U::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsm3x3U ConvAsm3x3U::Search(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_dir_BwdWrW1x1.cpp b/src/solver/conv_asm_dir_BwdWrW1x1.cpp
index b6b2458157..0abe71326f 100644
--- a/src/solver/conv_asm_dir_BwdWrW1x1.cpp
+++ b/src/solver/conv_asm_dir_BwdWrW1x1.cpp
@@ -307,7 +307,7 @@ bool PerformanceConfigConvAsmBwdWrW1x1::IsValidValue() const
         && IsFromPack<0, 1, 2, 3, 4>(data_prefetch);
     // clang-format on
 }
 
-bool PerformanceConfigConvAsmBwdWrW1x1::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConfigConvAsmBwdWrW1x1::IsValid(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem) const
 {
@@ -362,7 +362,7 @@ bool PerformanceConfigConvAsmBwdWrW1x1::IsValid(const ConvolutionContext& ctx,
     return true;
 }
 
-void PerformanceConfigConvAsmBwdWrW1x1::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceConfigConvAsmBwdWrW1x1::HeuristicInit(const ExecutionContext& ctx,
                                                       const ProblemDescription& problem)
 {
     short_store =
@@ -449,7 +449,7 @@ void PerformanceConfigConvAsmBwdWrW1x1::HeuristicInit(const ConvolutionContext&
 }
 
 PerformanceConfigConvAsmBwdWrW1x1
-ConvAsmBwdWrW1x1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvAsmBwdWrW1x1::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                               const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsmBwdWrW1x1 pp;
@@ -459,14 +459,14 @@ ConvAsmBwdWrW1x1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
 }
 
 bool ConvAsmBwdWrW1x1::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigConvAsmBwdWrW1x1& config) const
 {
     return config.IsValidValue() && config.IsValid(ctx, problem);
 }
 
-bool ConvAsmBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvAsmBwdWrW1x1::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1{}))
@@ -549,7 +549,7 @@ static int divide_round_plus_inf(const int x, const int y)
     return x / y;
 }
 
-size_t ConvAsmBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvAsmBwdWrW1x1::GetWorkspaceSize(const ExecutionContext&,
                                           const ProblemDescription& problem) const
 {
     if(UseSubsample(problem))
@@ -563,7 +563,7 @@ size_t ConvAsmBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
     return 0;
 }
 
-ConvSolution ConvAsmBwdWrW1x1::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsmBwdWrW1x1::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigConvAsmBwdWrW1x1& config) const
 {
@@ -851,7 +851,7 @@ ConvSolution ConvAsmBwdWrW1x1::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsmBwdWrW1x1 ConvAsmBwdWrW1x1::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsmBwdWrW1x1 ConvAsmBwdWrW1x1::Search(const ExecutionContext& ctx,
                                                            const ProblemDescription& problem,
                                                            const AnyInvokeParams& invoke_ctx) const
 {
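Every assembly solver in this run of files is edited the same way, so the surface they all converge on is worth stating once. A sketch, not part of the patch, with a hypothetical solver name and PerfCfg as a placeholder for the solver's performance-config type:

    // Sketch only: the ExecutionContext-based interface shape after the rename.
    struct ConvExampleSolverSketch // hypothetical
    {
        bool IsApplicable(const miopen::ExecutionContext& ctx,
                          const miopen::ProblemDescription& problem) const;
        size_t GetWorkspaceSize(const miopen::ExecutionContext& ctx,
                                const miopen::ProblemDescription& problem) const;
        PerfCfg GetDefaultPerformanceConfig(const miopen::ExecutionContext& ctx,
                                            const miopen::ProblemDescription& problem) const;
        bool IsValidPerformanceConfig(const miopen::ExecutionContext& ctx,
                                      const miopen::ProblemDescription& problem,
                                      const PerfCfg& config) const;
        PerfCfg Search(const miopen::ExecutionContext& ctx,
                       const miopen::ProblemDescription& problem,
                       const miopen::AnyInvokeParams& invoke_ctx) const;
        miopen::solver::ConvSolution GetSolution(const miopen::ExecutionContext& ctx,
                                                 const miopen::ProblemDescription& problem,
                                                 const PerfCfg& config) const;
    };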
diff --git a/src/solver/conv_asm_dir_BwdWrW3x3.cpp b/src/solver/conv_asm_dir_BwdWrW3x3.cpp
index 7b0b0567d8..ae58cfcd9b 100644
--- a/src/solver/conv_asm_dir_BwdWrW3x3.cpp
+++ b/src/solver/conv_asm_dir_BwdWrW3x3.cpp
@@ -143,7 +143,7 @@ static bool IsReverseInOutAllowed(const ProblemDescription& problem)
 
 inline int elements_in_dword(const ProblemDescription& problem) { return problem.IsFp16() ? 2 : 1; }
 
-bool PerformanceConfigAsmDirect3x3WrW::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConfigAsmDirect3x3WrW::IsValid(const ExecutionContext& ctx,
                                                const ProblemDescription& problem) const
 {
     if(!IsValidValue())
@@ -250,7 +250,7 @@ bool PerformanceConfigAsmDirect3x3WrW::IsValid(const ConvolutionContext& ctx,
     return true;
 }
 
-void PerformanceConfigAsmDirect3x3WrW::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceConfigAsmDirect3x3WrW::HeuristicInit(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem)
 {
     limit_wave_cnt = 0;
@@ -332,7 +332,7 @@ void PerformanceConfigAsmDirect3x3WrW::HeuristicInit(const ConvolutionContext& c
 }
 
 PerformanceConfigAsmDirect3x3WrW
-ConvAsmBwdWrW3x3::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvAsmBwdWrW3x3::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                               const ProblemDescription& problem) const
 {
     PerformanceConfigAsmDirect3x3WrW pp;
@@ -342,14 +342,14 @@ ConvAsmBwdWrW3x3::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
 }
 
 bool ConvAsmBwdWrW3x3::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmDirect3x3WrW& config) const
 {
     return config.IsValidValue() && config.IsValid(ctx, problem);
 }
 
-bool ConvAsmBwdWrW3x3::IsApplicable(const ConvolutionContext& ctx,
+bool ConvAsmBwdWrW3x3::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3{}))
@@ -445,7 +445,7 @@ bool ConvAsmBwdWrW3x3::IsApplicable(const ConvolutionContext& ctx,
     return ok;
 }
 
-ConvSolution ConvAsmBwdWrW3x3::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsmBwdWrW3x3::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigAsmDirect3x3WrW& config) const
 {
@@ -562,7 +562,7 @@ ConvSolution ConvAsmBwdWrW3x3::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigAsmDirect3x3WrW ConvAsmBwdWrW3x3::Search(const ConvolutionContext& ctx,
+PerformanceConfigAsmDirect3x3WrW ConvAsmBwdWrW3x3::Search(const ExecutionContext& ctx,
                                                           const ProblemDescription& problem,
                                                           const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
index 71c53f61a9..9cfdd8aeea 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
@@ -461,7 +461,7 @@ GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(
 }
 
 void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(
-    const ConvolutionContext&, const ProblemDescription& problem)
+    const ExecutionContext&, const ProblemDescription& problem)
 {
     static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
         std::make_tuple(128, 128, 16),
@@ -887,7 +887,7 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC
 ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC pp;
     pp.HeuristicInit(ctx, problem);
@@ -895,7 +895,7 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetDefaultPerformanceConfig(
     return pp;
 }
 
 bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) const
 {
@@ -903,7 +903,7 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsValidPerformanceConfig(
 }
 
 PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC
-ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -911,7 +911,7 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ConvolutionContext& ctx
 }
 
 bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC{}))
         return false;
@@ -977,7 +977,7 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(
 }
 
 size_t ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     const int hi = problem.GetOutHeight_();
     const int wi = problem.GetOutWidth_();
@@ -1031,7 +1031,7 @@ size_t ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp
index bbedf8d680..b16258235e 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp
@@ -257,7 +257,7 @@ static std::tuple
            // splits_4G
 GetImplicitGemmGtcDynamicFwdDlopsNCHWCKernel(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config)
 {
@@ -518,7 +518,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC
 ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC pp;
     pp.HeuristicInit(problem);
@@ -527,14 +527,14 @@ ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::GetDefaultPerformanceConfig(
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
 PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC
-ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -542,7 +542,7 @@ ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::Search(const ConvolutionContext& ctx
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_DLOPS_NCHWC{}))
         return false;
@@ -591,7 +591,7 @@ bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsApplicable(
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
index e315fd0895..4ab9ce1c37 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
@@ -360,7 +360,7 @@ GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(
 }
 
 void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(
-    const ConvolutionContext&, const ProblemDescription& problem)
+    const ExecutionContext&, const ProblemDescription& problem)
 {
     static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
         std::make_tuple(128, 128, 16),
@@ -768,7 +768,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
 ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC pp;
     pp.HeuristicInit(ctx, problem);
@@ -777,7 +777,7 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetDefaultPerformanceConfig(
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) const
 {
@@ -785,7 +785,7 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsValidPerformanceConfig(
 }
 
 PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
-ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -793,7 +793,7 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ConvolutionContext& ctx
 }
 
 size_t ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     const int hi = problem.GetInHeight_();
     const int wi = problem.GetInWidth_();
@@ -849,7 +849,7 @@ size_t ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC{}))
         return false;
@@ -916,7 +916,7 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(
     return true;
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp
index d7395bb0e9..8560c65052 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp
@@ -256,7 +256,7 @@ std::string PerformanceConfigAsmImplicitGemmGTC::ToString() const
     return ss.str();
 }
 
-std::string PerformanceConfigAsmImplicitGemmGTC::ToKernelName(const ConvolutionContext& ctx) const
+std::string PerformanceConfigAsmImplicitGemmGTC::ToKernelName(const ExecutionContext& ctx) const
 {
     std::ostringstream kernel_name;
     const auto device_name = ctx.GetStream().GetDeviceName();
@@ -487,7 +487,7 @@ std::string PerformanceConfigAsmImplicitGemmGTCvector::ToString() const
 }
 
 std::string
-PerformanceConfigAsmImplicitGemmGTCvector::ToKernelName(const ConvolutionContext& ctx) const
+PerformanceConfigAsmImplicitGemmGTCvector::ToKernelName(const ExecutionContext& ctx) const
 {
     std::ostringstream kernel_name;
     const auto device_name = ctx.GetStream().GetDeviceName();
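Both ToKernelName() overloads above need only one thing from the context: the stream's device name, which ExecutionContext still provides. A sketch of that pattern, not part of the patch (the helper name and the name prefix are illustrative placeholders):

    #include <sstream>
    #include <string>

    // Sketch only: device-name-based kernel naming via ExecutionContext.
    std::string ToKernelNameSketch(const miopen::ExecutionContext& ctx)
    {
        std::ostringstream kernel_name;
        const auto device_name = ctx.GetStream().GetDeviceName();
        kernel_name << "igemm_gtc_" << device_name << "_tile_suffix"; // placeholder suffix
        return kernel_name.str();
    }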
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
index c8dee39a79..8ac238395a 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
@@ -451,7 +451,7 @@ void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::SetParamsForKSplit(
 }
 
 void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit(
-    const ConvolutionContext& ctx, const ProblemDescription& problem)
+    const ExecutionContext& ctx, const ProblemDescription& problem)
 {
     static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
         std::make_tuple(128, 128, 16),
@@ -816,7 +816,7 @@ bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
 ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC pp;
     pp.HeuristicInit(ctx, problem);
@@ -824,14 +824,14 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetDefaultPerformanceConfig(
     return pp;
 }
 
 bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
 PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
-ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -839,7 +839,7 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ConvolutionContext& ctx
 }
 
 bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC{}))
         return false;
@@ -956,7 +956,7 @@ ComputeDynamicIGemmWrwKernelArgsNHWC(const conv::ProblemDescription& problem,
 }
 
 size_t ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     const int hi = problem.GetOutHeight_();
     const int wi = problem.GetOutWidth_();
@@ -1010,7 +1010,7 @@ size_t ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) const
 {
diff --git a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp
index 7ddf2e3049..001f3a8cb7 100644
--- a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp
+++ b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp
@@ -82,7 +82,7 @@ bool PerformanceConvCkIgemmFwdV6r1DlopsNchw::IsValid(const ProblemDescription& p
         ck_utility::get_ck_convolution_problem_descriptor(problem), compile_param);
 }
 
-bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx,
+bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ExecutionContext& ctx,
                                                const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_411729
@@ -121,7 +121,7 @@ bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx,
 }
 
 PerformanceConvCkIgemmFwdV6r1DlopsNchw
-ConvCkIgemmFwdV6r1DlopsNchw::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvCkIgemmFwdV6r1DlopsNchw::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem) const
 {
     for(int i = 0; i < ck::driver::ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetTunableList().size(); ++i)
@@ -138,7 +138,7 @@ ConvCkIgemmFwdV6r1DlopsNchw::GetDefaultPerformanceConfig(const ConvolutionContex
 }
 
 bool ConvCkIgemmFwdV6r1DlopsNchw::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConvCkIgemmFwdV6r1DlopsNchw& config) const
 {
@@ -146,7 +146,7 @@ bool ConvCkIgemmFwdV6r1DlopsNchw::IsValidPerformanceConfig(
 }
 
 ConvSolution
-ConvCkIgemmFwdV6r1DlopsNchw::GetSolution(const ConvolutionContext& ctx,
+ConvCkIgemmFwdV6r1DlopsNchw::GetSolution(const ExecutionContext& ctx,
                                          const ProblemDescription& problem,
                                          const PerformanceConvCkIgemmFwdV6r1DlopsNchw& config) const
 {
@@ -252,7 +252,7 @@ ConvCkIgemmFwdV6r1DlopsNchw::GetSolution(const ConvolutionContext& ctx,
     return sol;
 }
 
-std::size_t ConvCkIgemmFwdV6r1DlopsNchw::GetWorkspaceSize(const ConvolutionContext&,
+std::size_t ConvCkIgemmFwdV6r1DlopsNchw::GetWorkspaceSize(const ExecutionContext&,
                                                           const ProblemDescription& problem) const
 {
     return ck::driver::ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetMaxWorkSpaceSize(
@@ -260,7 +260,7 @@ std::size_t ConvCkIgemmFwdV6r1DlopsNchw::GetWorkspaceSize(const ConvolutionConte
 }
 
 PerformanceConvCkIgemmFwdV6r1DlopsNchw
-ConvCkIgemmFwdV6r1DlopsNchw::Search(const ConvolutionContext& ctx,
+ConvCkIgemmFwdV6r1DlopsNchw::Search(const ExecutionContext& ctx,
                                     const ProblemDescription& problem,
                                     const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp
index 64c95257e6..4df8df5874 100644
--- a/src/solver/conv_direct_naive_conv.cpp
+++ b/src/solver/conv_direct_naive_conv.cpp
@@ -176,7 +176,7 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem)
     return kernel_name.str();
 }
 
-std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx,
+std::string ConvDirectNaiveConvKernelFile(const ExecutionContext& ctx,
                                           const ProblemDescription& problem)
 {
     const auto device_name = ctx.GetStream().GetDeviceName();
@@ -193,7 +193,7 @@ std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx,
     return "naive_conv.cpp";
 }
 
-std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx,
+std::string ConvDirectNaiveConvCompileOption(const ExecutionContext& ctx,
                                              const ProblemDescription& problem)
 {
     std::string filename = ConvDirectNaiveConvKernelFile(ctx, problem);
diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp
index c5d793860c..f8af0ec2d1 100644
--- a/src/solver/conv_direct_naive_conv_bwd.cpp
+++ b/src/solver/conv_direct_naive_conv_bwd.cpp
@@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD)
 namespace miopen {
 namespace solver {
 
-bool ConvDirectNaiveConvBwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvDirectNaiveConvBwd::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     if(!miopen::debug::AlwaysEnableConvDirectNaive &&
@@ -74,7 +74,7 @@ bool ConvDirectNaiveConvBwd::IsApplicable(const ConvolutionContext& ctx,
     return true;
 }
 
-ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp
index fc8d8e77fd..90d8feee31 100644
--- a/src/solver/conv_direct_naive_conv_fwd.cpp
+++ b/src/solver/conv_direct_naive_conv_fwd.cpp
@@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD)
 namespace miopen {
 namespace solver {
 
-bool ConvDirectNaiveConvFwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvDirectNaiveConvFwd::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     if(!miopen::debug::AlwaysEnableConvDirectNaive &&
@@ -74,7 +74,7 @@ bool ConvDirectNaiveConvFwd::IsApplicable(const ConvolutionContext& ctx,
     return true;
 }
 
-ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp
index 2c85949ad4..6fcf2f71d0 100644
--- a/src/solver/conv_direct_naive_conv_wrw.cpp
+++ b/src/solver/conv_direct_naive_conv_wrw.cpp
@@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW)
 namespace miopen {
 namespace solver {
 
-bool ConvDirectNaiveConvWrw::IsApplicable(const ConvolutionContext& ctx,
+bool ConvDirectNaiveConvWrw::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     if(!miopen::debug::AlwaysEnableConvDirectNaive &&
@@ -74,7 +74,7 @@ bool ConvDirectNaiveConvWrw::IsApplicable(const ConvolutionContext& ctx,
     return true;
 }
 
-ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
index 0b880b2fc8..2602c54320 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
@@ -271,7 +271,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::operator==(
 
 PerformanceConfigHipImplicitGemm3DGroupBwdXdlops
 ConvHipImplicitGemm3DGroupBwdXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemm3DGroupBwdXdlops pp;
     pp.HeuristicInit(problem);
@@ -279,7 +279,7 @@ ConvHipImplicitGemm3DGroupBwdXdlops::GetDefaultPerformanceConfig(
 }
 
 bool ConvHipImplicitGemm3DGroupBwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops& config) const
 {
@@ -287,7 +287,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemm3DGroupBwdXdlops
-ConvHipImplicitGemm3DGroupBwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemm3DGroupBwdXdlops::Search(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const AnyInvokeParams& invoke_ctx) const
 {
@@ -295,7 +295,7 @@ ConvHipImplicitGemm3DGroupBwdXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -335,7 +335,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
index 80b969fcbd..f0623c642d 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
@@ -271,7 +271,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::operator==(
 
 PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
 ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemm3DGroupFwdXdlops pp;
     pp.HeuristicInit(problem);
@@ -279,7 +279,7 @@ ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig(
 }
 
 bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const
 {
@@ -287,7 +287,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
-ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const AnyInvokeParams& invoke_ctx) const
 {
@@ -295,7 +295,7 @@ ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -333,7 +333,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
index 7292828f69..6fce8a80b8 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
@@ -267,7 +267,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::operator==(
 
 PerformanceConfigHipImplicitGemm3DGroupWrwXdlops
 ConvHipImplicitGemm3DGroupWrwXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemm3DGroupWrwXdlops pp;
     pp.HeuristicInit(problem);
@@ -275,7 +275,7 @@ ConvHipImplicitGemm3DGroupWrwXdlops::GetDefaultPerformanceConfig(
 }
 
 bool ConvHipImplicitGemm3DGroupWrwXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops& config) const
 {
@@ -283,7 +283,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemm3DGroupWrwXdlops
-ConvHipImplicitGemm3DGroupWrwXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemm3DGroupWrwXdlops::Search(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const AnyInvokeParams& invoke_ctx) const
 {
@@ -291,7 +291,7 @@ ConvHipImplicitGemm3DGroupWrwXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -331,7 +331,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops& config) const
 {
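The three CK-backed grouped solvers above all gate their bodies behind the same compile-time guard, keeping parameters named (hence [[maybe_unused]]) so non-HIP builds still compile. A sketch of the pattern, not part of the patch (CheckCKApplicability is a hypothetical stand-in for the real per-solver checks):

    // Sketch only: the guard pattern shared by the CK-backed solvers above.
    bool IsApplicableSketch([[maybe_unused]] const miopen::ExecutionContext& ctx,
                            [[maybe_unused]] const miopen::ProblemDescription& problem)
    {
    #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
        return CheckCKApplicability(problem); // hypothetical helper
    #else
        return false; // CK path compiled out; parameters stay named above
    #endif
    }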
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
index 7e380f6289..57c63267d2 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
@@ -229,7 +229,7 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::operator==(
 }
 
 PerformanceConfigHipImplicitGemmBwdXdlops
-ConvHipImplicitGemmBwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvHipImplicitGemmBwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                           const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemmBwdXdlops pp;
@@ -238,7 +238,7 @@ ConvHipImplicitGemmBwdXdlops::GetDefaultPerformanceConfig(const ConvolutionConte
 }
 
 bool ConvHipImplicitGemmBwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemmBwdXdlops& config) const
 {
@@ -246,7 +246,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemmBwdXdlops
-ConvHipImplicitGemmBwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdXdlops::Search(const ExecutionContext& ctx,
                                      const ProblemDescription& problem,
                                      const AnyInvokeParams& invoke_ctx) const
 {
@@ -254,7 +254,7 @@ ConvHipImplicitGemmBwdXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -303,7 +303,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
    [[maybe_unused]] const ProblemDescription& problem,
    [[maybe_unused]] const PerformanceConfigHipImplicitGemmBwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp
index c389cb0cee..b2b591b859 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp
@@ -86,7 +86,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::operator==(
 }
 
 std::tuple<int, bool>
-PerformanceImplicitGemmBwdDataV1R1::CalculateGridSize(const ConvolutionContext& ctx,
+PerformanceImplicitGemmBwdDataV1R1::CalculateGridSize(const ExecutionContext& ctx,
                                                       const ProblemDescription& problem) const
 {
     int GridSize = 0;
@@ -180,7 +180,7 @@ PerformanceImplicitGemmBwdDataV1R1::CalculateBlockGemmPerformanceParameters() co
 
 std::tuple<int, int, int, int, bool>
 PerformanceImplicitGemmBwdDataV1R1::CalculateGemmABlockCopyPerformanceParameters(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int ClusterLengths_GemmK = 0;
     int ClusterLengths_GemmM = 0;
@@ -245,7 +245,7 @@ PerformanceImplicitGemmBwdDataV1R1::CalculateGemmABlockCopyPerformanceParameters
 
 std::tuple<int, int, int, int, bool>
 PerformanceImplicitGemmBwdDataV1R1::CalculateGemmBBlockCopyPerformanceParameters(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int ClusterLengths_GemmK = 0;
     int ClusterLengths_GemmN = 0;
@@ -392,7 +392,7 @@ PerformanceImplicitGemmBwdDataV1R1::CalculateGemmCThreadCopyPerformanceParameter
 }
 
 std::tuple<std::size_t, bool> PerformanceImplicitGemmBwdDataV1R1::CalculateLdsNumberOfByte(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     std::size_t lds_size = 0;
@@ -450,7 +450,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::IsValidValue() const
     // clang-format on
 }
 
-bool PerformanceImplicitGemmBwdDataV1R1::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmBwdDataV1R1::IsValid(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     if(!IsValidValue())
@@ -506,7 +506,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::IsValid(const ConvolutionContext& ctx,
     return (valid and lds_size <= get_lds_max_number_of_byte());
 }
 
-void PerformanceImplicitGemmBwdDataV1R1::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmBwdDataV1R1::HeuristicInit(const ExecutionContext& ctx,
                                                        const ProblemDescription& problem)
 {
     PerformanceImplicitGemmBwdDataV1R1 config;
@@ -587,7 +587,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::SetNextValue(const ProblemDescription&)
 }
 
 std::tuple<int, int, int>
-ConvHipImplicitGemmBwdDataV1R1::CalculateGemmSize(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::CalculateGemmSize(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem)
 {
     const auto n = ProblemInterpreter::GetBatchN(problem);
@@ -607,7 +607,7 @@ ConvHipImplicitGemmBwdDataV1R1::CalculateGemmSize(const ConvolutionContext& ctx,
     return std::make_tuple(gemm_m, gemm_n, gemm_k);
 }
 
-size_t ConvHipImplicitGemmBwdDataV1R1::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvHipImplicitGemmBwdDataV1R1::GetWorkspaceSize(const ExecutionContext&,
                                                         const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
@@ -627,7 +627,7 @@ size_t ConvHipImplicitGemmBwdDataV1R1::GetWorkspaceSize(const ConvolutionContext
     }
 }
 
-bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1{}))
@@ -676,14 +676,14 @@ bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ConvolutionContext& ctx,
 }
 
 PerformanceImplicitGemmBwdDataV1R1
-ConvHipImplicitGemmBwdDataV1R1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                             const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase<PerformanceImplicitGemmBwdDataV1R1>(ctx, problem);
 }
 
 bool ConvHipImplicitGemmBwdDataV1R1::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdDataV1R1& config) const
 {
@@ -692,7 +692,7 @@ bool ConvHipImplicitGemmBwdDataV1R1::IsValidPerformanceConfig(
 }
 
 PerformanceImplicitGemmBwdDataV1R1
-ConvHipImplicitGemmBwdDataV1R1::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::Search(const ExecutionContext& ctx,
                                        const ProblemDescription& problem,
                                        const AnyInvokeParams& invoke_ctx) const
 {
@@ -700,7 +700,7 @@ ConvHipImplicitGemmBwdDataV1R1::Search(const ConvolutionContext& ctx,
 }
 
 ConvSolution
-ConvHipImplicitGemmBwdDataV1R1::GetSolution(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::GetSolution(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const PerformanceImplicitGemmBwdDataV1R1& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp
index 5b533d72ee..f657fa74fe 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp
@@ -105,7 +105,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::SetNextValue(const ProblemDescription
     return true;
 }
 
-void PerformanceImplicitGemmBwdV1R1Xdlops::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmBwdV1R1Xdlops::HeuristicInit(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem)
 {
     PerformanceImplicitGemmBwdV1R1Xdlops tmp;
@@ -527,7 +527,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::IsReallyValid(const ProblemDescriptio
 
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmBwdV1R1Xdlops::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     // somehow, 128x128 wave-wise GEMM tend to spill register
     // TODO revisit this when 128x128 wave-wise GEMM become efficient
@@ -659,7 +659,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::IsFastToBeUsedForTuning(
 // Return false, if you don't want to this to be included in tuning range used by generic search
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
-bool PerformanceImplicitGemmBwdV1R1Xdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmBwdV1R1Xdlops::IsValid(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem) const
 {
     return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem);
@@ -667,7 +667,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::IsValid(const ConvolutionContext& ctx
 
 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdV1R1Xdlops& config) const
 {
@@ -699,7 +699,7 @@ ConvHipImplicitGemmBwdDataV1R1Xdlops::CalculateGemmSize(const ProblemDescription
 
 PerformanceImplicitGemmBwdV1R1Xdlops
 ConvHipImplicitGemmBwdDataV1R1Xdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase<PerformanceImplicitGemmBwdV1R1Xdlops>(ctx, problem);
 }
@@ -717,7 +717,7 @@ std::tuple PerformanceImplicitGemmBwdV1R1Xdlops::CalculateLds
 }
 
 std::size_t
-ConvHipImplicitGemmBwdDataV1R1Xdlops::GetWorkspaceSize(const ConvolutionContext&,
+ConvHipImplicitGemmBwdDataV1R1Xdlops::GetWorkspaceSize(const ExecutionContext&,
                                                        const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
@@ -750,7 +750,7 @@ ConvHipImplicitGemmBwdDataV1R1Xdlops::GetWorkspaceSize(const ConvolutionContext&
     }
 }
 
-bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_251757
@@ -809,7 +809,7 @@ bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ConvolutionContext
 }
 
 PerformanceImplicitGemmBwdV1R1Xdlops
-ConvHipImplicitGemmBwdDataV1R1Xdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1Xdlops::Search(const ExecutionContext& ctx,
                                              const ProblemDescription& problem,
                                              const AnyInvokeParams& invoke_ctx) const
 {
@@ -817,7 +817,7 @@ ConvHipImplicitGemmBwdDataV1R1Xdlops::Search(const ConvolutionContext& ctx,
 }
 
 ConvSolution ConvHipImplicitGemmBwdDataV1R1Xdlops::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdV1R1Xdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp
index 65f8cf6675..e60d6c76a3 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp
@@ -523,7 +523,7 @@ bool PerformanceImplicitGemmBwdDataV4R1::IsValid(const ProblemDescription& probl
     return (valid and lds_size <= get_lds_max_number_of_byte());
 }
 
-void PerformanceImplicitGemmBwdDataV4R1::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmBwdDataV4R1::HeuristicInit(const ExecutionContext& ctx,
                                                        const ProblemDescription& problem)
 {
     std::ignore = ctx;
@@ -724,7 +724,7 @@ ConvHipImplicitGemmBwdDataV4R1::CalculateGemmSize(const ProblemDescription& prob
     }
 }
 
-bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_229277_227616_229195
@@ -787,14 +787,14 @@ bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ConvolutionContext& ctx,
 }
 
 PerformanceImplicitGemmBwdDataV4R1
-ConvHipImplicitGemmBwdDataV4R1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV4R1::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                             const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase<PerformanceImplicitGemmBwdDataV4R1>(ctx, problem);
 }
 
 bool ConvHipImplicitGemmBwdDataV4R1::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdDataV4R1& config) const
 {
@@ -803,7 +803,7 @@ bool ConvHipImplicitGemmBwdDataV4R1::IsValidPerformanceConfig(
 }
 
 PerformanceImplicitGemmBwdDataV4R1
-ConvHipImplicitGemmBwdDataV4R1::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV4R1::Search(const ExecutionContext& ctx,
                                        const ProblemDescription& problem,
                                        const AnyInvokeParams& invoke_ctx) const
 {
@@ -811,7 +811,7 @@ ConvHipImplicitGemmBwdDataV4R1::Search(const ConvolutionContext& ctx,
 }
 
 ConvSolution
-ConvHipImplicitGemmBwdDataV4R1::GetSolution(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV4R1::GetSolution(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const PerformanceImplicitGemmBwdDataV4R1& config) const
 {
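GetDefaultPerformanceConfig, IsValidPerformanceConfig, and Search recur in every tunable solver above, so a sketch of how a caller might chain them is useful. This is a simplification under stated assumptions, not the patch's actual find path, which lives in the generic find machinery; the helper name and locals are hypothetical:

    // Sketch only: one plausible way the three tuning entry points relate.
    template <class Solver>
    auto PickConfigSketch(const Solver& s,
                          const miopen::ExecutionContext& ctx,
                          const miopen::ProblemDescription& problem,
                          const miopen::AnyInvokeParams& invoke_ctx,
                          bool do_search)
    {
        // Exhaustive tuning goes through Search() (GenericSearch underneath);
        // otherwise the heuristic default is used after a validity check.
        auto cfg = do_search ? s.Search(ctx, problem, invoke_ctx)
                             : s.GetDefaultPerformanceConfig(ctx, problem);
        if(!s.IsValidPerformanceConfig(ctx, problem, cfg))
            cfg = s.GetDefaultPerformanceConfig(ctx, problem); // heuristic fallback
        return cfg;
    }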
const + const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmBwdDataV4R1Xdlops& config) const { @@ -884,7 +884,7 @@ bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig( } PerformanceImplicitGemmBwdDataV4R1Xdlops -ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -892,7 +892,7 @@ ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx, } ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmBwdDataV4R1Xdlops& config) const { diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp index 773f5a1d32..39e8c71c16 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp @@ -40,7 +40,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1) namespace miopen { namespace solver { -bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1{})) @@ -86,7 +86,7 @@ bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ConvolutionContext& ctx, (c * y * x) % eMultiple == 0 && k % 16 == 0; } -bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1{})) @@ -143,21 +143,21 @@ bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1Fwd::GetDefaultPerformanceConfig(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1Fwd::GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1WrW::GetDefaultPerformanceConfig(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1WrW::GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } bool ConvHipImplicitGemmV4R1Fwd::IsValidPerformanceConfig( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { @@ -166,7 +166,7 @@ bool ConvHipImplicitGemmV4R1Fwd::IsValidPerformanceConfig( } bool ConvHipImplicitGemmV4R1WrW::IsValidPerformanceConfig( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { @@ -175,14 +175,14 @@ bool ConvHipImplicitGemmV4R1WrW::IsValidPerformanceConfig( } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1Fwd::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1Fwd::Search(const ExecutionContext& ctx, const ProblemDescription& 
problem, const AnyInvokeParams& invoke_ctx) const { return GenericSearch(*this, ctx, problem, invoke_ctx); } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1WrW::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1WrW::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -190,7 +190,7 @@ ConvHipImplicitGemmV4R1WrW::Search(const ConvolutionContext& ctx, } ConvSolution -ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1Fwd::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { @@ -393,7 +393,7 @@ ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& ctx, } ConvSolution -ConvHipImplicitGemmV4R1WrW::GetSolution(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1WrW::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp index 07fbae436e..9cbe662180 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp @@ -471,7 +471,7 @@ bool PerformanceImplicitGemmV4R4Fwd::IsValid(const ProblemDescription& problem) return (valid and lds_size <= get_lds_max_number_of_byte()); } -void PerformanceImplicitGemmV4R4Fwd::HeuristicInit(const ConvolutionContext& ctx, +void PerformanceImplicitGemmV4R4Fwd::HeuristicInit(const ExecutionContext& ctx, const ProblemDescription& problem) { std::ignore = ctx; @@ -572,7 +572,7 @@ ConvHipImplicitGemmV4R4Fwd::CalculateGemmSize(const ProblemDescription& problem) return std::make_tuple(gemm_m, gemm_n, gemm_k); } -bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4{})) @@ -610,14 +610,14 @@ bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ConvolutionContext& ctx, } PerformanceImplicitGemmV4R4Fwd -ConvHipImplicitGemmV4R4Fwd::GetDefaultPerformanceConfig(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R4Fwd::GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } bool ConvHipImplicitGemmV4R4Fwd::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmV4R4Fwd& config) const { @@ -626,7 +626,7 @@ bool ConvHipImplicitGemmV4R4Fwd::IsValidPerformanceConfig( } PerformanceImplicitGemmV4R4Fwd -ConvHipImplicitGemmV4R4Fwd::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R4Fwd::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -634,7 +634,7 @@ ConvHipImplicitGemmV4R4Fwd::Search(const ConvolutionContext& ctx, } ConvSolution -ConvHipImplicitGemmV4R4Fwd::GetSolution(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R4Fwd::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R4Fwd& config) const { diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp index 795e3d1704..9c09efe397 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp +++ 
b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp @@ -120,7 +120,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::SetNextValue(const ProblemDescrip return true; } -void PerformanceImplicitGemmForwardV4R4Xdlops::HeuristicInit(const ConvolutionContext& ctx, +void PerformanceImplicitGemmForwardV4R4Xdlops::HeuristicInit(const ExecutionContext& ctx, const ProblemDescription& problem) { PerformanceImplicitGemmForwardV4R4Xdlops tmp; @@ -624,7 +624,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::IsReallyValid( // Return false if a performance config is known to be sub-optimal, comparing to other performance // config inside tuning range bool PerformanceImplicitGemmForwardV4R4Xdlops::IsFastToBeUsedForTuning( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { // somehow, 128x128 wave-wise GEMM tend to spill register // TODO revisit this when 128x128 wave-wise GEMM become efficient @@ -807,7 +807,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::IsFastToBeUsedForTuning( // Return false, if you don't want to this to be included in tuning range used by generic search // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return // false -bool PerformanceImplicitGemmForwardV4R4Xdlops::IsValid(const ConvolutionContext& ctx, +bool PerformanceImplicitGemmForwardV4R4Xdlops::IsValid(const ExecutionContext& ctx, const ProblemDescription& problem) const { return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem); @@ -815,7 +815,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::IsValid(const ConvolutionContext& // Used by GenericSearch, not used by HeuristicInit bool ConvHipImplicitGemmForwardV4R4Xdlops::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops& config) const { @@ -847,7 +847,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops::CalculateGemmSize(const ProblemDescription PerformanceImplicitGemmForwardV4R4Xdlops ConvHipImplicitGemmForwardV4R4Xdlops::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { PerformanceImplicitGemmForwardV4R4Xdlops config; config.HeuristicInit(ctx, problem); @@ -856,7 +856,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops::GetDefaultPerformanceConfig( } ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops& config) const { @@ -966,7 +966,7 @@ ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops::GetSolution( return result; } -bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_XDLOPS{})) @@ -1031,7 +1031,7 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ConvolutionContext } PerformanceImplicitGemmForwardV4R4Xdlops -ConvHipImplicitGemmForwardV4R4Xdlops::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmForwardV4R4Xdlops::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const diff --git 
a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp index 7fa139d21a..d25ca1b68b 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp @@ -144,7 +144,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::SetNextValue(const Pr } void PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::HeuristicInit( - const ConvolutionContext& ctx, const ProblemDescription& problem) + const ExecutionContext& ctx, const ProblemDescription& problem) { PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm tmp; @@ -666,7 +666,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsReallyValid( /// Used by HeuristicInit and IsValid. Return false if a performance config is known /// to be sub-optimal, comparing to other performance config inside tuning range. bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuning( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { // somehow, 128x128 wave-wise GEMM tend to spill register // TODO revisit this when 128x128 wave-wise GEMM become efficient @@ -846,14 +846,14 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuni /// included in tuning range used by generic search. A performance config may still be valid w.r.t /// algorithm correctness, even when IsValid() returns false. bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsValid( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem); } // Used by GenericSearch, not used by HeuristicInit bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm& config) const { @@ -895,7 +895,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::CalculateGemmSize( PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm config; config.HeuristicInit(ctx, problem); @@ -904,7 +904,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig( } ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm& config) const { @@ -1033,7 +1033,7 @@ ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetSolution( } bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsApplicable( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_PADDED_GEMM_XDLOPS{})) return false; @@ -1124,7 +1124,7 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsApplicable( } PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm 
-ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::Search(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem,
                                                          const AnyInvokeParams& invoke_ctx) const
diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp
index 9bd59d36b9..4915c48e2e 100644
--- a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp
@@ -154,7 +154,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::SetNextValue(const ProblemDescrip
     return true;
 }

-void PerformanceImplicitGemmForwardV4R5Xdlops::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmForwardV4R5Xdlops::HeuristicInit(const ExecutionContext& ctx,
                                                              const ProblemDescription& problem)
 {
     PerformanceImplicitGemmForwardV4R5Xdlops tmp;
@@ -676,7 +676,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::IsReallyValid(
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmForwardV4R5Xdlops::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(use_spare_set)
         return true;
@@ -855,7 +855,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::IsFastToBeUsedForTuning(
 // Return false, if you don't want to this to be included in tuning range used by generic search
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
-bool PerformanceImplicitGemmForwardV4R5Xdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmForwardV4R5Xdlops::IsValid(const ExecutionContext& ctx,
                                                        const ProblemDescription& problem) const
 {
     return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem);
@@ -863,7 +863,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::IsValid(const ConvolutionContext&

 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmForwardV4R5Xdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmForwardV4R5Xdlops& config) const
 {
@@ -872,7 +872,7 @@ bool ConvHipImplicitGemmForwardV4R5Xdlops::IsValidPerformanceConfig(

 PerformanceImplicitGemmForwardV4R5Xdlops
 ConvHipImplicitGemmForwardV4R5Xdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceImplicitGemmForwardV4R5Xdlops config;
     config.HeuristicInit(ctx, problem);
@@ -881,7 +881,7 @@ ConvHipImplicitGemmForwardV4R5Xdlops::GetDefaultPerformanceConfig(
 }

 ConvSolution ConvHipImplicitGemmForwardV4R5Xdlops::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmForwardV4R5Xdlops& config) const
 {
@@ -996,7 +996,7 @@ ConvSolution ConvHipImplicitGemmForwardV4R5Xdlops::GetSolution(
     return result;
 }

-bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R5_XDLOPS{}))
@@ -1068,7 +1068,7 @@ bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ConvolutionContext
 }

 PerformanceImplicitGemmForwardV4R5Xdlops
-ConvHipImplicitGemmForwardV4R5Xdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmForwardV4R5Xdlops::Search(const ExecutionContext& ctx,
                                              const ProblemDescription& problem,
                                              const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
index ae2395dd0a..73907eb788 100644
--- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
@@ -230,7 +230,7 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::operator==(
 }

 PerformanceConfigHipImplicitGemmFwdXdlops
-ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                           const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemmFwdXdlops pp;
@@ -239,7 +239,7 @@ ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionConte
 }

 bool ConvHipImplicitGemmFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemmFwdXdlops& config) const
 {
@@ -247,7 +247,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConfigHipImplicitGemmFwdXdlops
-ConvHipImplicitGemmFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmFwdXdlops::Search(const ExecutionContext& ctx,
                                      const ProblemDescription& problem,
                                      const AnyInvokeParams& invoke_ctx) const
 {
@@ -255,7 +255,7 @@ ConvHipImplicitGemmFwdXdlops::Search(const ConvolutionContext& ctx,
 }

 bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -303,7 +303,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
 }

 ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemmFwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
index b51031780f..c2d0b83141 100644
--- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
@@ -258,7 +258,7 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::operator==(

 PerformanceConfigHipImplicitGemmGroupFwdXdlops
 ConvHipImplicitGemmGroupFwdXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemmGroupFwdXdlops pp;
     pp.HeuristicInit(problem);
@@ -266,7 +266,7 @@ ConvHipImplicitGemmGroupFwdXdlops::GetDefaultPerformanceConfig(
 }

 bool ConvHipImplicitGemmGroupFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemmGroupFwdXdlops& config) const
 {
@@ -274,7 +274,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConfigHipImplicitGemmGroupFwdXdlops
-ConvHipImplicitGemmGroupFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmGroupFwdXdlops::Search(const ExecutionContext& ctx,
                                           const ProblemDescription& problem,
                                           const AnyInvokeParams& invoke_ctx) const
 {
@@ -282,7 +282,7 @@ ConvHipImplicitGemmGroupFwdXdlops::Search(const ConvolutionContext& ctx,
 }

 bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -322,7 +322,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable(
 }

 ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemmGroupFwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp b/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
index de11fa0fa7..472b58f913 100644
--- a/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
+++ b/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
@@ -57,7 +57,7 @@ bool PerformanceImplicitGemm::operator==(const PerformanceImplicitGemm& other) c
     // clang-format on
 }

-bool PerformanceImplicitGemm::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemm::IsValid(const ExecutionContext& ctx,
                                       const ProblemDescription& problem) const
 {
     std::size_t N = KernelBatchN(problem);
@@ -173,7 +173,7 @@ bool PerformanceImplicitGemm::IsValid(const ConvolutionContext& ctx,
     return (InBlockCopySubLengths_E == 1 && InBlockCopySubLengths_B == 1);
 }

-bool PerformanceImplicitGemmV4R1::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmV4R1::IsValid(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     std::size_t N = KernelBatchN(problem);
@@ -278,7 +278,7 @@ bool PerformanceImplicitGemmV4R1::IsValid(const ConvolutionContext& ctx,
     return (InBlockCopySubLengths_E == 1 && InBlockCopySubLengths_B == 1);
 }

-void PerformanceImplicitGemm::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemm::HeuristicInit(const ExecutionContext& ctx,
                                             const ProblemDescription& problem)
 {
     // default
diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp
index 0a33f611c0..637486ef50 100644
--- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp
+++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp
@@ -474,7 +474,7 @@ bool PerformanceImplicitGemmV4R4WrW::IsValid(const ProblemDescription& problem)
     return (valid and lds_size <= get_lds_max_number_of_byte());
 }

-void PerformanceImplicitGemmV4R4WrW::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmV4R4WrW::HeuristicInit(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem)
 {
     std::ignore = ctx;
@@ -575,7 +575,7 @@ ConvHipImplicitGemmV4R4WrW::CalculateGemmSize(const ProblemDescription& problem)
     return std::make_tuple(gemm_m, gemm_n, gemm_k);
 }

-bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ExecutionContext& ctx,
                                               const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4{}))
@@ -613,14 +613,14 @@ bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceImplicitGemmV4R4WrW
-ConvHipImplicitGemmV4R4WrW::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvHipImplicitGemmV4R4WrW::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase(ctx, problem);
 }

 bool ConvHipImplicitGemmV4R4WrW::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmV4R4WrW& config) const
 {
@@ -629,7 +629,7 @@ bool ConvHipImplicitGemmV4R4WrW::IsValidPerformanceConfig(
 }

 PerformanceImplicitGemmV4R4WrW
-ConvHipImplicitGemmV4R4WrW::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmV4R4WrW::Search(const ExecutionContext& ctx,
                                    const ProblemDescription& problem,
                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -637,7 +637,7 @@ ConvHipImplicitGemmV4R4WrW::Search(const ConvolutionContext& ctx,
 }

 ConvSolution
-ConvHipImplicitGemmV4R4WrW::GetSolution(const ConvolutionContext& ctx,
+ConvHipImplicitGemmV4R4WrW::GetSolution(const ExecutionContext& ctx,
                                         const ProblemDescription& problem,
                                         const PerformanceImplicitGemmV4R4WrW& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp
index 070ad3615f..5a42ba3255 100644
--- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp
@@ -122,7 +122,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::SetNextValue(const ProblemDescription
     return true;
 }

-void PerformanceImplicitGemmWrwV4R4Xdlops::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmWrwV4R4Xdlops::HeuristicInit(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem)
 {
     PerformanceImplicitGemmWrwV4R4Xdlops tmp;
@@ -271,7 +271,7 @@ std::tuple PerformanceImplicitGemmWrwV4R4Xdlops::CalculateBlockSize()
 }

 std::tuple
-PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGridSize(const ConvolutionContext& ctx,
+PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGridSize(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
     int GridSize = 0;
@@ -305,7 +305,7 @@ PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGridSize(const ConvolutionContext

 std::tuple
 PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGemmSizeAndGemmKBlock(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int gemm_g = -1;
     int gemm_m = -1;
@@ -622,7 +622,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValidValue() const

 // Used by HeuristicInit() and GenericSearch
 // Only return false if a performance config will violate requirements given by kernel algorithm
-bool PerformanceImplicitGemmWrwV4R4Xdlops::IsReallyValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmWrwV4R4Xdlops::IsReallyValid(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem) const
 {
     if(!IsValidValue())
@@ -689,7 +689,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsReallyValid(const ConvolutionContex
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmWrwV4R4Xdlops::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {

     if(use_spare_set)
@@ -842,7 +842,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsFastToBeUsedForTuning(
 // Return false, if you don't want to this to be included in tuning range used by generic search
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
-bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValid(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem) const
 {
     return IsReallyValid(ctx, problem) && IsFastToBeUsedForTuning(ctx, problem);
@@ -850,7 +850,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValid(const ConvolutionContext& ctx

 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmWrwV4R4Xdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops& config) const
 {
@@ -858,7 +858,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops::IsValidPerformanceConfig(
 }

 PerformanceImplicitGemmWrwV4R4Xdlops ConvHipImplicitGemmWrwV4R4Xdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceImplicitGemmWrwV4R4Xdlops config;
     config.HeuristicInit(ctx, problem);
@@ -867,7 +867,7 @@ PerformanceImplicitGemmWrwV4R4Xdlops ConvHipImplicitGemmWrwV4R4Xdlops::GetDefaul
 }

 ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops& config) const
 {
@@ -1036,7 +1036,7 @@ ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops::GetSolution(
     return result;
 }

-bool ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ExecutionContext& ctx,
                                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_XDLOPS{}))
@@ -1100,7 +1100,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ConvolutionContext& ct
 }

 PerformanceImplicitGemmWrwV4R4Xdlops
-ConvHipImplicitGemmWrwV4R4Xdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmWrwV4R4Xdlops::Search(const ExecutionContext& ctx,
                                          const ProblemDescription& problem,
                                          const AnyInvokeParams& invoke_ctx) const
 {
@@ -1109,7 +1109,7 @@ ConvHipImplicitGemmWrwV4R4Xdlops::Search(const ConvolutionContext& ctx,
 }

 std::size_t
-ConvHipImplicitGemmWrwV4R4Xdlops::GetWorkspaceSize(const ConvolutionContext&,
+ConvHipImplicitGemmWrwV4R4Xdlops::GetWorkspaceSize(const ExecutionContext&,
                                                    const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp
index 7b4295df35..abd178dcca 100644
--- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp
+++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp
@@ -123,7 +123,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::SetNextValue(const Proble
 }

 void PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::HeuristicInit(
-    const ConvolutionContext& ctx, const ProblemDescription& problem)
+    const ExecutionContext& ctx, const ProblemDescription& problem)
 {
     PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm tmp;
     // GemmMFactor GemmNFactor, GemmKTotalFactor are fixed value at this moment.
@@ -270,7 +270,7 @@ std::tuple PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Calculat
 }

 std::tuple PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::CalculateGridSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int GridSize = 0;
@@ -560,7 +560,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValidValue() const
 // Used by HeuristicInit() and GenericSearch
 // Only return false if a performance config will violate requirements given by kernel algorithm
 bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsReallyValid(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(!IsValidValue())
         return false;
@@ -633,7 +633,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsReallyValid(
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     // somehow, 128x128 wave-wise GEMM tend to spill register
     // TODO revisit this when 128x128 wave-wise GEMM become efficient
@@ -798,14 +798,14 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuning(
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
 bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValid(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     return IsReallyValid(ctx, problem) && IsFastToBeUsedForTuning(ctx, problem);
 }

 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm& config) const
 {
@@ -814,7 +814,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValidPerformanceConfig(

 std::tuple
 PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::CalculateGemmSizeAndGemmKBlock(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int gemm_g = -1;
     int gemm_m = -1;
@@ -911,7 +911,7 @@ PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::CalculateGemmSizeAndGemmKBlock

 PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm
 ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm config;
     config.HeuristicInit(ctx, problem);
@@ -920,7 +920,7 @@ ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig(
 }

 ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm& config) const
 {
@@ -1103,7 +1103,7 @@ ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetSolution(
 }

 bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_PADDED_GEMM_XDLOPS{}))
         return false;
@@ -1188,7 +1188,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsApplicable(
 }

 PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm
-ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Search(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem,
                                                      const AnyInvokeParams& invoke_ctx) const
 {
@@ -1197,7 +1197,7 @@ ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Search(const ConvolutionContext& c
 }

 std::size_t ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetWorkspaceSize(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
         return 0;
diff --git a/src/solver/conv_mlir_igemm_bwd.cpp b/src/solver/conv_mlir_igemm_bwd.cpp
index 58787c2532..783c68350c 100644
--- a/src/solver/conv_mlir_igemm_bwd.cpp
+++ b/src/solver/conv_mlir_igemm_bwd.cpp
@@ -37,7 +37,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmBwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmBwd::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -70,13 +70,13 @@ bool ConvMlirIgemmBwd::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemm
-ConvMlirIgemmBwd::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmBwd::GetDefaultPerformanceConfig(const ExecutionContext&,
                                               const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemm::MlirHeuristicInitRequest();
 }

-bool ConvMlirIgemmBwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
+bool ConvMlirIgemmBwd::IsValidPerformanceConfig(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem,
                                                 const PerformanceConvMlirIgemm& config) const
 {
@@ -84,14 +84,14 @@ bool ConvMlirIgemmBwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
     return config.IsValid(ctx, problem);
 }

-PerformanceConvMlirIgemm ConvMlirIgemmBwd::Search(const ConvolutionContext& ctx,
+PerformanceConvMlirIgemm ConvMlirIgemmBwd::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmBwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmBwd::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConvMlirIgemm& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp
index a4a9549db8..41062cc32c 100644
--- a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp
+++ b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp
@@ -38,7 +38,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmBwdXdlops::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -64,14 +64,14 @@ bool ConvMlirIgemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmBwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmBwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                     const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemmXdlops::MlirHeuristicInitRequest();
 }

 bool ConvMlirIgemmBwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -80,14 +80,14 @@ bool ConvMlirIgemmBwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmBwdXdlops::Search(const ConvolutionContext& ctx,
+ConvMlirIgemmBwdXdlops::Search(const ExecutionContext& ctx,
                                const ProblemDescription& problem,
                                const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmBwdXdlops::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmBwdXdlops::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const PerformanceConvMlirIgemmXdlops& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_fwd.cpp b/src/solver/conv_mlir_igemm_fwd.cpp
index c4983852b1..2cc196ae10 100644
--- a/src/solver/conv_mlir_igemm_fwd.cpp
+++ b/src/solver/conv_mlir_igemm_fwd.cpp
@@ -85,7 +85,7 @@ bool PerformanceConvMlirIgemm::operator==(const PerformanceConvMlirIgemm& other)
     // clang-format on
 }

-bool PerformanceConvMlirIgemm::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConvMlirIgemm::IsValid(const ExecutionContext& ctx,
                                        const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -136,13 +136,13 @@ bool PerformanceConvMlirIgemm::SetNextValue(const ProblemDescription&)
 }

 PerformanceConvMlirIgemm
-ConvMlirIgemmFwd::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmFwd::GetDefaultPerformanceConfig(const ExecutionContext&,
                                               const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemm::MlirHeuristicInitRequest();
 }

-bool ConvMlirIgemmFwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
+bool ConvMlirIgemmFwd::IsValidPerformanceConfig(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem,
                                                 const PerformanceConvMlirIgemm& config) const
 {
@@ -150,14 +150,14 @@ bool ConvMlirIgemmFwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
     return config.IsValid(ctx, problem);
 }

-PerformanceConvMlirIgemm ConvMlirIgemmFwd::Search(const ConvolutionContext& ctx,
+PerformanceConvMlirIgemm ConvMlirIgemmFwd::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-bool ConvMlirIgemmFwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmFwd::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -189,7 +189,7 @@ bool ConvMlirIgemmFwd::IsApplicable(const ConvolutionContext& ctx,
 #endif
 }

-ConvSolution ConvMlirIgemmFwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmFwd::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConvMlirIgemm& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
index 8256213623..c761abc137 100644
--- a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
+++ b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
@@ -52,7 +52,7 @@ void PerformanceConvMlirIgemmXdlops::SetMlirHeuristicInitRequest()
     GemmBThreadCopyMoreGemmKPack = false;
 }

-bool ConvMlirIgemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmFwdXdlops::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -125,7 +125,7 @@ bool PerformanceConvMlirIgemmXdlops::operator==(const PerformanceConvMlirIgemmXd
     // clang-format on
 }

-bool PerformanceConvMlirIgemmXdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConvMlirIgemmXdlops::IsValid(const ExecutionContext& ctx,
                                              const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -189,14 +189,14 @@ bool PerformanceConvMlirIgemmXdlops::SetNextValue(const ProblemDescription& prob
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmFwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                     const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemmXdlops::MlirHeuristicInitRequest();
 }

 bool ConvMlirIgemmFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -205,14 +205,14 @@ bool ConvMlirIgemmFwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvMlirIgemmFwdXdlops::Search(const ExecutionContext& ctx,
                                const ProblemDescription& problem,
                                const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmFwdXdlops::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmFwdXdlops::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const PerformanceConvMlirIgemmXdlops& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_wrw.cpp b/src/solver/conv_mlir_igemm_wrw.cpp
index d5f88dcaea..cb9f6ae7b2 100644
--- a/src/solver/conv_mlir_igemm_wrw.cpp
+++ b/src/solver/conv_mlir_igemm_wrw.cpp
@@ -38,7 +38,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmWrW::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmWrW::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -71,13 +71,13 @@ bool ConvMlirIgemmWrW::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemm
-ConvMlirIgemmWrW::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmWrW::GetDefaultPerformanceConfig(const ExecutionContext&,
                                               const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemm::MlirHeuristicInitRequest();
 }

-bool ConvMlirIgemmWrW::IsValidPerformanceConfig(const ConvolutionContext& ctx,
+bool ConvMlirIgemmWrW::IsValidPerformanceConfig(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem,
                                                 const PerformanceConvMlirIgemm& config) const
 {
@@ -85,14 +85,14 @@ bool ConvMlirIgemmWrW::IsValidPerformanceConfig(const ConvolutionContext& ctx,
     return config.IsValid(ctx, problem);
 }

-PerformanceConvMlirIgemm ConvMlirIgemmWrW::Search(const ConvolutionContext& ctx,
+PerformanceConvMlirIgemm ConvMlirIgemmWrW::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmWrW::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmWrW::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConvMlirIgemm& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp
index 2f3bc63f50..fe11c828c8 100644
--- a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp
+++ b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp
@@ -39,7 +39,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW_XDLOPS)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmWrWXdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmWrWXdlops::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -65,14 +65,14 @@ bool ConvMlirIgemmWrWXdlops::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmWrWXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmWrWXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                     const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemmXdlops::MlirHeuristicInitRequest();
 }

 bool ConvMlirIgemmWrWXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -81,14 +81,14 @@ bool ConvMlirIgemmWrWXdlops::IsValidPerformanceConfig(
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmWrWXdlops::Search(const ConvolutionContext& ctx,
+ConvMlirIgemmWrWXdlops::Search(const ExecutionContext& ctx,
                                const ProblemDescription& problem,
                                const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmWrWXdlops::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmWrWXdlops::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -130,7 +130,7 @@ ConvSolution ConvMlirIgemmWrWXdlops::GetSolution(const ConvolutionContext& ctx,
 #endif
 }

-std::size_t ConvMlirIgemmWrWXdlops::GetWorkspaceSize(const ConvolutionContext& ctx,
+std::size_t ConvMlirIgemmWrWXdlops::GetWorkspaceSize(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
diff --git a/src/solver/conv_ocl_dir2D11x11.cpp b/src/solver/conv_ocl_dir2D11x11.cpp
index 145cf9ed13..b76621a591 100644
--- a/src/solver/conv_ocl_dir2D11x11.cpp
+++ b/src/solver/conv_ocl_dir2D11x11.cpp
@@ -36,7 +36,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11)
 namespace miopen {
 namespace solver {

-bool ConvOclDirectFwd11x11::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclDirectFwd11x11::IsApplicable(const ExecutionContext& ctx,
                                          const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11{}))
@@ -65,7 +65,7 @@ bool ConvOclDirectFwd11x11::IsApplicable(const ConvolutionContext& ctx,
            problem.GetKernelStrideH() == 4 && problem.GetKernelStrideW() == 4;
 }

-ConvSolution ConvOclDirectFwd11x11::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd11x11::GetSolution(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp
index eded6fddf8..4e0cda8629 100644
--- a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp
+++ b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp
@@ -39,7 +39,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1)
 namespace miopen {
 namespace solver {

-bool ConvOclBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW1x1::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_266868
@@ -96,7 +96,7 @@ static inline int GetNPasses(const ProblemDescription& problem)
     return n_passes;
 }

-size_t ConvOclBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvOclBwdWrW1x1::GetWorkspaceSize(const ExecutionContext&,
                                           const ProblemDescription& problem) const
 {
     const int n_passes = GetNPasses(problem);
@@ -112,7 +112,7 @@ size_t ConvOclBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
     return 0;
 }

-ConvSolution ConvOclBwdWrW1x1::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclBwdWrW1x1::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
index d4e79250e1..2b400909f8 100644
--- a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
+++ b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
@@ -139,7 +139,7 @@ static bool IsTunable(const ProblemDescription& problem)
             (problem.GetWeightsWidth_() == 1 && problem.GetWeightsHeight_() == 1)));
 }

-bool ConvOclBwdWrW2NonTunable::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW2NonTunable::IsApplicable(const ExecutionContext& ctx,
                                             const ProblemDescription& problem) const
 {
     // At present, auto-tuning is disabled for non-group 3x3 and 1x1 filters for multiple
@@ -148,7 +148,7 @@ bool ConvOclBwdWrW2NonTunable::IsApplicable(const ConvolutionContext& ctx,
     return ConvOclBwdWrW2<1>::IsApplicableBase(ctx, problem) && !IsTunable(problem);
 }

-ConvSolution ConvOclBwdWrW2NonTunable::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclBwdWrW2NonTunable::GetSolution(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem) const
 {
     // Invoking base class GetSolution with default values for params obtained
@@ -230,7 +230,7 @@ static size_t GetNBatchBlks(const ProblemDescription& problem)

 template
 bool PerformanceConfigConvOclBwdWrw2::IsValid(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(!IsValidValue())
     {
@@ -438,7 +438,7 @@ void PerformanceConfigConvOclBwdWrw2::HeuristicInit(

 template
 bool ConvOclBwdWrW2::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigConvOclBwdWrw2& config) const
 {
@@ -446,7 +446,7 @@ bool ConvOclBwdWrW2::IsValidPerformanceConfig(
 }

 template
-bool ConvOclBwdWrW2::IsApplicableBase(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW2::IsApplicableBase(const ExecutionContext& ctx,
                                       const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2{}))
@@ -506,7 +506,7 @@ bool ConvOclBwdWrW2::IsApplicableBase(const ConvolutionContext& c
 }

 template
-bool ConvOclBwdWrW2::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW2::IsApplicable(const ExecutionContext& ctx,
                                   const ProblemDescription& problem) const
 {
     return IsApplicableBase(ctx, problem) && IsTunable(problem);
@@ -514,7 +514,7 @@ bool ConvOclBwdWrW2::IsApplicable(const ConvolutionContext& ctx,

 template
 PerformanceConfigConvOclBwdWrw2
-ConvOclBwdWrW2::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvOclBwdWrW2::GetDefaultPerformanceConfig(const ExecutionContext&,
                                             const ProblemDescription& problem) const
 {
     PerformanceConfigConvOclBwdWrw2 pp;
@@ -523,7 +523,7 @@ ConvOclBwdWrW2::GetDefaultPerformanceConfig(const ConvolutionCont
 }

 template
-size_t ConvOclBwdWrW2::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvOclBwdWrW2::GetWorkspaceSize(const ExecutionContext&,
                                         const ProblemDescription& problem) const
 {
     const size_t n_batch_blks = GetNBatchBlks(problem);
@@ -543,7 +543,7 @@ size_t ConvOclBwdWrW2::GetWorkspaceSize(const ConvolutionContext&

 template
 ConvSolution ConvOclBwdWrW2::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigConvOclBwdWrw2& config) const
 {
@@ -747,7 +747,7 @@ ConvSolution ConvOclBwdWrW2::GetSolution(

 template
 PerformanceConfigConvOclBwdWrw2
-ConvOclBwdWrW2::Search(const ConvolutionContext& ctx,
+ConvOclBwdWrW2::Search(const ExecutionContext& ctx,
                        const ProblemDescription& problem,
                        const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp
index 2db6109bf5..4f00c8f55b 100644
--- a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp
+++ b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp
@@ -40,7 +40,7 @@ namespace solver {
 // problematic configs.
 static bool WorkaroundSwdev168168() { return true; }

-bool ConvOclBwdWrW53::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW53::IsApplicable(const ExecutionContext& ctx,
                                    const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53{}))
@@ -317,7 +317,7 @@ static inline void ComputeNumInputWidthLoops(
     }
 }

-size_t ConvOclBwdWrW53::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvOclBwdWrW53::GetWorkspaceSize(const ExecutionContext&,
                                          const ProblemDescription& problem) const
 {
     int n_stacks = std::min(problem.GetBatchSize_(), 1U);
@@ -339,7 +339,7 @@ size_t ConvOclBwdWrW53::GetWorkspaceSize(const ConvolutionContext&,
     return 0;
 }

-ConvSolution ConvOclBwdWrW53::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclBwdWrW53::GetSolution(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_ocl_dir2Dfwd.cpp b/src/solver/conv_ocl_dir2Dfwd.cpp
index 70005e606e..c7bd8c00df 100644
--- a/src/solver/conv_ocl_dir2Dfwd.cpp
+++ b/src/solver/conv_ocl_dir2Dfwd.cpp
@@ -35,7 +35,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD)
 namespace miopen {
 namespace solver {

-bool ConvOclDirectFwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclDirectFwd::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD{}))
@@ -118,7 +118,7 @@ bool ConvOclDirectFwd::IsApplicable(const ConvolutionContext& ctx,
 /// and some logic from the corresponding opencl kernel source.
 /// The cases which lead to errors can be later omitted from the search.
 /// \todo Get rid the duplication of code where possible.
-bool ConvOclDirectFwd::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvOclDirectFwd::IsValidPerformanceConfig(const ExecutionContext&,
                                                 const ProblemDescription& problem,
                                                 const LegacyPerformanceConfig& config) const
 {
@@ -276,7 +276,7 @@ bool ConvOclDirectFwd::IsValidPerformanceConfig(const ConvolutionContext&,
     return true;
 }

-ConvSolution ConvOclDirectFwd::BaseGetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd::BaseGetSolution(const ExecutionContext& ctx,
                                                const ProblemDescription& problem,
                                                const LegacyPerformanceConfig& config)
 {
@@ -485,7 +485,7 @@ ConvSolution ConvOclDirectFwd::BaseGetSolution(const ConvolutionContext& ctx,
     return result;
 }

-ConvSolution ConvOclDirectFwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const LegacyPerformanceConfig& config) const
 {
diff --git a/src/solver/conv_ocl_dir2Dfwd1x1.cpp b/src/solver/conv_ocl_dir2Dfwd1x1.cpp
index 71dc41fc65..b21effc0b3 100644
--- a/src/solver/conv_ocl_dir2Dfwd1x1.cpp
+++ b/src/solver/conv_ocl_dir2Dfwd1x1.cpp
@@ -38,7 +38,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1)
 namespace miopen {
 namespace solver {

-bool ConvOclDirectFwd1x1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclDirectFwd1x1::IsApplicable(const ExecutionContext& ctx,
                                        const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_271887
@@ -76,7 +76,7 @@ bool ConvOclDirectFwd1x1::IsApplicable(const ConvolutionContext& ctx,
            problem.GetPadW() == 0 && problem.GetPadH() == 0;
 }

-ConvSolution ConvOclDirectFwd1x1::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd1x1::GetSolution(const ExecutionContext& ctx,
                                               const ProblemDescription& problem,
                                               const LegacyPerformanceConfig& config) const
 {
diff --git a/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp b/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp
index 7d5d320b81..5e21bcae07 100644
--- a/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp
+++ b/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp
@@ -54,7 +54,7 @@ namespace solver {
  * select default configuration if a known configuration has not been found.
*/ LegacyPerformanceConfig ConvOclDirectFwdLegacyExhaustiveSearch::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { // LegacyPerformanceConfig result{}; @@ -142,7 +142,7 @@ static int MeasurePerfConfig(const Handle& handle, ConstData_t wei_ocl_buf, ConstData_t bias_ocl_buf, double& processing_time, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const LegacyPerformanceConfig& config) { @@ -213,7 +213,7 @@ static int MeasurePerfConfig(const Handle& handle, } LegacyPerformanceConfig -ConvOclDirectFwdLegacyExhaustiveSearch::Search(const ConvolutionContext& ctx, +ConvOclDirectFwdLegacyExhaustiveSearch::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -231,7 +231,7 @@ ConvOclDirectFwdLegacyExhaustiveSearch::Search(const ConvolutionContext& ctx, template LegacyPerformanceConfig -ConvOclDirectFwdLegacyExhaustiveSearch::SearchImpl(const ConvolutionContext& ctx, +ConvOclDirectFwdLegacyExhaustiveSearch::SearchImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { diff --git a/src/solver/conv_ocl_dir2Dfwdgen.cpp b/src/solver/conv_ocl_dir2Dfwdgen.cpp index 80fb0708e1..f35e57b71c 100644 --- a/src/solver/conv_ocl_dir2Dfwdgen.cpp +++ b/src/solver/conv_ocl_dir2Dfwdgen.cpp @@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN) namespace miopen { namespace solver { -bool ConvOclDirectFwdGen::IsApplicable(const ConvolutionContext& ctx, +bool ConvOclDirectFwdGen::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN{})) @@ -97,7 +97,7 @@ bool ConvOclDirectFwdGen::IsApplicable(const ConvolutionContext& ctx, && (problem.GetKernelStrideW() > 1 || problem.GetKernelStrideH() > 1))); // clang-format on } -ConvSolution ConvOclDirectFwdGen::GetSolution(const ConvolutionContext& ctx, +ConvSolution ConvOclDirectFwdGen::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const { int n_in_stacks = 0; diff --git a/src/solver/conv_winoRxS.cpp b/src/solver/conv_winoRxS.cpp index cdff22ac1d..d9cbeb713f 100644 --- a/src/solver/conv_winoRxS.cpp +++ b/src/solver/conv_winoRxS.cpp @@ -302,7 +302,7 @@ PerformanceConfigConvBinWinogradRxS::PerformanceConfigConvBinWinogradRxS(int n_g } template -void PerformanceConfigConvBinWinogradRxS::HeuristicInit(const ConvolutionContext& ctx, +void PerformanceConfigConvBinWinogradRxS::HeuristicInit(const ExecutionContext& ctx, const ProblemDescription& problem) { const auto n_inputs_per_group = problem.GetInChannels_() / problem.GetGroupCount(), @@ -365,7 +365,7 @@ bool PerformanceConfigConvBinWinogradRxS::IsValidValue() const return PerfFieldRules().IsIn(*this); } -bool PerformanceConfigConvBinWinogradRxS::IsValid(const ConvolutionContext& ctx) const +bool PerformanceConfigConvBinWinogradRxS::IsValid(const ExecutionContext& ctx) const { if(ctx.GetStream().GetMaxHardwareComputeUnits() < n_groups) return false; @@ -384,7 +384,7 @@ bool PerformanceConfigConvBinWinogradRxS::operator==( template PerformanceConfigConvBinWinogradRxS ConvBinWinoRxS::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { PerformanceConfigConvBinWinogradRxS pp; 
pp.HeuristicInit(ctx, problem); @@ -394,7 +394,7 @@ ConvBinWinoRxS::GetDefaultPerformanceConfig( template bool ConvBinWinoRxS::IsValidPerformanceConfig( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription&, const PerformanceConfigConvBinWinogradRxS& config) const { @@ -403,7 +403,7 @@ bool ConvBinWinoRxS::IsValidPerformanceConfig( template PerformanceConfigConvBinWinogradRxS -ConvBinWinoRxS::Search(const ConvolutionContext& ctx, +ConvBinWinoRxS::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -436,7 +436,7 @@ class ShaderModel : public UnifiedDescriptionConv2d bool out_of_model_scope; // Shader model produces unreliable results. public: - ShaderModel(const ConvolutionContext& ctx, + ShaderModel(const ExecutionContext& ctx, const ProblemDescription& problem, size_t Winodata, size_t Winofilter) @@ -610,7 +610,7 @@ class ShaderModel : public UnifiedDescriptionConv2d }; template -static float GetWtiBase(const ConvolutionContext& ctx, const ProblemDescription& problem) +static float GetWtiBase(const ExecutionContext& ctx, const ProblemDescription& problem) { constexpr auto WTI_UNKNOWN = -2.0; const auto rv = ShaderModel(ctx, problem, Winodata, Winofilter).ComputeWti(); @@ -618,7 +618,7 @@ static float GetWtiBase(const ConvolutionContext& ctx, const ProblemDescription& } template -static bool IsApplicableBase(const ConvolutionContext& ctx, const ProblemDescription& problem) +static bool IsApplicableBase(const ExecutionContext& ctx, const ProblemDescription& problem) { if(!problem.Is2d()) return false; @@ -694,7 +694,7 @@ static bool IsApplicableBase(const ConvolutionContext& ctx, const ProblemDescrip } template -bool ConvBinWinoRxS::IsApplicable(const ConvolutionContext& ctx, +bool ConvBinWinoRxS::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(IS2X3) @@ -716,7 +716,7 @@ bool ConvBinWinoRxS::IsApplicable(const ConvolutionContext template static inline boost::optional -GetPerfConfFromEnv(const ConvolutionContext& ctx) +GetPerfConfFromEnv(const ExecutionContext& ctx) { PerformanceConfigConvBinWinogradRxS fromEnv; std::string s; @@ -752,7 +752,7 @@ GetPerfConfFromEnv(const ConvolutionContext& ctx) template ConvSolution ConvBinWinoRxS::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceConfigConvBinWinogradRxS& config) const { @@ -1075,7 +1075,7 @@ ConvSolution ConvBinWinoRxS::GetSolution( return result; } -bool ConvBinWinogradRxSf2x3g1::IsApplicable(const ConvolutionContext& ctx, +bool ConvBinWinogradRxSf2x3g1::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1{})) @@ -1083,13 +1083,13 @@ bool ConvBinWinogradRxSf2x3g1::IsApplicable(const ConvolutionContext& ctx, return IsApplicableBase<2, 3>(ctx, problem) && problem.GetGroupCount() == 1; } -float ConvBinWinogradRxSf2x3g1::GetWti(const ConvolutionContext& ctx, +float ConvBinWinogradRxSf2x3g1::GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetWtiBase<2, 3>(ctx, problem); } -ConvSolution ConvBinWinogradRxSf2x3g1::GetSolution(const ConvolutionContext& ctx, +ConvSolution ConvBinWinogradRxSf2x3g1::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const { const auto tunable = ConvBinWinoRxS<2, 3>{}; diff --git a/src/solver/conv_wino_fury_RxS.cpp 
b/src/solver/conv_wino_fury_RxS.cpp index 12c9639a66..89f870e35e 100644 --- a/src/solver/conv_wino_fury_RxS.cpp +++ b/src/solver/conv_wino_fury_RxS.cpp @@ -165,7 +165,7 @@ class ShaderModel : public UnifiedDescriptionConv2d } // namespace template -bool ConvWinoFuryRxS::IsApplicable(const ConvolutionContext& ctx, +bool ConvWinoFuryRxS::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(!problem.Is2d()) @@ -195,7 +195,7 @@ bool ConvWinoFuryRxS::IsApplicable(const ConvolutionContex } template -float ConvWinoFuryRxS::GetWti(const ConvolutionContext& ctx, +float ConvWinoFuryRxS::GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const { auto n_groups = ctx.GetStream().GetMaxHardwareComputeUnits(); @@ -204,7 +204,7 @@ float ConvWinoFuryRxS::GetWti(const ConvolutionContext& ct template ConvSolution -ConvWinoFuryRxS::GetSolution(const ConvolutionContext& ctx, +ConvWinoFuryRxS::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const { // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp index eaaa0e42c3..4101db86f9 100644 --- a/src/solver/mlir_common.cpp +++ b/src/solver/mlir_common.cpp @@ -121,7 +121,7 @@ static std::string GetOperation(const ProblemDescription& problem) /* Construct the options string passed to MLIR to cause it to generate a given convolution.*/ -std::string ConstructBuildOptions(const ConvolutionContext& ctx, +std::string ConstructBuildOptions(const ExecutionContext& ctx, const ProblemDescription& problem, bool is_xdlops, int kernel_id) diff --git a/test/conv_common.hpp b/test/conv_common.hpp index 3d510bb21b..99f964a0f6 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -86,7 +86,7 @@ static inline bool is_direct_fwd_bwd_data_supported(miopen::Handle& handle, (dir == miopen::conv::Direction::Forward) ? 
miopen::conv::ProblemDescription{xDesc, wDesc, yDesc, convDesc, dir} : miopen::conv::ProblemDescription{yDesc, wDesc, xDesc, convDesc, dir}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.do_search = false; ctx.save_srch_req = false; ctx.disable_perfdb_access = true; @@ -110,7 +110,7 @@ static inline bool is_direct_bwd_wrw_supported(miopen::Handle& handle, const auto problem = miopen::conv::ProblemDescription{ yDesc, wDesc, xDesc, convDesc, miopen::conv::Direction::BackwardWeights}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.do_search = false; ctx.save_srch_req = false; @@ -136,7 +136,7 @@ static inline bool skip_config(miopen::Handle& handle, const auto conv_problem = miopen::conv::ProblemDescription{ xDesc, wDesc, yDesc, convDesc, miopen::conv::Direction::Forward}; const auto problem = miopen::ProblemDescription{conv_problem}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.do_search = false; ctx.save_srch_req = false; diff --git a/test/embed_sqlite.cpp b/test/embed_sqlite.cpp index dfb663802e..32aa1371a8 100644 --- a/test/embed_sqlite.cpp +++ b/test/embed_sqlite.cpp @@ -31,7 +31,7 @@ #if MIOPEN_EMBED_DB #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ struct EmbedSQLite : test_driver const auto conv_problem = miopen::conv::ProblemDescription{ x.desc, w.desc, y.desc, filter, miopen::conv::Direction::Forward}; const auto problem = miopen::ProblemDescription{conv_problem}; - miopen::ConvolutionContext ctx{}; + miopen::ExecutionContext ctx{}; ctx.SetStream(&handle); // Check PerfDb { diff --git a/test/gpu_conv.hpp b/test/gpu_conv.hpp index 240b191028..3165ace5fc 100644 --- a/test/gpu_conv.hpp +++ b/test/gpu_conv.hpp @@ -89,7 +89,7 @@ bool gpu_ref_convolution_fwd(const tensor& input, input.desc, in_dev.get(), weights.desc, wei_dev.get(), rout.desc, out_dev.get()}; const auto problem = miopen::conv::ProblemDescription{ input.desc, weights.desc, rout.desc, filter, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); if(naive_solver.IsApplicable(ctx, problem)) { @@ -125,7 +125,7 @@ bool gpu_ref_convolution_bwd(tensor& input, output.desc, out_dev.get(), weights.desc, wei_dev.get(), input.desc, in_dev.get()}; const auto problem = miopen::conv::ProblemDescription{ output.desc, weights.desc, input.desc, filter, miopen::conv::Direction::BackwardData}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); if(naive_solver.IsApplicable(ctx, problem)) { @@ -165,7 +165,7 @@ bool gpu_ref_convolution_wrw(const tensor& input, input.desc, filter, miopen::conv::Direction::BackwardWeights}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); if(naive_solver.IsApplicable(ctx, problem)) { diff --git a/test/gtest/db_sync.cpp b/test/gtest/db_sync.cpp index e6a4288e1f..d7e59468a2 100644 --- a/test/gtest/db_sync.cpp +++ b/test/gtest/db_sync.cpp @@ -29,7 +29,7 @@ #include #include "get_handle.hpp" #include -#include +#include #include #include @@ -402,7 +402,7 @@ TEST(DBSync, DISABLED_DynamicFDBSync) std::unordered_map checked_kdbs; auto& handle = get_handle(); - auto _ctx = miopen::ConvolutionContext{}; + auto _ctx = miopen::ExecutionContext{}; _ctx.SetStream(&handle); for(const auto& kinder : find_db.GetCacheMap()) @@ -458,7 +458,7 @@ TEST(DbSync, DISABLED_StaticFDBSync) 
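Every test hunk in this part of the patch repeats the same two-line scaffold: default-construct the context, then attach the stream. Since solver applicability checks only need the device stream plus the problem, the scaffold could be factored into a helper; a minimal sketch, not part of the patch:

inline miopen::ExecutionContext MakeTestContext(miopen::Handle& handle)
{
    auto ctx = miopen::ExecutionContext{};
    ctx.SetStream(&handle);
    return ctx;
}

// Usage matching the hunks above:
//   auto ctx = MakeTestContext(get_handle());
//   if(naive_solver.IsApplicable(ctx, problem)) { ... }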
std::unordered_map checked_kdbs; auto& handle = get_handle(); - auto _ctx = miopen::ConvolutionContext{}; + auto _ctx = miopen::ExecutionContext{}; _ctx.SetStream(&handle); size_t cnt_finddb_entry = 0; for(const auto& kinder : find_db.GetCacheMap()) diff --git a/test/gtest/group_conv3d_bwd.cpp b/test/gtest/group_conv3d_bwd.cpp index 8e794749dd..e53a690021 100644 --- a/test/gtest/group_conv3d_bwd.cpp +++ b/test/gtest/group_conv3d_bwd.cpp @@ -56,7 +56,7 @@ void SolverBwd(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::BackwardData}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/group_conv3d_fwd.cpp b/test/gtest/group_conv3d_fwd.cpp index 876f513258..2b52a1b43a 100644 --- a/test/gtest/group_conv3d_fwd.cpp +++ b/test/gtest/group_conv3d_fwd.cpp @@ -56,7 +56,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/group_conv3d_wrw.cpp b/test/gtest/group_conv3d_wrw.cpp index 0fae3402d8..13e88da5ad 100644 --- a/test/gtest/group_conv3d_wrw.cpp +++ b/test/gtest/group_conv3d_wrw.cpp @@ -57,7 +57,7 @@ void SolverWrw(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::BackwardWeights}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/group_conv_fwd.cpp b/test/gtest/group_conv_fwd.cpp index e5a1dbbea9..c8fdec4cae 100644 --- a/test/gtest/group_conv_fwd.cpp +++ b/test/gtest/group_conv_fwd.cpp @@ -56,7 +56,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/kernel_tuning_net.cpp b/test/gtest/kernel_tuning_net.cpp index 0a8885486d..65c7977944 100644 --- a/test/gtest/kernel_tuning_net.cpp +++ b/test/gtest/kernel_tuning_net.cpp @@ -93,7 +93,7 @@ void TestParameterPredictionModel(miopen::ProblemDescription problem, auto&& handle = get_handle(); if(handle.GetDeviceName() != "gfx908") GTEST_SKIP(); - miopen::ConvolutionContext ctx; + miopen::ExecutionContext ctx; ctx.SetStream(&handle); T perf_config; bool valid = false; diff --git a/test/gtest/solver_bwd.hpp b/test/gtest/solver_bwd.hpp index 728c38fcda..febc35ae01 100644 --- a/test/gtest/solver_bwd.hpp +++ b/test/gtest/solver_bwd.hpp @@ -59,9 +59,9 @@ struct ConvBwdSolverTest output.desc, conv_desc, miopen::conv::Direction::BackwardData); - const auto problem = miopen::ProblemDescription{conv_problem}; - const miopen::ConvolutionContext ctx = [&] { - auto tmp = miopen::ConvolutionContext{&handle}; + const auto problem = miopen::ProblemDescription{conv_problem}; + const miopen::ExecutionContext ctx = [&] { + auto tmp = miopen::ExecutionContext{&handle}; problem.conv_problem.SetupFloats(tmp); return tmp; }(); diff --git a/test/gtest/solver_fwd.hpp b/test/gtest/solver_fwd.hpp index ac30ad15ca..a41667d5fc 100644 --- a/test/gtest/solver_fwd.hpp +++ 
b/test/gtest/solver_fwd.hpp @@ -62,8 +62,8 @@ struct ConvFwdSolverTest this->output.desc, this->conv_desc, miopen::conv::Direction::Forward}); - const miopen::ConvolutionContext ctx = [&] { - auto tmp = miopen::ConvolutionContext{&handle}; + const miopen::ExecutionContext ctx = [&] { + auto tmp = miopen::ExecutionContext{&handle}; problem.conv_problem.SetupFloats(tmp); return tmp; }(); diff --git a/test/gtest/solver_wrw.hpp b/test/gtest/solver_wrw.hpp index 51f078fd21..6adba65bf6 100644 --- a/test/gtest/solver_wrw.hpp +++ b/test/gtest/solver_wrw.hpp @@ -59,8 +59,8 @@ struct ConvWrwSolverTest input.desc, conv_desc, miopen::conv::Direction::BackwardWeights}); - const miopen::ConvolutionContext ctx = [&] { - auto tmp = miopen::ConvolutionContext{&handle}; + const miopen::ExecutionContext ctx = [&] { + auto tmp = miopen::ExecutionContext{&handle}; problem.conv_problem.SetupFloats(tmp); return tmp; }(); diff --git a/test/gtest/tuna_net.cpp b/test/gtest/tuna_net.cpp index 288c4f0068..6eff96daeb 100644 --- a/test/gtest/tuna_net.cpp +++ b/test/gtest/tuna_net.cpp @@ -91,7 +91,7 @@ void TestSolverPredictionModel(miopen::ProblemDescription& problem, std::size_t std::string device = handle.GetDeviceName(); if(device != "gfx908") GTEST_SKIP(); - miopen::ConvolutionContext ctx; + miopen::ExecutionContext ctx; ctx.SetStream(&handle); std::vector solvers = miopen::ai::immed_mode::PredictSolver(problem, ctx, device); std::size_t solver = diff --git a/test/solver.cpp b/test/solver.cpp index 3777daf6ae..d61524a29e 100644 --- a/test/solver.cpp +++ b/test/solver.cpp @@ -49,12 +49,12 @@ class TrivialTestSolver final : public solver::ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext&, const ProblemDescription& problem) const override { return problem.GetInWidth_() == 1; } - solver::ConvSolution GetSolution(const ConvolutionContext&, + solver::ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override { solver::ConvSolution ret; @@ -88,12 +88,12 @@ class SearchableTestSolver final : public solver::ConvTunableSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override { return true; } - TestConfig GetDefaultPerformanceConfig(const ConvolutionContext&, + TestConfig GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override { TestConfig config{}; @@ -101,14 +101,14 @@ class SearchableTestSolver final : public solver::ConvTunableSolver return config; } - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const TestConfig&) const override { return true; } - TestConfig Search(const ConvolutionContext&, + TestConfig Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams&) const override { @@ -118,7 +118,7 @@ class SearchableTestSolver final : public solver::ConvTunableSolver return config; } - solver::ConvSolution GetSolution(const ConvolutionContext&, + solver::ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const TestConfig& config) const override { @@ -140,7 +140,7 @@ class SearchableTestSolver final : public solver::ConvTunableSolver // 
NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) int SearchableTestSolver::_serches_done = 0; -static solver::ConvSolution FindSolution(const ConvolutionContext& ctx, +static solver::ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& db_path) { @@ -175,29 +175,29 @@ class SolverTest ConstructTest(db_path, TrivialTestSolver::FileName(), {1, 1, 1, 1}, - [](ConvolutionContext& c) { c.do_search = true; }); + [](ExecutionContext& c) { c.do_search = true; }); ConstructTest(db_path, SearchableTestSolver::NoSearchFileName(), {1, 1, 1, 2}, - [](ConvolutionContext& c) { c.do_search = false; }); + [](ExecutionContext& c) { c.do_search = false; }); ConstructTest(db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, - [](ConvolutionContext& c) { c.do_search = true; }); + [](ExecutionContext& c) { c.do_search = true; }); const auto& searchable_solver = StaticContainer::Instance(); const auto searches = SearchableTestSolver::searches_done(); // Should read in both cases: result is already in DB, solver is searchable. ConstructTest( - db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, [](ConvolutionContext&) {}); + db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, [](ExecutionContext&) {}); ConstructTest(db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, - [](ConvolutionContext& c) { c.do_search = true; }); + [](ExecutionContext& c) { c.do_search = true; }); // Checking no more searches were done. EXPECT_EQUAL(searches, searchable_solver.searches_done()); @@ -208,14 +208,14 @@ class SolverTest const std::string& db_path, const char* expected_kernel, const std::initializer_list& in, - const std::function& context_filler = [](ConvolutionContext&) {}) + const std::function& context_filler = [](ExecutionContext&) {}) { const auto problem = conv::ProblemDescription{TensorDescriptor{miopenFloat, in}, TensorDescriptor{miopenFloat, in}, TensorDescriptor{miopenFloat, in}, ConvolutionDescriptor{}, conv::Direction::Forward}; - auto ctx = ConvolutionContext{}; + auto ctx = ExecutionContext{}; ctx.SetStream(&get_handle()); context_filler(ctx); From 55f9999d2bdef79c38bc238872930aa37f50084a Mon Sep 17 00:00:00 2001 From: JD Date: Mon, 25 Sep 2023 21:43:03 -0500 Subject: [PATCH 10/36] [Jenkins][CI] clean workspace after each stage (#2412) --- Jenkinsfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 9d10064b59..433f4e9622 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -309,6 +309,7 @@ def reboot(){ def buildHipClangJobAndReboot(Map conf=[:]){ try{ buildHipClangJob(conf) + cleanWs() } catch(e){ echo "throwing error exception for the stage" @@ -362,6 +363,7 @@ def RunPerfTest(Map conf=[:]){ catch (Exception err){ currentBuild.result = 'SUCCESS' } + cleanWs() } } } From d2909a6f6b7607b4b7cbb23b62eb8ef63dc508ab Mon Sep 17 00:00:00 2001 From: xinlipn Date: Mon, 25 Sep 2023 23:30:28 -0700 Subject: [PATCH 11/36] [tests] convert test_conv_igemm_mlir_fwd to gTest (#2291) --- test/CMakeLists.txt | 42 ------- test/gtest/CMakeLists.txt | 2 +- test/gtest/conv_igemm_dynamic.cpp | 189 ++++++++++++++++++++++++++++++ 3 files changed, 190 insertions(+), 43 deletions(-) create mode 100644 test/gtest/conv_igemm_dynamic.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9f6432ec6d..d2d8a83436 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1218,48 +1218,6 @@ set(DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS ${DYNAMIC_IMPLICITGEMM_COMMON} 
MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC) -if(${CODECOV_TEST}) - add_custom_test(test_conv_igemm_dynamic_small GFX908_DISABLED GFX90A_DISABLED SKIP_XNACK_ON - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights --disable-validation - COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data --disable-validation - COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --disable-validation - ) - set_tests_properties(test_conv_igemm_dynamic_small PROPERTIES COST 800) -else() - add_custom_test(test_conv_igemm_dynamic_small GFX908_DISABLED GFX90A_DISABLED SKIP_XNACK_ON - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 16 16 56 56 --weights 64 16 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 16 64 34 34 --weights 64 64 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 16 384 8 8 --weights 64 384 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data - COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data - COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights - ) -endif() #if CODECOV_TEST - -add_custom_test(test_conv_igemm_dynamic SKIP_UNLESS_ALL GFX908_DISABLED GFX90A_DISABLED SKIP_XNACK_ON -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 256 34 34 --weights 256 256 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 128 48 7 7 --weights 128 48 5 5 --pads_strides_dilations 2 2 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 128 128 17 17 --weights 128 128 1 7 --pads_strides_dilations 0 3 1 1 1 1 
--disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 128 256 28 28 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 128 768 17 17 --weights 128 768 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 512 28 28 --weights 256 512 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 512 14 14 --weights 256 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -) - # gfx90a is disabled due to WORKAROUND_ISSUE_1187 add_custom_test(test_conv_igemm_dynamic_xdlops_bwd SKIP_UNLESS_ALL HALF_ENABLED GFX90A_DISABLED GFX94X_ENABLED GFX900_DISABLED GFX906_DISABLED SKIP_XNACK_ON COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 61966d1888..4b7f2aaf08 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -35,7 +35,7 @@ function(add_gtest TEST_NAME) target_link_libraries(test_${TEST_NAME} gtest_main MIOpen ${Boost_LIBRARIES} hip::host $) endif() # Enable CMake to discover the test binary - gtest_discover_tests(test_${TEST_NAME} PROPERTIES ENVIRONMENT "MIOPEN_USER_DB_PATH=${CMAKE_CURRENT_BINARY_DIR};MIOPEN_TEST_ALL=${MIOPEN_TEST_ALL};MIOPEN_TEST_MLIR=${MIOPEN_TEST_MLIR};MIOPEN_TEST_COMPOSABLEKERNEL=${MIOPEN_TEST_COMPOSABLEKERNEL}") + gtest_discover_tests(test_${TEST_NAME} PROPERTIES ENVIRONMENT 
"MIOPEN_USER_DB_PATH=${CMAKE_CURRENT_BINARY_DIR};MIOPEN_TEST_FLOAT_ARG=${MIOPEN_TEST_FLOAT_ARG};MIOPEN_TEST_ALL=${MIOPEN_TEST_ALL};MIOPEN_TEST_MLIR=${MIOPEN_TEST_MLIR};MIOPEN_TEST_COMPOSABLEKERNEL=${MIOPEN_TEST_COMPOSABLEKERNEL}") endif() endfunction() diff --git a/test/gtest/conv_igemm_dynamic.cpp b/test/gtest/conv_igemm_dynamic.cpp new file mode 100644 index 0000000000..25a4e179c5 --- /dev/null +++ b/test/gtest/conv_igemm_dynamic.cpp @@ -0,0 +1,189 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include + +#include +#include +#include +#include "../conv2d.hpp" +#include "get_handle.hpp" + +using TestCase = std::tuple, std::string>; + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_TEST_GPU_XNACK_ENABLED) + +static bool SkipTest(void) { return miopen::IsEnabled(MIOPEN_TEST_GPU_XNACK_ENABLED{}); } + +void GetArgs(const TestCase& param, std::vector& tokens) +{ + auto env_vars = std::get<0>(param); + for(auto& elem : env_vars) + { + putenv(elem.data()); + } + + auto cmd = std::get<1>(param); + + std::stringstream ss(cmd); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} + +class Conv2dFloat : public testing::TestWithParam> +{ +}; + +void Run2dDriver(miopenDataType_t prec) +{ + + std::vector params; + switch(prec) + { + case miopenFloat: params = Conv2dFloat::GetParam(); break; + case miopenHalf: + case miopenInt8: + case miopenBFloat16: + case miopenInt8x4: + case miopenInt32: + case miopenDouble: + case miopenFloat8: + case miopenBFloat8: + FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, " + "miopenDouble, miopenFloat8, miopenBFloat8 " + "data type not supported by conv_igemm_dynamic test"; + + default: params = Conv2dFloat::GetParam(); + } + + for(const auto& test_value : params) + { + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), + tokens.end(), + std::back_inserter(ptrs), + [](const std::string& str) { return str.data(); }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +}; + +bool IsTestSupportedForDevice(const miopen::Handle& handle) +{ + std::string devName 
= handle.GetDeviceName(); + if(devName == "gfx900" || devName == "gfx906") + return true; + else + return false; +} + +TEST_P(Conv2dFloat, FloatTest) +{ + const auto& handle = get_handle(); + if(IsTestSupportedForDevice(handle) && !SkipTest()) + { + Run2dDriver(miopenFloat); + } + else + { + GTEST_SKIP(); + } +}; + +std::vector GetTestCases(const std::string& precision) +{ + + std::vector env = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicFwd"}; + std::vector env_1x1 = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicFwd_1x1"}; + std::vector env_wrw = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicWrw"}; + std::vector env_bwd = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicBwd"}; + + std::string v = " --verbose"; + std::string dis_bk_data = " --disable-backward-data"; + std::string dis_bk_wei = " --disable-backward-weights"; + std::string dis_fwd = " --disable-forward"; + std::string dis_vali = " --disable-validation"; + + const std::vector test_cases = { + // clang-format off +#if CODECOV_TEST + TestCase{env, precision + v + " --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1" + dis_bk_data + dis_bk_wei + dis_vali}, + TestCase{env_wrw, precision + v + " --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data + dis_vali}, + TestCase{env_bwd, precision + v + " --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei + dis_vali}, +#else + TestCase{env, precision + v + " --input 16 16 56 56 --weights 64 16 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 16 64 34 34 --weights 64 64 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 16 384 8 8 --weights 64 384 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_wrw, precision + v + " --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_bwd, precision + v + " --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, +#endif + +#if MIOPEN_TEST_ALL + //SKIP_UNLESS_ALL + TestCase{env, precision + v + " --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 64 256 34 34 --weights 256 256 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 0 0 2 2 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 128 48 7 7 --weights 128 48 5 5 --pads_strides_dilations 2 2 1 1 1 1" + dis_bk_data + 
dis_bk_wei}, + TestCase{env, precision + v + " --input 128 128 17 17 --weights 128 128 1 7 --pads_strides_dilations 0 3 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 128 256 28 28 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 128 768 17 17 --weights 128 768 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_wrw, precision + v + " --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 64 512 28 28 --weights 256 512 1 1 --pads_strides_dilations 0 0 2 2 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 64 512 14 14 --weights 256 512 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_bwd, precision + v + " --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei} +#endif + // clang-format on + }; + return test_cases; +} + +INSTANTIATE_TEST_SUITE_P(ConvIgemmDynamic, Conv2dFloat, testing::Values(GetTestCases("--float"))); From 1e325a7bebb11c629e39808008b67ffed69ee486 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Mon, 25 Sep 2023 23:15:09 -0700 Subject: [PATCH 12/36] Revert "cmake: enable finding installed ZStd library (#2362)" This reverts commit e608b4325646afeabb5e52846997b926d2019d19. --- cmake/Findzstd.cmake | 91 -------------------------------------------- src/CMakeLists.txt | 5 --- 2 files changed, 96 deletions(-) delete mode 100644 cmake/Findzstd.cmake diff --git a/cmake/Findzstd.cmake b/cmake/Findzstd.cmake deleted file mode 100644 index 43ea6f9b40..0000000000 --- a/cmake/Findzstd.cmake +++ /dev/null @@ -1,91 +0,0 @@ -################################################################################ -# -# MIT License -# -# Copyright (c) 2023 Advanced Micro Devices, Inc. 
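Two details of the conv_igemm_dynamic test added above are easy to miss. First, Conv2dFloat is parameterized over the entire std::vector of TestCase values, so testing::Values(GetTestCases("--float")) registers a single test instance whose body loops over every configuration, rather than one gtest case per configuration. Second, GetArgs applies the per-case environment with putenv(elem.data()) on a local copy of the strings; putenv(3) typically stores the caller's pointer rather than copying, so on such platforms a copying setter is the safer pattern. A hedged sketch of the copying alternative (assumes a POSIX environment and "KEY=VALUE" input; the helper name is illustrative):

#include <cstdlib>
#include <string>

inline void SetEnvVar(const std::string& kv)
{
    const auto eq = kv.find('=');
    if(eq == std::string::npos)
        return; // not a KEY=VALUE pair
    // setenv(3) copies both arguments, so temporaries are safe here.
    setenv(kv.substr(0, eq).c_str(), kv.substr(eq + 1).c_str(), 1 /*overwrite*/);
}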
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -# Try to find the zstd library -# -# If successful, the following variables will be defined: -# zstd_INCLUDE_DIR -# zstd_LIBRARY -# zstd_STATIC_LIBRARY -# zstd_FOUND -# -# Additionally, one of the following import targets will be defined: -# zstd::libzstd_shared -# zstd::libzstd_static - -if(MSVC) - set(zstd_STATIC_LIBRARY_SUFFIX "_static\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") -else() - set(zstd_STATIC_LIBRARY_SUFFIX "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") -endif() - -find_path(zstd_INCLUDE_DIR NAMES zstd.h) -find_library(zstd_LIBRARY NAMES zstd zstd_static) -find_library(zstd_STATIC_LIBRARY NAMES - zstd_static - "${CMAKE_STATIC_LIBRARY_PREFIX}zstd${CMAKE_STATIC_LIBRARY_SUFFIX}") - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - zstd DEFAULT_MSG - zstd_LIBRARY zstd_INCLUDE_DIR -) - -if(zstd_FOUND) - if(zstd_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$") - set(zstd_STATIC_LIBRARY "${zstd_LIBRARY}") - elseif (NOT TARGET zstd::libzstd_shared) - add_library(zstd::libzstd_shared SHARED IMPORTED) - if(MSVC) - # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library". 
- get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY) - string(REGEX REPLACE "${CMAKE_INSTALL_LIBDIR}$" "${CMAKE_INSTALL_BINDIR}" zstd_DIRNAME "${zstd_DIRNAME}") - get_filename_component(zstd_BASENAME "${zstd_LIBRARY}" NAME) - string(REGEX REPLACE "\\${CMAKE_LINK_LIBRARY_SUFFIX}$" "${CMAKE_SHARED_LIBRARY_SUFFIX}" zstd_BASENAME "${zstd_BASENAME}") - set_target_properties(zstd::libzstd_shared PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" - IMPORTED_LOCATION "${zstd_DIRNAME}/${zstd_BASENAME}" - IMPORTED_IMPLIB "${zstd_LIBRARY}") - unset(zstd_DIRNAME) - unset(zstd_BASENAME) - else() - set_target_properties(zstd::libzstd_shared PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" - IMPORTED_LOCATION "${zstd_LIBRARY}") - endif() - endif() - if(zstd_STATIC_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$" AND - NOT TARGET zstd::libzstd_static) - add_library(zstd::libzstd_static STATIC IMPORTED) - set_target_properties(zstd::libzstd_static PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" - IMPORTED_LOCATION "${zstd_STATIC_LIBRARY}") - endif() -endif() - -unset(zstd_STATIC_LIBRARY_SUFFIX) - -mark_as_advanced(zstd_INCLUDE_DIR zstd_LIBRARY zstd_STATIC_LIBRARY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffed2b4c8..f604a98c51 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -682,11 +682,6 @@ rocm_set_soversion(MIOpen ${MIOpen_SOVERSION}) clang_tidy_check(MIOpen) -find_package(zstd) -if(zstd_FOUND) - target_link_libraries(MIOpen PRIVATE zstd::libzstd_static) -endif() - function(target_internal_library TARGET) target_link_libraries(${TARGET} PRIVATE ${ARGN}) set(PASS_ARGS debug optimized) From 839249db9286fd6ec102ae5f9bb93d7f9a7cd0b2 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Tue, 26 Sep 2023 10:13:32 -0700 Subject: [PATCH 13/36] Revert "Revert "cmake: enable finding installed ZStd library (#2362)"" This reverts commit 1e325a7bebb11c629e39808008b67ffed69ee486. --- cmake/Findzstd.cmake | 91 ++++++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 5 +++ 2 files changed, 96 insertions(+) create mode 100644 cmake/Findzstd.cmake diff --git a/cmake/Findzstd.cmake b/cmake/Findzstd.cmake new file mode 100644 index 0000000000..43ea6f9b40 --- /dev/null +++ b/cmake/Findzstd.cmake @@ -0,0 +1,91 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +################################################################################ + +# Try to find the zstd library +# +# If successful, the following variables will be defined: +# zstd_INCLUDE_DIR +# zstd_LIBRARY +# zstd_STATIC_LIBRARY +# zstd_FOUND +# +# Additionally, one of the following import targets will be defined: +# zstd::libzstd_shared +# zstd::libzstd_static + +if(MSVC) + set(zstd_STATIC_LIBRARY_SUFFIX "_static\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") +else() + set(zstd_STATIC_LIBRARY_SUFFIX "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") +endif() + +find_path(zstd_INCLUDE_DIR NAMES zstd.h) +find_library(zstd_LIBRARY NAMES zstd zstd_static) +find_library(zstd_STATIC_LIBRARY NAMES + zstd_static + "${CMAKE_STATIC_LIBRARY_PREFIX}zstd${CMAKE_STATIC_LIBRARY_SUFFIX}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + zstd DEFAULT_MSG + zstd_LIBRARY zstd_INCLUDE_DIR +) + +if(zstd_FOUND) + if(zstd_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$") + set(zstd_STATIC_LIBRARY "${zstd_LIBRARY}") + elseif (NOT TARGET zstd::libzstd_shared) + add_library(zstd::libzstd_shared SHARED IMPORTED) + if(MSVC) + # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library". + get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY) + string(REGEX REPLACE "${CMAKE_INSTALL_LIBDIR}$" "${CMAKE_INSTALL_BINDIR}" zstd_DIRNAME "${zstd_DIRNAME}") + get_filename_component(zstd_BASENAME "${zstd_LIBRARY}" NAME) + string(REGEX REPLACE "\\${CMAKE_LINK_LIBRARY_SUFFIX}$" "${CMAKE_SHARED_LIBRARY_SUFFIX}" zstd_BASENAME "${zstd_BASENAME}") + set_target_properties(zstd::libzstd_shared PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" + IMPORTED_LOCATION "${zstd_DIRNAME}/${zstd_BASENAME}" + IMPORTED_IMPLIB "${zstd_LIBRARY}") + unset(zstd_DIRNAME) + unset(zstd_BASENAME) + else() + set_target_properties(zstd::libzstd_shared PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" + IMPORTED_LOCATION "${zstd_LIBRARY}") + endif() + endif() + if(zstd_STATIC_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$" AND + NOT TARGET zstd::libzstd_static) + add_library(zstd::libzstd_static STATIC IMPORTED) + set_target_properties(zstd::libzstd_static PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" + IMPORTED_LOCATION "${zstd_STATIC_LIBRARY}") + endif() +endif() + +unset(zstd_STATIC_LIBRARY_SUFFIX) + +mark_as_advanced(zstd_INCLUDE_DIR zstd_LIBRARY zstd_STATIC_LIBRARY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f604a98c51..4ffed2b4c8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -682,6 +682,11 @@ rocm_set_soversion(MIOpen ${MIOpen_SOVERSION}) clang_tidy_check(MIOpen) +find_package(zstd) +if(zstd_FOUND) + target_link_libraries(MIOpen PRIVATE zstd::libzstd_static) +endif() + function(target_internal_library TARGET) target_link_libraries(${TARGET} PRIVATE ${ARGN}) set(PASS_ARGS debug optimized) From c977e00d2a2b1a27e4734f980a3e857c7f5cba26 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:54:16 -0700 Subject: [PATCH 14/36] Bump cryptography from 41.0.3 to 41.0.4 in /docs/.sphinx (#2408) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.3 to 41.0.4. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/41.0.3...41.0.4) --- updated-dependencies: - dependency-name: cryptography dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 7d0f7f499e..6f10fcce12 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==41.0.3 +cryptography==41.0.4 # via pyjwt deprecated==1.2.13 # via pygithub @@ -92,7 +92,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core>=0.24.0 +rocm-docs-core==0.24.2 # via -r requirements.in smmap==5.0.0 # via gitdb From b9e724a49324369dbdd9ee3a5d08eeb0495871b8 Mon Sep 17 00:00:00 2001 From: mentat <108366729+bghimireamd@users.noreply.github.com> Date: Tue, 26 Sep 2023 13:54:51 -0500 Subject: [PATCH 15/36] [SWDEV-416089][Doc] convolution API in MIOpen is restricted to alpha = 1.0 and beta = 0.0 (#2419) --- include/miopen/miopen.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index f0c0ce1aa6..a59181acf3 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -1715,6 +1715,8 @@ miopenFindConvolutionForwardAlgorithm(miopenHandle_t handle, * Runs the forward convolution layer based on the selected algorithm. The function * miopenFindConvolutionForwardAlgorithm() must have been executed previously to * determine the required memory needed for the workspace and the best convolutional algorithm. + * The scaling parameter alpha (float) and shift parameter beta (float) are only supported for + * alpha = 1 and beta = 0. * * If using Group/Depthwise convolution mode, call miopenSetConvolutionGroupCount() before running * this. @@ -1751,6 +1753,8 @@ MIOPEN_EXPORT miopenStatus_t miopenConvolutionForward(miopenHandle_t handle, /*! @brief Calculate element-wise scale and shift of a tensor via a bias tensor * * This function applies an element-wise bias to a data tensor from an input bias tensor. + * The scaling parameter alpha (float) and shift parameter beta (float) are only supported for + * alpha = 1 and beta = 0. * * @param handle MIOpen handle (input) * @param alpha Floating point scaling factor, allocated on the host (input) @@ -2018,6 +2022,8 @@ miopenConvolutionBackwardWeights(miopenHandle_t handle, /*! @brief Calculates the gradient with respect to the bias. * * Compute the convolution backwards gradient with respect to the bias tensor. + * The scaling parameter alpha (float) and shift parameter beta (float) are only supported for + * alpha = 1 and beta = 0. 
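In caller terms the documented restriction means both scale factors are passed as host-side floats fixed at 1 and 0. A minimal call sketch (descriptor and buffer setup omitted; algo, workSpace, and workSpaceSize are assumed to come from the preceding find step):

const float alpha = 1.0f; // scaling: only 1.0 is accepted
const float beta  = 0.0f; // shift: only 0.0 is accepted
miopenStatus_t rc = miopenConvolutionForward(handle,
                                             &alpha,
                                             xDesc, x,
                                             wDesc, w,
                                             convDesc, algo,
                                             &beta,
                                             yDesc, y,
                                             workSpace, workSpaceSize);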
* * @param handle MIOpen handle (input) * @param alpha Floating point scaling factor, allocated on the host (input) From 3413d2daaeb44b7d6eadcc03033a5954a118491e Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Wed, 27 Sep 2023 07:39:12 -0700 Subject: [PATCH 16/36] [HotFix] zstd dependency on multi Linux distributes (#2417) --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffed2b4c8..71289c8b42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -684,7 +684,7 @@ clang_tidy_check(MIOpen) find_package(zstd) if(zstd_FOUND) - target_link_libraries(MIOpen PRIVATE zstd::libzstd_static) + target_link_libraries(MIOpen PRIVATE $,zstd::libzstd_shared,zstd::libzstd_static>) endif() function(target_internal_library TARGET) From 6d539ee81321121570606e4ef62e6d072775bbd8 Mon Sep 17 00:00:00 2001 From: Reid Kawaja <74506315+reidkwja@users.noreply.github.com> Date: Thu, 28 Sep 2023 13:32:20 -0400 Subject: [PATCH 17/36] [CI][Jenkins] Enable rebooting in CI stages for CI stages with GPU use (#2420) * conf_reboot * configs_chg --- Jenkinsfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 433f4e9622..e7ffb0da1b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -317,7 +317,7 @@ def buildHipClangJobAndReboot(Map conf=[:]){ throw e } finally{ - if (conf.get("needs_reboot", false)) { + if (conf.get("needs_reboot", true)) { reboot() } } @@ -544,7 +544,7 @@ pipeline { stage("HIP Package") { agent{ label rocmnode("nogpu") } steps{ - buildHipClangJobAndReboot( package_build: "true", needs_gpu:false) + buildHipClangJobAndReboot( package_build: "true", needs_gpu:false, needs_reboot:false) } } } @@ -561,7 +561,7 @@ pipeline { build_cmd = "make -j\$(nproc) -k analyze" } steps{ - buildHipClangJobAndReboot(setup_cmd: setup_cmd, build_cmd: build_cmd, needs_gpu:false) + buildHipClangJobAndReboot(setup_cmd: setup_cmd, build_cmd: build_cmd, needs_gpu:false, needs_reboot:false) } } stage('Clang Format') { @@ -588,7 +588,7 @@ pipeline { build_cmd = "make -j\$(nproc) " } steps{ - buildHipClangJobAndReboot(build_fin: "ON", needs_gpu:false, build_install: "true") + buildHipClangJobAndReboot(build_fin: "ON", needs_gpu:false, needs_reboot:false, build_install: "true") } } stage('Perf DB Validity Test') { @@ -598,7 +598,7 @@ pipeline { } steps{ - CheckPerfDbValid(setup_flags: fin_flags, config_targets: "all", build_fin: "ON", needs_gpu:false, build_install: "true") + CheckPerfDbValid(setup_flags: fin_flags, config_targets: "all", build_fin: "ON", needs_gpu:false, needs_reboot:false, build_install: "true") } } stage('HipNoGPU Debug Build Test') { @@ -612,7 +612,7 @@ pipeline { build_cmd = "make -j\$(nproc)" } steps{ - buildHipClangJob( build_type: 'debug', setup_flags: HipNoGPU_flags, build_cmd: build_cmd, needs_gpu:false) + buildHipClangJob( build_type: 'debug', setup_flags: HipNoGPU_flags, build_cmd: build_cmd, needs_gpu:false, needs_reboot:false) } } } From 2e2b37ac816813a5c05b97f3e1381cd3501a64a3 Mon Sep 17 00:00:00 2001 From: Chris Erb Date: Mon, 2 Oct 2023 19:25:17 -0500 Subject: [PATCH 18/36] [Bug Fixes] miopen_rocblas_gemm_ex3 call - invoker cache extra elements - conv direct naive input cast (#2414) * bugfixes miopen_rocblas_gemm_ex3 call would always throw error invoker cache adding extra elements conv direct naive yielding incorrect input cast for kernel arg * clear clang format issue --------- Co-authored-by: Jun Liu --- src/gemm_v2.cpp | 9 +++++---- src/invoker_cache.cpp | 7 
+++++-- src/solver/conv_direct_naive_conv.cpp | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index 804587aac0..19e302f166 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #if MIOPEN_BACKEND_HIP #include @@ -173,6 +174,7 @@ rocblas_status miopen_rocblas_gemm_ex3(const miopen::Handle& handle, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, flags); // gfx90a_alt_impl)); + return rb_status; #pragma clang diagnostic pop #endif MIOPEN_THROW(miopenStatusBadParm, "An appropriate version of rocBLAS is required for this op"); @@ -258,10 +260,9 @@ std::ostream& operator<<(std::ostream& stream, const GemmDescriptor& gemm_desc) << "strideC " << gemm_desc.strideC << ", " << "alpha " << gemm_desc.alpha << ", " << "beta " << gemm_desc.beta << ", " - << "dataType " << gemm_desc.dataType << "a_cast_type" << gemm_desc.a_cast_type - << ", " - << "b_cast_type" << gemm_desc.b_cast_type << ", " - << "} "; + << "dataType " << GetDataType(gemm_desc.dataType) << ", " + << "a_cast_type " << GetDataType(gemm_desc.a_cast_type) << ", " + << "b_cast_type " << GetDataType(gemm_desc.b_cast_type) << "} "; } #if MIOPEN_USE_ROCBLAS diff --git a/src/invoker_cache.cpp b/src/invoker_cache.cpp index f7df500ee5..937a1b2e08 100644 --- a/src/invoker_cache.cpp +++ b/src/invoker_cache.cpp @@ -104,8 +104,11 @@ void InvokerCache::Register(const Key& key, const Invoker& invoker) auto it = invokers.find(key.first); if(it != invokers.end()) it->second.invokers.insert({key.second, invoker}); - auto& item = invokers.insert({key.first, Item{}}).first->second; - item.invokers.insert({key.second, invoker}); + else + { + auto& item = invokers.insert({key.first, Item{}}).first->second; + item.invokers.insert({key.second, invoker}); + } MIOPEN_LOG_I2("Invoker registered for algorithm " << key.first << " and solver " << key.second); } diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 4df8df5874..5c468768fa 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -212,12 +212,12 @@ std::string ConvDirectNaiveConvCompileOption(const ExecutionContext& ctx, ss << " -DWEIGHTS_TYPE=" << miopen::GetDataType(problem.GetWeightsDataType()); ss << " -DOUTPUT_TYPE=" << miopen::GetDataType(ProblemInterpreter::GetOutputDataType(problem)); - const auto in_cast_type = problem.GetInCastType(); + const auto in_cast_type = ProblemInterpreter::GetInputCastType(problem); if(in_cast_type) ss << " -DINPUT_CAST_TYPE=" << miopen::GetDataType(*in_cast_type); const auto wei_cast_type = problem.GetWeightsCastType(); if(wei_cast_type) - ss << " -DWEIGHTS_CAST_TYPE=" << miopen::GetDataType(*(wei_cast_type)); + ss << " -DWEIGHTS_CAST_TYPE=" << miopen::GetDataType(*wei_cast_type); const auto out_cast_type = ProblemInterpreter::GetOutputCastType(problem); if(out_cast_type) ss << " -DOUTPUT_CAST_TYPE=" << miopen::GetDataType(*out_cast_type); From 2065081d68bf9c6cd7a71903942ff121d3483c8c Mon Sep 17 00:00:00 2001 From: Reid Kawaja <74506315+reidkwja@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:44:59 -0400 Subject: [PATCH 19/36] [CI][Jenkins] Disabling smoke stages for CI branch runs (#2422) --- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e7ffb0da1b..b6f2373ece 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -431,15 +431,15 @@ pipeline { description: "") booleanParam( name: "BUILD_SMOKE_FP32", - 
 defaultValue: true,
+ defaultValue: false,
 description: "")
 booleanParam(
 name: "BUILD_SMOKE_AUX1",
- defaultValue: true,
+ defaultValue: false,
 description: "")
 booleanParam(
 name: "BUILD_SMOKE_FP16_BF16_INT8",
- defaultValue: true,
+ defaultValue: false,
 description: "")
 booleanParam(
 name: "BUILD_FULL_TESTS",

From e8b4acf440c2800fd872d3450839c7b4c12907cc Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Tue, 3 Oct 2023 11:45:57 -0500
Subject: [PATCH 20/36] [Tests] disable solver ConvHipImplicitGemm3DGroupWrwXdlops
 on Vega10 (#2432)

---
 src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
index 6fce8a80b8..3c94374b4e 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
@@ -312,7 +312,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
     const std::string& arch = ctx.GetStream().GetDeviceName();
     if(miopen::StartsWith(arch, "gfx11") || miopen::StartsWith(arch, "gfx10"))
         return false;
-    if(arch == "gfx906")
+    if(arch == "gfx906" || arch == "gfx900")
         return false;
     switch(problem.GetInDataType())
     {

From 2065081d68bf9c6cd7a71903942ff121d3483c8c Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Tue, 3 Oct 2023 17:19:26 -0500
Subject: [PATCH 21/36] [Dockerfile] Upgrade cmake so that MIOpen docker can
 compile Composable Kernel (#2424)

* upgrade cmake so that MIOpen docker can compile Composable Kernel
* pin the cmake version to 3.27.5

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 31a0334eeb..d958879d3c 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -106,6 +106,9 @@ RUN ccache -s
 ADD docs/.sphinx/requirements.txt /doc-requirements.txt
 RUN pip3 install -r /doc-requirements.txt
 
+# Composable Kernel requires this version of cmake
+RUN pip3 install --upgrade cmake==3.27.5
+
 # Use parallel job to accelerate tensile build
 # Workaround for Tensile with TargetID feature
 ARG USE_TARGETID="OFF"

From 1605ca8e0e0cd34f4f0930e8a7105f181a5214bd Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Tue, 3 Oct 2023 18:45:10 -0500
Subject: [PATCH 22/36] [Bug Fix] Compilation fix for -DMIOPEN_USE_ROCBLAS=Off
 (#2435)

---
 src/hip/handlehip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hip/handlehip.cpp b/src/hip/handlehip.cpp
index b03f8cd190..ff6d27d26e 100644
--- a/src/hip/handlehip.cpp
+++ b/src/hip/handlehip.cpp
@@ -262,7 +262,7 @@ struct HandleImpl
         rhandle_pool.push_back(std::move(r_ptr));
     }
 #else
-    void add_stream(StreamPtr& s_ptr) { stream_pool.push_back(s_ptr); }
+    void add_stream(StreamPtr s_ptr) { stream_pool.push_back(s_ptr); }
 #endif
 
     // stream_pool used as cache for parallel streams created by MIOpen.
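The one-line change in patch 22 deserves a note: the call site in ReserveExtraStreamsInPool (just below) passes std::move(new_stream), and an rvalue cannot bind to the old non-const StreamPtr& parameter, so the MIOPEN_USE_ROCBLAS=Off path stopped compiling. A minimal standalone sketch of the by-value "sink" signature; the names are illustrative rather than MIOpen's, and the std::move into the vector is a refinement the patch itself does not make:

    #include <memory>
    #include <utility>
    #include <vector>

    using StreamPtr = std::shared_ptr<void>;

    struct StreamPool
    {
        std::vector<StreamPtr> pool;

        // A by-value parameter binds to lvalues and rvalues alike; the old
        // StreamPtr& signature rejected the std::move'd argument at the call site.
        void add_stream(StreamPtr s_ptr) { pool.push_back(std::move(s_ptr)); }
    };

    int main()
    {
        StreamPool p;
        StreamPtr s = std::make_shared<int>(0);
        p.add_stream(s);            // lvalue: copied into the pool
        p.add_stream(std::move(s)); // rvalue: would not compile with StreamPtr&
    }

The patch itself keeps push_back(s_ptr), which copies; either form compiles, the move merely saves one atomic refcount bump.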
StreamPtrPool stream_pool; @@ -362,7 +362,7 @@ void Handle::ReserveExtraStreamsInPool(int cnt) const auto new_rhandle = CreateRocblasHandle(new_stream.get()); this->impl->ms_resourse_ptr->add_resours(std::move(new_stream), std::move(new_rhandle)); #else - this->impl->ms_resourse_ptr->add_resours(std::move(new_stream)); + this->impl->ms_resourse_ptr->add_stream(std::move(new_stream)); #endif } } From c7e0d377ec8d0b7c348bd81d81a64a8d73fe3647 Mon Sep 17 00:00:00 2001 From: mentat <108366729+bghimireamd@users.noreply.github.com> Date: Thu, 5 Oct 2023 09:38:20 -0500 Subject: [PATCH 23/36] bg/lwpmiopen 193 : Integrate CK's batch norm backward training into non-tunable MIOpen solver (#2385) --- src/CMakeLists.txt | 2 + src/batch_norm_api.cpp | 7 - src/include/miopen/batchnorm/solvers.hpp | 20 + .../miopen/solver/implicitgemm_ck_util.hpp | 65 +- src/ocl/batchnormocl.cpp | 9 +- src/solver.cpp | 2 + src/solver/batchnorm/backward_ck.cpp | 251 ++++++ src/solver/batchnorm/forward_training_ck.cpp | 239 ++++++ test/bn_spatial_nhwc_test.cpp | 749 ------------------ test/fusionHost.hpp | 31 +- test/gtest/bn.hpp | 171 ++++ test/gtest/bn_bwd.cpp | 73 ++ test/gtest/bn_fwd_train.cpp | 73 ++ test/gtest/bn_infer.cpp | 8 +- test/gtest/bn_test_data.hpp | 223 +++++- test/gtest/test_operations.hpp | 35 + 16 files changed, 1164 insertions(+), 794 deletions(-) create mode 100644 src/solver/batchnorm/backward_ck.cpp create mode 100644 src/solver/batchnorm/forward_training_ck.cpp delete mode 100644 test/bn_spatial_nhwc_test.cpp create mode 100644 test/gtest/bn_bwd.cpp create mode 100644 test/gtest/bn_fwd_train.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 71289c8b42..abc0679a8a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -152,6 +152,7 @@ set( MIOpen_Source solver/activ/bwd_1.cpp solver/activ/fwd_0.cpp solver/activ/fwd_1.cpp + solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp solver/batchnorm/backward_spatial_multiple.cpp @@ -163,6 +164,7 @@ set( MIOpen_Source solver/batchnorm/forward_per_activation_fused.cpp solver/batchnorm/forward_spatial_multiple.cpp solver/batchnorm/forward_spatial_single.cpp + solver/batchnorm/forward_training_ck.cpp solver/conv_asm_1x1u.cpp solver/conv_asm_1x1u_bias_activ_fused.cpp solver/conv_asm_1x1u_stride2.cpp diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 03db138945..69454b185a 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -243,13 +243,6 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance) { - // bfloat16 not supported for batchnorm operation - if(miopen::deref(xDesc).GetType() == miopenBFloat16 || - miopen::deref(dyDesc).GetType() == miopenBFloat16 || - miopen::deref(dxDesc).GetType() == miopenBFloat16) - { - return miopenStatusNotImplemented; - } MIOPEN_LOG_FUNCTION(handle, bn_mode, diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index c7d050abeb..70d64bb204 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -142,6 +142,26 @@ struct BnCKFwdInference final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; +struct BnCKBwdBackward final : BatchnormSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) 
const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + +struct BnCKFwdTraining final : BatchnormSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + } // namespace batchnorm } // namespace solver diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index 8656bdbabc..318d970170 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -41,8 +41,10 @@ typename ConvPtrsType::iterator FindConvPtrByID(ConvPtrsType& conv_ptrs, }); } -template -std::vector FillValidKernelsIDs(const ProblemDescription& problem) +template +std::vector FillValidKernelsIDs(const ProblemDescriptionType& problem) { const auto args = CKArgsType{problem}; const auto conv_ptrs = DeviceOpType::GetInstances(); @@ -59,8 +61,10 @@ std::vector FillValidKernelsIDs(const ProblemDescription& problem) return valid_kernels; } -template -bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& kernel_id) +template +bool IsCKArgsSupported(const ProblemDescriptionType& problem, const std::string& kernel_id) { auto conv_ptrs = DeviceOpType::GetInstances(); auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); @@ -68,20 +72,25 @@ bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& ker return (ptr_iter != conv_ptrs.end()) && CKArgsType{problem}.IsSupportedBy(*ptr_iter); } -template -bool IsCKApplicable(const ProblemDescription& problem) +template +bool IsCKApplicable(const ProblemDescriptionType& problem) { const auto args = CKArgsType{problem}; - if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; })) - return false; + // if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; })) + // return false; const auto ptrs = DeviceOpType::GetInstances(); return std::any_of( ptrs.begin(), ptrs.end(), [&args](auto& ptr) { return args.IsSupportedBy(ptr); }); } -template -ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::string& kernel_id) +template +ConvSolution InitInvokerFactory(const ProblemDescriptionType& problem, const std::string& kernel_id) { auto conv_ptrs = DeviceOpType::GetInstances(); auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); @@ -112,5 +121,41 @@ ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::st return result; } +template +ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem, + const std::string& kernel_id) +{ + auto conv_ptrs = DeviceOpType::GetInstances(); + auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); + + if(ptr_iter == conv_ptrs.end()) + return {miopenStatusInvalidValue}; + + ConvSolution result; + result.invoker_factory = + [ck_args = CKArgsType{problem}, + sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](const std::vector&) mutable { + return [ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr)]( + const Handle& handle, const AnyInvokeParams& primitive_parameters) { + const auto& data_ctx = primitive_parameters.CastTo(); + auto argument_ptr = ck_args.MakeArgPtr(sh_conv_ptr, data_ctx); 
+ auto invoker_ptr = sh_conv_ptr->MakeInvokerPointer(); + + const auto enable_profiling = handle.IsProfilingEnabled(); + float elapsed_time = + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); + if(enable_profiling) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed_time); + } + }; + }; + return result; +} + } // namespace solver } // namespace miopen diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 6c8a079a2a..6147a827b8 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -131,7 +131,8 @@ void BatchNormForwardTraining(Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; @@ -300,7 +301,7 @@ void BatchNormBackward(Handle& handle, { MIOPEN_THROW(miopenStatusBadParm); } - if(dxDesc.GetType() != dyDesc.GetType() || dyDesc.GetType() != xDesc.GetType()) + if(dxDesc.GetType() != dyDesc.GetType()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -338,7 +339,6 @@ void BatchNormBackward(Handle& handle, tmp.dx = dx; tmp.bnScale = bnScale; tmp.resultBnScaleDiff = resultBnScaleDiff; - tmp.resultBnScaleDiff = resultBnScaleDiff; tmp.resultBnBiasDiff = resultBnBiasDiff; tmp.epsilon = epsilon; tmp.savedMean = savedMean; @@ -346,7 +346,8 @@ void BatchNormBackward(Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; diff --git a/src/solver.cpp b/src/solver.cpp index d83935e646..4cd680dd9c 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -569,6 +569,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) RegisterWithSolver( registry, ++id, ConvHipImplicitGemm3DGroupBwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId()); + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKBwdBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdTraining{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp new file mode 100644 index 0000000000..fba8724990 --- /dev/null +++ b/src/solver/batchnorm/backward_ck.cpp @@ -0,0 +1,251 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
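For readers new to the invoker machinery this patch builds on, the InitAnyInvokerFactory helper above is a nested-lambda, type-erased factory: the outer lambda is built once per solution and owns the selected CK device-op through a shared_ptr, while the inner lambda is the cheap per-call invoker. A self-contained sketch of that shape, with every name an illustrative stand-in rather than a MIOpen or CK API:

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <string>

    struct InvokeParams { int value; };

    using Invoker        = std::function<void(const InvokeParams&)>;
    using InvokerFactory = std::function<Invoker()>;

    // The device-op outlives both lambdas because the shared_ptr is captured
    // by copy, mirroring how sh_conv_ptr is captured above.
    InvokerFactory MakeInvokerFactory(std::shared_ptr<std::string> device_op)
    {
        return [device_op]() -> Invoker {
            return [device_op](const InvokeParams& params) {
                std::cout << "running " << *device_op << " with value " << params.value << '\n';
            };
        };
    }

    int main()
    {
        auto factory = MakeInvokerFactory(std::make_shared<std::string>("bn_instance_0"));
        Invoker run  = factory(); // built once when the solution is constructed
        run(InvokeParams{42});    // called on every API invocation
    }

Capturing the shared_ptr by copy in both lambdas is what lets the returned invoker outlive both the factory and the enclosing solver object.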
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#include +#include +#endif +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_BACK) + +namespace miopen { +namespace solver { +namespace batchnorm { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using index_t = int32_t; + +constexpr index_t Rank = 4; +constexpr index_t NumBatchNormReduceDim = 3; + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; +using BF16 = ushort; + +template +using DeviceOpBNBwdPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormBwd>; + +struct CKArgsBNormBwd +{ + CKArgsBNormBwd(const miopen::batchnorm::ProblemDescription& problem) + { + std::copy(problem.GetXDesc().GetLengths().begin(), + problem.GetXDesc().GetLengths().end(), + lens.begin()); + + std::copy(problem.GetXDesc().GetStrides().begin(), + problem.GetXDesc().GetStrides().end(), + strides.begin()); + arrScaleBiasMeanVarLengths[0] = lens[1]; // get channel + arrScaleBiasMeanVarStrides[0] = 1; + + // prep for CK + std::sort(strides.begin(), strides.end(), std::greater<>()); + std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end()); + } + + CKArgsBNormBwd(const CKArgsBNormBwd&) = default; + CKArgsBNormBwd(CKArgsBNormBwd&&) = default; + CKArgsBNormBwd& operator=(const CKArgsBNormBwd&) = default; + + template + auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const + { + return invoker_ptr->MakeArgumentPointer(lens, + strides, + strides, + strides, + reduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + data_ctx.x, + data_ctx.dy, + data_ctx.bnScale, + data_ctx.savedMean, + data_ctx.savedInvVariance, + epsilon, + PassThrough{}, + data_ctx.dx, + data_ctx.resultBnScaleDiff, + data_ctx.resultBnBiasDiff); + } + + template + bool IsSupportedBy(const ConvPtr& invoker_ptr) const + { + auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::BwdInvokeParams{}); + return invoker_ptr->IsSupportedArgument(arg_ptr.get()); + } + + std::array lens; // inOutLengths + std::array strides; // inOutStrides + std::vector invariantDims; + + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + + double epsilon = 1e-5; + std::array reduceDims{0, 1, 2}; +}; + +template +static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) +{ + return IsCKApplicable, + CKArgsBNormBwd>(problem); +} + +#endif + +bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + std::ignore = fdesc_problem; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_BACK{})) + return false; + if(!bn_problem.IsLayoutNHWC()) + return false; + if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) + return false; + if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) + return false; + + switch(bn_problem.GetXDesc().GetType()) + { + case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenDouble: return CheckCKApplicability(bn_problem); + case miopenHalf: return CheckCKApplicability(bn_problem); + case miopenBFloat16: + return 
CheckCKApplicability(bn_problem); + case miopenInt32: + case miopenInt8: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: MIOPEN_THROW("Unsupported datatype"); + } + return false; +#endif +} + +template +ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem) +{ + const auto& valid_kernel_ids = FillValidKernelsIDs, + CKArgsBNormBwd>(bn_problem); + assert(!valid_kernel_ids.empty()); + const auto& kernel_id = valid_kernel_ids[0]; + return InitAnyInvokerFactory, + CKArgsBNormBwd, + miopen::batchnorm::BwdInvokeParams>(bn_problem, kernel_id); +} + +ConvSolution BnCKBwdBackward::GetSolution( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(bn_problem.GetXDesc().GetType()) + { + + case miopenFloat: return MakeAnyInvokerFactory(bn_problem); + case miopenDouble: return MakeAnyInvokerFactory(bn_problem); + case miopenHalf: return MakeAnyInvokerFactory(bn_problem); + case miopenBFloat16: + return MakeAnyInvokerFactory(bn_problem); + case miopenInt8: + case miopenInt32: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: + MIOPEN_THROW(miopenStatusInternalError, "BnCKBwdBackward operation not for this data type"); + } +#endif + return {}; +} + +} // namespace batchnorm +} // namespace solver +} // namespace miopen diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp new file mode 100644 index 0000000000..a65cec14a9 --- /dev/null +++ b/src/solver/batchnorm/forward_training_ck.cpp @@ -0,0 +1,239 @@ + +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
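The "prep for CK" block in the CKArgsBNormBwd constructor above is worth a worked example: MIOpen passes lengths in N,C,H,W dimension order with NHWC-layout strides, while CK wants the dimensions presented in memory order. Sorting the strides descending and rotating C to the back lines the two up. A runnable check using one Network1 shape (N=16, C=8, H=128, W=256; the stride values are hand-computed for NHWC, and everything here is local to the sketch):

    #include <algorithm>
    #include <array>
    #include <functional>
    #include <iostream>

    int main()
    {
        // One Network1 shape: N=16, C=8, H=128, W=256, stored NHWC.
        std::array<int, 4> lens{16, 8, 128, 256};       // dim order N,C,H,W
        std::array<int, 4> strides{262144, 1, 2048, 8}; // NHWC strides for those dims

        // As in CKArgsBNormBwd: descending strides put the fastest-moving
        // dimension (C) last...
        std::sort(strides.begin(), strides.end(), std::greater<>());
        // ...and the rotate permutes the lengths from N,C,H,W to N,H,W,C to match.
        std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end());

        for(int v : lens) std::cout << v << ' ';     // 16 128 256 8
        std::cout << '\n';
        for(int v : strides) std::cout << v << ' ';  // 262144 2048 8 1
        std::cout << '\n';
    }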
+ * + *******************************************************************************/ + +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#include +#include +#endif +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING) + +namespace miopen { +namespace solver { +namespace batchnorm { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; +using index_t = int32_t; + +constexpr index_t Rank = 4; +constexpr index_t NumBatchNormReduceDim = 3; + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; +using BF16 = ushort; + +template +using DeviceOpBNFwdTrainingPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormFwd>; + +struct CKArgsBNormFwdTraining +{ + CKArgsBNormFwdTraining(const miopen::batchnorm::ProblemDescription& problem) + { + std::copy(problem.GetXDesc().GetLengths().begin(), + problem.GetXDesc().GetLengths().end(), + xyLengths.begin()); + + std::copy(problem.GetXDesc().GetStrides().begin(), + problem.GetXDesc().GetStrides().end(), + xyStrides.begin()); + arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel + arrScaleBiasMeanVarStrides[0] = 1; + + // prep for CK + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); + } + + CKArgsBNormFwdTraining(const CKArgsBNormFwdTraining&) = default; + CKArgsBNormFwdTraining(CKArgsBNormFwdTraining&&) = default; + CKArgsBNormFwdTraining& operator=(const CKArgsBNormFwdTraining&) = default; + + template + auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const + { + return invoker_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + data_ctx.x, + data_ctx.bnScale, + data_ctx.bnBias, + data_ctx.epsilon, + PassThroughOp{}, + data_ctx.y, + data_ctx.resultSaveMean, + data_ctx.resultSaveInvVariance, + data_ctx.expAvgFactor, + data_ctx.resultRunningMean, + data_ctx.resultRunningVariance); + } + + template + bool IsSupportedBy(const ConvPtr& invoker_ptr) const + { + auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::InvokeParams{}); + return invoker_ptr->IsSupportedArgument(arg_ptr.get()); + } + + std::array xyLengths; + std::array xyStrides; + std::vector invariantDims; + + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + + std::array reduceDims{0, 1, 2}; +}; + +template +static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) +{ + return IsCKApplicable, + CKArgsBNormFwdTraining>(problem); +} +#endif + +bool BnCKFwdTraining::IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = context; + std::ignore = fdesc_problem; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING{})) + return false; + if(!bn_problem.IsLayoutNHWC()) + return false; + if(!ck_utility::is_ck_supported_hardware(context.GetStream())) + return false; + + switch(bn_problem.GetXDesc().GetType()) + { + case miopenHalf: return CheckCKApplicability(bn_problem); + case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenDouble: return CheckCKApplicability(bn_problem); + case 
miopenBFloat16: return CheckCKApplicability(bn_problem); + case miopenInt32: + case miopenInt8: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: MIOPEN_THROW("BnCKFwdTraining operation does not supprot this data type"); + } + return false; +#endif +} + +template +ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem) +{ + const auto& valid_kernel_ids = FillValidKernelsIDs, + CKArgsBNormFwdTraining>(bn_problem); + assert(!valid_kernel_ids.empty()); + const auto& kernel_id = valid_kernel_ids[0]; + return InitAnyInvokerFactory, + CKArgsBNormFwdTraining, + miopen::batchnorm::InvokeParams>(bn_problem, kernel_id); +} + +ConvSolution BnCKFwdTraining::GetSolution( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(bn_problem.GetXDesc().GetType()) + { + + case miopenFloat: return MakeAnyInvokerFactory(bn_problem); + case miopenDouble: return MakeAnyInvokerFactory(bn_problem); + case miopenHalf: return MakeAnyInvokerFactory(bn_problem); + case miopenBFloat16: return MakeAnyInvokerFactory(bn_problem); + case miopenInt8: + case miopenInt32: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: + MIOPEN_THROW(miopenStatusInternalError, "BnCKFwdTraining operation not for this data type"); + } +#endif + return {}; +} + +} // namespace batchnorm +} // namespace solver +} // namespace miopen diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp deleted file mode 100644 index abca57e7ce..0000000000 --- a/test/bn_spatial_nhwc_test.cpp +++ /dev/null @@ -1,749 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
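Both new solvers gate IsApplicable in the same order: environment kill switch first, then layout, then hardware, then data type. A hedged sketch of that skeleton follows; std::getenv stands in for miopen::IsDisabled, whose real implementation parses 0/1/yes/no values, so treating "set at all" as disabled is a simplification made only for this sketch:

    #include <cstdlib>

    // Stand-in for miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING{}).
    static bool IsDisabledByEnv(const char* name) { return std::getenv(name) != nullptr; }

    // Sketch of the gating order used by BnCKFwdTraining::IsApplicable.
    bool IsApplicableSketch(bool is_nhwc, bool is_ck_supported_hw)
    {
        if(IsDisabledByEnv("MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING"))
            return false; // user veto always wins
        if(!is_nhwc)
            return false; // these CK batch-norm solvers are NHWC-only
        if(!is_ck_supported_hw)
            return false; // hardware rejections happen here
        return true;      // the per-data-type CheckCKApplicability switch follows
    }

    int main() { return IsApplicableSketch(true, true) ? 0 : 1; }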
- * - *******************************************************************************/ - -#include "driver.hpp" -#include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "test.hpp" -#include "verify.hpp" -#include "random.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIO_BN_TEST_EXPAVGFACTOR 0.1 -#define MIO_BN_TEST_EPSILON 1e-5 -#define MIO_BN_USE_MIX_PREC 1 -#if MIO_BN_USE_MIX_PREC == 1 -#define PREC_TYPE float -#else -#define PREC_TYPE T -#endif - -template -struct verify_forward_train_bn_spatial -{ - const tensor input; - const tensor scale; - const tensor shift; - - std::tuple, tensor, tensor, tensor, tensor> cpu() const - { - double epsilon = MIO_BN_TEST_EPSILON; - double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - - std::size_t rs_n_batch, rs_channels, rs_height, rs_width; - auto derivedBnDesc = - miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - tensor runMean; - tensor runVar; - if(input.desc.GetType() == miopenFloat) - { - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - } - else - { - prng::reset_seed(); - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - const U Data_scale = static_cast(0.001); - for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) - { - runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - runVar[i] = prng::gen_descreet_unsigned(Data_scale, 100); - } - } - auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto out = input; - std::fill(out.begin(), out.end(), 0); - - const auto nhw = double(height * width * n_batch); - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - double variance_accum = 0.; - double mean_accum = 0.; - double invVar = 0.; - double newRunMean = 0.; - double adjust = 0.; - - std::vector variance_accum_arr(height, 0.0); - std::vector mean_accum_arr(height, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - mean_accum_arr[row] += input(bidx, cidx, row, column); - } - } - } - for(std::size_t i = 0; i < height; i++) - mean_accum += mean_accum_arr[i]; - - mean_accum /= nhw; - - elemStd = 0.; - variance_accum = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - out(bidx, cidx, row, column) = elemStd = - input(bidx, cidx, row, column) - mean_accum; - variance_accum_arr[row] += elemStd * elemStd; - } - } - } - for(std::size_t i = 0; i < height; i++) - variance_accum += variance_accum_arr[i]; - - variance_accum /= nhw; - invVar = 1.0 / sqrt(variance_accum + epsilon); - - for(std::size_t bidx = 0; bidx < 
n_batch; bidx++) - { - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - out(bidx, cidx, row, column) = - scale(0, 0, 0, cidx) * (invVar * out(bidx, cidx, row, column)) + - shift(0, 0, 0, cidx); - } - } - } - - saveMean(0, 0, 0, cidx) = mean_accum; - saveInvVar(0, 0, 0, cidx) = invVar; - - newRunMean = runMean(0, 0, 0, cidx) * (1 - expAvgFactor); - runMean(0, 0, 0, cidx) = mean_accum * expAvgFactor + newRunMean; - adjust = (n_batch * height * width == 1) ? variance_accum - : (nhw / (nhw - 1)) * variance_accum; - runVar(0, 0, 0, cidx) = - (1 - expAvgFactor) * runVar(0, 0, 0, cidx) + expAvgFactor * adjust; - }); - - return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); - } - - std::tuple, tensor, tensor, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - - auto out = input; - std::fill(out.begin(), out.end(), 0); - - std::size_t rs_n_batch, rs_channels, rs_height, rs_width; - auto derivedBnDesc = - miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - tensor runMean; - tensor runVar; - if(input.desc.GetType() == miopenFloat) - { - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - } - else - { - prng::reset_seed(); - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - const U Data_scale = static_cast(0.001); - for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) - { - runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - runVar[i] = prng::gen_descreet_unsigned(Data_scale, 100); - } - } - - auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - auto in_dev = handle.Write(input.data); - auto scale_dev = handle.Write(scale.data); - auto shift_dev = handle.Write(shift.data); - - auto runMean_dev = handle.Write(runMean.data); - auto runVar_dev = handle.Write(runVar.data); - auto saveMean_dev = handle.Create(channels); - auto saveInvVar_dev = handle.Create(channels); - auto out_dev = handle.Create(n_batch * channels * height * width); - - double epsilon = MIO_BN_TEST_EPSILON; - double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; - - float alpha = 1.0; - float beta = 0.0; - - miopen::BatchNormForwardTraining(handle, - miopenBNSpatial, - &alpha, - &beta, - input.desc, - in_dev.get(), - out.desc, - out_dev.get(), - scale.desc, - scale_dev.get(), - shift_dev.get(), - expAvgFactor, - runMean_dev.get(), - runVar_dev.get(), - epsilon, - saveMean_dev.get(), - saveInvVar_dev.get()); - - saveMean.data = handle.Read(saveMean_dev, saveMean.data.size()); - saveInvVar.data = handle.Read(saveInvVar_dev, saveInvVar.data.size()); - runMean.data = handle.Read(runMean_dev, runMean.data.size()); - runVar.data = handle.Read(runVar_dev, runVar.data.size()); - out.data = handle.Read(out_dev, out.data.size()); - - return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); - } - - void fail(int badtensor) const - { - std::cout << "Forward Train Spatial Batch 
Normalization: " << std::endl; - std::cout << "Input tensor: " << input.desc.ToString() << std::endl; - - switch(badtensor) - { - case(0): std::cout << "Output tensor output failed verification." << std::endl; break; - case(1): std::cout << "Running Mean output tensor failed verification." << std::endl; break; - case(2): - std::cout << "Running Variance output tensor failed verification." << std::endl; - break; - case(3): std::cout << "Saved Mean tensor failed verification." << std::endl; break; - case(4): std::cout << "Saved Variance tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct verify_backward_bn_spatial_recalc -{ - const tensor x_input; - const tensor dy_input; - const tensor scale; - - std::tuple, tensor, tensor> cpu() const - { - double epsilon = MIO_BN_TEST_EPSILON; - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - const auto nhw = double(height * width * n_batch); - - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double invVar = 0.; - double dyelem = 0.; - double variance = 0.; - - std::vector xhat(height * width * n_batch, 0.0); - std::vector variance_accum_arr(height, 0.0); - std::vector mean_accum_arr(height, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - mean_accum_arr[row] += x_input(bidx, cidx, row, column); - } - } - } - for(std::size_t i = 0; i < height; i++) - mean += mean_accum_arr[i]; - - mean /= nhw; - - elemStd = 0.; - variance = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - elemStd = x_input(bidx, cidx, row, column) - mean; - variance_accum_arr[row] += elemStd * elemStd; - } - } - } - for(std::size_t i = 0; i < height; i++) - variance += variance_accum_arr[i]; - - variance /= nhw; - invVar = 1. 
/ double(sqrt(variance + epsilon)); - - dscale(0, cidx, 0, 0) = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - elemStd = x_input(bidx, cidx, row, column) - mean; - xhat[xhat_index] = elemStd * invVar; - dyelem = dy_input(bidx, cidx, row, column); - dshift_accum_arr[row] += dyelem; - dscale_accum_arr[row] += xhat[xhat_index] * dyelem; - } - } - } - for(std::size_t i = 0; i < height; i++) - { - dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; - dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; - } - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - - double tmp1 = - nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); - } - } - } - }); - - return std::make_tuple(dx_out, dscale, dshift); - } - - std::tuple, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - float alpha = 1.0; - float beta = 0.0; - - auto xin_dev = handle.Write(x_input.data); - auto dyin_dev = handle.Write(dy_input.data); - auto scale_dev = handle.Write(scale.data); - auto dscale_dev = handle.Write(dscale.data); - auto dshift_dev = handle.Write(dshift.data); - auto dx_out_dev = handle.Write(dx_out.data); - - double epsilon = MIO_BN_TEST_EPSILON; - - miopen::BatchNormBackward(handle, - miopenBNSpatial, - &alpha, - &beta, - &alpha, - &beta, - x_input.desc, - xin_dev.get(), - dy_input.desc, - dyin_dev.get(), - dx_out.desc, - dx_out_dev.get(), - scale.desc, - scale_dev.get(), - dscale_dev.get(), - dshift_dev.get(), - epsilon, - nullptr, - nullptr); - - dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); - dscale.data = handle.Read(dscale_dev, dscale.data.size()); - dshift.data = handle.Read(dshift_dev, dshift.data.size()); - - return std::make_tuple(dx_out, dscale, dshift); - } - - void fail(int badtensor) const - { - std::cout << "Backward Batch Spatial Normalization Recalc Mean and Variance: " << std::endl; - std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; - std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; - switch(badtensor) - { - case(0): - std::cout << "Delta X output tensor output failed verification." << std::endl; - break; - case(1): std::cout << "Delta scale output tensor failed verification." 
<< std::endl; break; - case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct verify_backward_bn_spatial_use_saved -{ - const tensor x_input; - const tensor dy_input; - const tensor scale; - const tensor savedMean; - const tensor savedInvVar; - std::tuple, tensor, tensor> cpu() const - { - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - const auto nhw = double(height * width * n_batch); - - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = savedMean(0, 0, 0, cidx); - double invVar = savedInvVar(0, 0, 0, cidx); - double dyelem = 0.; - - std::vector xhat(n_batch * height * width, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - dscale(0, cidx, 0, 0) = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - elemStd = x_input(bidx, cidx, row, column) - mean; - xhat[xhat_index] = elemStd * invVar; - dyelem = dy_input(bidx, cidx, row, column); - dshift_accum_arr[row] += dyelem; - dscale_accum_arr[row] += xhat[xhat_index] * dyelem; - } - } - } - for(std::size_t i = 0; i < height; i++) - { - dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; - dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; - } - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - - double tmp1 = - nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); - } - } - } - }); - - return std::make_tuple(dx_out, dscale, dshift); - } - - std::tuple, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, 
ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - float alpha = 1.0; - float beta = 0.0; - - auto xin_dev = handle.Write(x_input.data); - auto dyin_dev = handle.Write(dy_input.data); - auto scale_dev = handle.Write(scale.data); - auto dscale_dev = handle.Write(dscale.data); - auto dshift_dev = handle.Write(dshift.data); - auto dx_out_dev = handle.Write(dx_out.data); - auto savedMean_dev = handle.Write(savedMean.data); - auto savedInvVar_dev = handle.Write(savedInvVar.data); - - double epsilon = MIO_BN_TEST_EPSILON; - - miopen::BatchNormBackward(handle, - miopenBNSpatial, - &alpha, - &beta, - &alpha, - &beta, - x_input.desc, - xin_dev.get(), - dy_input.desc, - dyin_dev.get(), - dx_out.desc, - dx_out_dev.get(), - scale.desc, - scale_dev.get(), - dscale_dev.get(), - dshift_dev.get(), - epsilon, - savedMean_dev.get(), - savedInvVar_dev.get()); - - dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); - dscale.data = handle.Read(dscale_dev, dscale.data.size()); - dshift.data = handle.Read(dshift_dev, dshift.data.size()); - - return std::make_tuple(dx_out, dscale, dshift); - } - - void fail(int badtensor) const - { - std::cout << "Backward Batch Spatial Normalization Use Saved Mean and Variance: " - << std::endl; - std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; - std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; - switch(badtensor) - { - case(0): - std::cout << "Delta X output tensor output failed verification." << std::endl; - break; - case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break; - case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct batch_norm_spatial_nhwc_driver : test_driver -{ - tensor input; - tensor scale; - tensor shift; - batch_norm_spatial_nhwc_driver() - { - this->batch_factor = 4; - add(input, - "input", - get_bn_spatial_input_tensor( - tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 
5 : 17})); - } - - void run() - { - std::size_t n, c, h, w; - std::tie(n, c, h, w) = miopen::tien<4>(input.desc.GetLengths()); - - std::size_t ssn, ssc, ssh, ssw; - auto derivedBnDesc = miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, c}, - std::vector{c, c, c, 1}); - std::tie(ssn, ssh, ssw, ssc) = miopen::tien<4>(derivedBnDesc.GetLengths()); - - std::vector new_len = input.desc.GetLengths(); - std::vector new_str; - miopen::tensor_layout_to_strides(new_len, "NCHW", "NHWC", new_str); - input.desc = miopen::TensorDescriptor(miopen_type{}, new_len, new_str); - - if(input.desc.GetType() == miopenFloat) - { - scale = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); - shift = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); - } - else - { - scale = tensor{ssn, ssh, ssw, ssc}; - shift = tensor{ssn, ssh, ssw, ssc}; - - const PREC_TYPE Data_scale = static_cast(1e-4); - for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++) - { - scale[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - shift[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - } - for(std::size_t i = 0; i < input.desc.GetElementSize(); i++) - { - input[i] = prng::gen_descreet_uniform_sign(static_cast(1e-5), 100); - } - } - - auto outpair = verify(verify_forward_train_bn_spatial{input, scale, shift}); - - auto dy_input = std::get<0>(outpair.second); - for(std::size_t bidx = 0; bidx < n; bidx++) - { - for(std::size_t cidx = 0; cidx < c; cidx++) - { - for(std::size_t row = 0; row < h; row++) - { - for(std::size_t column = 0; column < w; column++) - { - dy_input(bidx, cidx, row, column) *= 0.1; - } - } - } - } - this->tolerance = 80 * input.desc.GetElementSize(); - verify(verify_backward_bn_spatial_recalc{input, dy_input, scale}); - - auto savedMean = std::get<3>(outpair.second); - auto savedInvVar = std::get<4>(outpair.second); - verify(verify_backward_bn_spatial_use_saved{ - input, dy_input, scale, savedMean, savedInvVar}); - } -}; - -int main(int argc, const char* argv[]) -{ - test_drive(argc, argv); - return 0; -} diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index cffefea0e2..5374abd1fa 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -36,7 +36,6 @@ #include #include #include -// #include "driver.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" #include "verify.hpp" @@ -203,17 +202,17 @@ void batchNormPerActivHostInference(const tensor& input, }); } -template +template void batchNormSpatialHostFwdTrain(const tensor& input, tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -279,15 +278,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -335,7 +334,7 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double 
tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 0b763da411..22f8391fe6 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -84,3 +84,174 @@ struct BNInferTest : public ::testing::TestWithParam +struct BNBwdTest : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); + + auto&& handle = get_handle(); + miopenBatchNormalizationBackward(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + + std::fill(bn_bwd_test_data.output.begin(), + bn_bwd_test_data.output.end(), + std::numeric_limits::quiet_NaN()); + } + + void TearDown() override + { + if(test_skipped) + return; + auto&& handle = get_handle(); + bn_bwd_test_data.output.data = + handle.Read(bn_bwd_test_data.out_dev, bn_bwd_test_data.output.data.size()); + bn_bwd_test_data.dScale.data = handle.Read(bn_bwd_test_data.dScale_dev, + bn_bwd_test_data.dScale.data.size()); + bn_bwd_test_data.dBias.data = + handle.Read(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); + + test::ComputeCPUBNBwd(bn_bwd_test_data); + + // using tolerance = 1e-4 since this the tolerance CK uses + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4); + test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4); + test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 1e-4); + } + + BNTestCase bn_config; + bool test_skipped = false; + BNBwdTestData + bn_bwd_test_data; + miopenTensorLayout_t tensor_layout; +}; + +template +struct BNFwdTrainTest + : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); + + auto&& handle = get_handle(); + miopenBatchNormalizationForwardTraining(&handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + + std::fill(bn_fwd_train_test_data.output.begin(), + bn_fwd_train_test_data.output.end(), + std::numeric_limits::quiet_NaN()); + 
std::fill(bn_fwd_train_test_data.saveMean_ref.begin(), + bn_fwd_train_test_data.saveMean_ref.end(), + std::numeric_limits::quiet_NaN()); + std::fill(bn_fwd_train_test_data.saveVariance_ref.begin(), + bn_fwd_train_test_data.saveVariance_ref.end(), + std::numeric_limits::quiet_NaN()); + } + + void TearDown() override + { + if(test_skipped) + return; + auto&& handle = get_handle(); + bn_fwd_train_test_data.output.data = handle.Read( + bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size()); + + bn_fwd_train_test_data.saveMean.data = handle.Read( + bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size()); + bn_fwd_train_test_data.saveVariance.data = + handle.Read(bn_fwd_train_test_data.saveVariance_dev, + bn_fwd_train_test_data.saveVariance_ref.data.size()); + bn_fwd_train_test_data.runMean.data = handle.Read( + bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size()); + bn_fwd_train_test_data.runVariance.data = + handle.Read(bn_fwd_train_test_data.runVariance_dev, + bn_fwd_train_test_data.runVariance_ref.data.size()); + test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data); + + // 4e-3 is tolerance used by CK kernel. + test::CompareTensor( + bn_fwd_train_test_data.output, bn_fwd_train_test_data.ref_out, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.saveMean, bn_fwd_train_test_data.saveMean_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.saveVariance, bn_fwd_train_test_data.saveVariance_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.runMean, bn_fwd_train_test_data.runMean_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.runVariance, bn_fwd_train_test_data.runVariance_ref, 4e-3); + } + + BNTestCase bn_config; + bool test_skipped = false; + BNFwdTrainTestData + bn_fwd_train_test_data; + miopenTensorLayout_t tensor_layout; +}; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp new file mode 100644 index 0000000000..722b42e872 --- /dev/null +++ b/test/gtest/bn_bwd.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
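One detail of the fixtures above is easy to overlook: SetUp enqueues the kernel and then poisons the host-side result buffers with quiet NaNs, so any element the read-back in TearDown fails to overwrite stays NaN and fails the comparison. The trick in isolation, plain C++ with no MIOpen types:

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    int main()
    {
        std::vector<float> host(8);
        std::fill(host.begin(), host.end(), std::numeric_limits<float>::quiet_NaN());

        // Pretend the device read-back only covered the first half of the buffer.
        for(int i = 0; i < 4; ++i)
            host[i] = 1.0f;

        const bool leftovers = std::any_of(
            host.begin(), host.end(), [](float v) { return std::isnan(v); });
        return leftovers ? 1 : 0; // non-zero exit: some element was never written
    }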
+ * + *******************************************************************************/ + +#include "bn.hpp" + +struct BNBwdTestTestHalf + : BNBwdTest +{ +}; + +struct BNBwdTestFloat : BNBwdTest +{ +}; + +struct BNBwdTestBFloat16 : BNBwdTest +{ +}; + +struct BNBwdTestDouble : BNBwdTest +{ +}; + +TEST_P(BNBwdTestTestHalf, BnBwdCKHalf) {} + +TEST_P(BNBwdTestFloat, BnBwdCKFloat) {} + +// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig +// only supports half and float +TEST_P(BNBwdTestBFloat16, DISABLED_BnBwdCKBFloat16) {} +TEST_P(BNBwdTestDouble, DISABLED_BnBwdCKDouble) {} + +INSTANTIATE_TEST_SUITE_P(BNBwdTestTestHalfNHWCSuite, + BNBwdTestTestHalf, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNBwdTestFloatNHWCSuite, + BNBwdTestFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNBwdTestBFloat16NHWCSuite, + BNBwdTestBFloat16, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNBwdTestDoubleNHWCSuite, + BNBwdTestDouble, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp new file mode 100644 index 0000000000..4a4dd4c728 --- /dev/null +++ b/test/gtest/bn_fwd_train.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
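The instantiation pattern used by bn_bwd.cpp above and bn_fwd_train.cpp below (one fixture per data type, a shared config list, testing::Combine to cross configs with layouts) reduces to a short gtest example. Config and Layout are simplified stand-ins for the real BNTestCase and miopenTensorLayout_t, and the binary links against gtest_main:

    #include <gtest/gtest.h>
    #include <tuple>
    #include <vector>

    struct Config { int n, c, h, w; };
    enum class Layout { NHWC };

    static std::vector<Config> Network1() { return {{16, 8, 128, 256}, {64, 2048, 7, 7}}; }

    class BnBwdParam : public ::testing::TestWithParam<std::tuple<Config, Layout>> {};

    TEST_P(BnBwdParam, RunsOneConfig)
    {
        const auto& [cfg, layout] = GetParam();
        (void)layout;
        EXPECT_GT(static_cast<long>(cfg.n) * cfg.c * cfg.h * cfg.w, 0);
    }

    // Each config is crossed with each layout, so this runs one test per pair.
    INSTANTIATE_TEST_SUITE_P(Network1NHWC,
                             BnBwdParam,
                             testing::Combine(testing::ValuesIn(Network1()),
                                              testing::Values(Layout::NHWC)));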
+ * + *******************************************************************************/ + +#include "bn.hpp" + +struct BNFwdTrainTestHalf + : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestFloat : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestDouble : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestBFloat16 : BNFwdTrainTest +{ +}; + +TEST_P(BNFwdTrainTestHalf, BnFwdTrainCKHalf) {} + +TEST_P(BNFwdTrainTestFloat, BnFwdTrainCKFloat) {} + +// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig +// only supports half and float +TEST_P(BNFwdTrainTestDouble, DISABLED_BnFwdTrainCKDouble) {} +TEST_P(BNFwdTrainTestBFloat16, DISABLED_BnFwdTrainCKBFloat16) {} + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestHalfNHWCSuite, + BNFwdTrainTestHalf, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite, + BNFwdTrainTestFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestDoubleNHWCSuite, + BNFwdTrainTestDouble, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestBFloat16NHWCSuite, + BNFwdTrainTestBFloat16, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 6598ef7169..0dceaa1ba5 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -43,14 +43,14 @@ struct BNInferTestBFloat16 : BNInferTest +#include "random.hpp" #include #include @@ -60,7 +59,8 @@ std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, @@ -125,7 +125,7 @@ struct BNTestData { input = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; output = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; - ref_out = output; + ref_out = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; } void InitTensorsWithRandValue() @@ -226,3 +226,218 @@ struct BNInferTestData : public BNTestData estVariance_dev = handle.Write(estVariance.data); } }; + +template +struct BNBwdTestData : public BNTestData +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + BNTestData::SetUpImpl(config, t_layout); + CreateTensors(); + InitTensorsWithRandValue(); + WriteToGPU(); + } + + tensor bnScale; + + tensor savedMean; + tensor savedInvVar; + + tensor dy; + tensor dScale; + tensor dBias; + tensor dScale_ref; + tensor dBias_ref; + + miopen::Allocator::ManageDataPtr bnScale_dev; + miopen::Allocator::ManageDataPtr savedMean_dev; + miopen::Allocator::ManageDataPtr savedInvVar_dev; + + miopen::Allocator::ManageDataPtr dy_dev; + miopen::Allocator::ManageDataPtr dScale_dev; + miopen::Allocator::ManageDataPtr dBias_dev; + miopen::Allocator::ManageDataPtr dScale_ref_dev; + miopen::Allocator::ManageDataPtr dBias_ref_dev; + double epsilon = std::numeric_limits::epsilon(); + + float alphaDataDiff = static_cast(1), betaDataDiff =
static_cast(0); + float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + +private: + void CreateTensors() + { + dy = tensor{miopen_type{}, + BNTestData::tensor_layout, + BNTestData::bn_config.GetInput()}; + + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + bnScale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedMean = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedInvVar = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dBias = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale_ref = dScale; + dBias_ref = dBias; + } + + void InitTensorsWithRandValue() + { + auto gen_value = [](auto...) { + return prng::gen_descreet_uniform_sign(static_cast(1e-2), 100); + }; + dy.generate(gen_value); + bnScale.generate(gen_value); + savedMean.generate(gen_value); + + auto gen_var = [](auto...) { + return static_cast(1e-2) * + static_cast(prng::gen_0_to_B(100) + 1); + }; + savedInvVar.generate(gen_var); + + std::fill(dScale.begin(), dScale.end(), 0.); + std::fill(dBias.begin(), dBias.end(), 0.); + + std::fill(dScale_ref.begin(), dScale_ref.end(), 0.); + std::fill(dBias_ref.begin(), dBias_ref.end(), 0.); + } + void WriteToGPU() + { + auto&& handle = get_handle(); + + bnScale_dev = handle.Write(bnScale.data); + savedMean_dev = handle.Write(savedMean.data); + savedInvVar_dev = handle.Write(savedInvVar.data); + dy_dev = handle.Write(dy.data); + + dScale_dev = handle.Write(dScale.data); + dBias_dev = handle.Write(dBias.data); + } +}; + +template +struct BNFwdTrainTestData : public BNTestData +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + BNTestData::SetUpImpl(config, t_layout); + CreateTensors(); + InitTensorsWithRandValue(); + WriteToGPU(); + } + + tensor scale; + tensor shift; + tensor saveMean; + tensor saveVariance; + tensor runMean; + tensor runVariance; + + tensor saveMean_ref; + tensor saveVariance_ref; + tensor runMean_ref; + tensor runVariance_ref; + + miopen::Allocator::ManageDataPtr scale_dev; + miopen::Allocator::ManageDataPtr shift_dev; // bias + miopen::Allocator::ManageDataPtr saveMean_dev; + miopen::Allocator::ManageDataPtr saveVariance_dev; + miopen::Allocator::ManageDataPtr runMean_dev; + miopen::Allocator::ManageDataPtr runVariance_dev; + double epsilon = 1.0e-5; + double averageFactor = 0.1; + float alpha = static_cast(1.0f); + float beta = static_cast(0); + const float activ_alpha = static_cast(0.5f); + const float activ_beta = static_cast(0.5f); + const float activ_gamma = static_cast(0.5f); + +private: + void CreateTensors() + { + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveVariance = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runVariance = + tensor{miopen_type{}, + 
BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + } + + void InitTensorsWithRandValue() + { + auto gen_value = [](auto...) { + return prng::gen_descreet_uniform_sign(static_cast(1e-2), 100); + }; + scale.generate(gen_value); + shift.generate(gen_value); + + auto gen_var = [](auto...) { + return static_cast(1e-2) * + static_cast(prng::gen_0_to_B(100) + 1); + }; + runMean.generate(gen_var); + runVariance.generate(gen_var); + + saveMean_ref = saveMean; + saveVariance_ref = saveVariance; + runMean_ref = runMean; + runVariance_ref = runVariance; + } + void WriteToGPU() + { + auto&& handle = get_handle(); + scale_dev = handle.Write(scale.data); + shift_dev = handle.Write(shift.data); + saveMean_dev = handle.Write(saveMean.data); + saveVariance_dev = handle.Write(saveVariance.data); + runMean_dev = handle.Write(runMean.data); + runVariance_dev = handle.Write(runVariance.data); + } +}; diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index d1528fe2bb..da41212302 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -38,6 +38,41 @@ void ComputeCPUBNInference(DLModule& dl_module) dl_module.estVariance); } +template +void ComputeCPUBNBwd(DLModule& dl_module) +{ + batchNormSpatialHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.ref_out, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); +} + +template +void ComputeCPUBNFwdTrain(DLModule& dl_module) +{ + batchNormSpatialHostFwdTrain(dl_module.input, + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.averageFactor, + dl_module.saveMean_ref, + dl_module.saveVariance_ref, + dl_module.runMean_ref, + dl_module.runVariance_ref); +} + template void CompareTensor(const tensor& output, const tensor& ref_out, From 14118a413eec00071800d4efa48ef0199bbbabd5 Mon Sep 17 00:00:00 2001 From: amberhassaan Date: Thu, 5 Oct 2023 18:27:19 -0400 Subject: [PATCH 24/36] Reference kernel for 3D convolution for non-packed tensors (#2334) --- src/CMakeLists.txt | 1 + src/hip/hip_build_utils.cpp | 2 +- src/include/miopen/hipoc_kernel.hpp | 24 +- .../miopen/solver/conv_direct_naive_conv.hpp | 95 +- .../gpu_reference_kernel/fp8_kern_types.h | 6 +- .../gpu_reference_kernel/naive_conv.cpp | 1719 +++++++++++------ src/kernels/stride_array.hpp | 86 + src/solver/conv_direct_naive_conv.cpp | 57 +- src/solver/conv_direct_naive_conv_bwd.cpp | 39 + src/solver/conv_direct_naive_conv_fwd.cpp | 31 +- src/solver/conv_direct_naive_conv_wrw.cpp | 35 + test/gpu_reference_kernel.cpp | 3 +- test/gtest/conv3d_test_case.hpp | 112 ++ test/gtest/group_conv3d_bwd.cpp | 2 +- test/gtest/group_conv3d_bwd.hpp | 88 +- test/gtest/group_conv3d_fwd.cpp | 2 +- test/gtest/group_conv3d_fwd.hpp | 88 +- test/gtest/group_conv3d_wrw.cpp | 2 +- test/gtest/group_conv3d_wrw.hpp | 88 +- test/gtest/group_solver.hpp | 6 +- 20 files changed, 1633 insertions(+), 853 deletions(-) create mode 100644 src/kernels/stride_array.hpp create mode 100644 test/gtest/conv3d_test_case.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index abc0679a8a..7866ad1a5a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -390,6 +390,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/workaround_issue_1431.hpp kernels/hip_f8_impl.hpp kernels/hip_float8.hpp + kernels/stride_array.hpp ) set(MIOPEN_KERNELS diff --git a/src/hip/hip_build_utils.cpp b/src/hip/hip_build_utils.cpp index 8f6f9f0c50..86cf3a7272 100644 --- 
a/src/hip/hip_build_utils.cpp +++ b/src/hip/hip_build_utils.cpp @@ -73,7 +73,7 @@ static boost::filesystem::path HipBuildImpl(boost::optional& tmp_dir, auto env = std::string(""); if(params.find("-std=") == std::string::npos) - params += " --std=c++11"; + params += " --std=c++17"; #if HIP_PACKAGE_VERSION_FLAT < 4001000000ULL params += " --cuda-gpu-arch=" + lots.device; diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index ba9992bab3..73ac77f160 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -26,14 +26,15 @@ #ifndef GUARD_MIOPEN_HIPOC_KERNEL_HPP #define GUARD_MIOPEN_HIPOC_KERNEL_HPP -#include -#include #include #include #include #include + +#include +#include +#include #include -#include namespace miopen { @@ -47,29 +48,20 @@ inline HipEventPtr make_hip_event() #if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 -#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 template struct KernelArgsPair { - static const int alignment = sizeof(U); - static const int padding = (alignment - sizeof(T) % alignment) % alignment; - static const int second_index = sizeof(T) + padding; + constexpr static auto alignU = alignof(U); + constexpr static auto padding = (alignU - (sizeof(T) % alignU)) % alignU; + constexpr static auto second_index = sizeof(T) + padding; KernelArgsPair(T x, U y) { new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) new(buffer + second_index) U(y); } + alignas(U) char buffer[second_index + sizeof(U)] = {}; }; -#else -template -struct KernelArgsPair -{ - KernelArgsPair(T x, U y) : first(x), second(y) {} - T first; - U second; -}; -#endif template struct KernelArgsPack; diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 7bad52ff9e..6d935b249d 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -25,9 +25,15 @@ *******************************************************************************/ #pragma once -#include #include #include +#include "miopen/../../kernels/stride_array.hpp" + +#include +#include +#include +#include +#include namespace miopen { @@ -54,5 +60,92 @@ bool IsOutputBfp16(const ProblemDescription&); bool IsOutputInt8(const ProblemDescription&); bool IsOutputInt32(const ProblemDescription&); +namespace conv_internal { + +void DebugPrintTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc); + +/** + * Get the index where group (G) stride should go. For NCHW, we want to convert + * its strides to NGCHW, and for NHWC, we want to convert its strides to NHWGC. + * Same applies for the 3D case. + */ +int GetGroupStrideIndex(const ProblemDescription& problem); + +/** + * split the strides for C dimension in a tensor descriptor into (G, C_per_group). + * Normally, (in packed case) num channels is a multiplying factor in the stride of + * whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a + * factor. We output NGCHW for NCHW (and NHWGC for NHWC) + * where the stride[G] = stride[N] / num_groups + */ +template +V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx) +{ + assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size()); + // (G_stride_idx - 1) is the stride index of whatever lies to the left and + // contains C or K as a multiplying factor. 
We divide this value by num_groups + // to get G_stride_val + assert(orig_strides[G_stride_idx - 1] % num_groups == 0); + + V ret{orig_strides}; + auto G_stride_val = orig_strides[G_stride_idx - 1] / num_groups; + + ret.insert(ret.begin() + G_stride_idx, G_stride_val); + + return ret; +} + +/** + * Weight tensor has original dims: [K, C_per_group, Y, X] (2D case) + * We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X] + * Stride for G is computed as stride[C_per_group] * K_per_group and inserted at + * left most position + */ +template +V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) +{ + V ret{wei_strides}; + ret.insert(ret.begin(), wei_strides[0] * k_per_group); + return ret; +} + +template +struct ChooseStride +{ +}; + +template <> +struct ChooseStride<5u> +{ + using type = Strides5D; +}; + +template <> +struct ChooseStride<6u> +{ + using type = Strides6D; +}; + +template +auto MakeStrideArray(V vec) +{ + typename ChooseStride::type ret; + assert(vec.size() == N); + + // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. + // We sort the input stride vector so that smallest stride is at index 0. This + // (little-endian) order is what naive convolution kernel expects for strides + std::sort(vec.begin(), vec.end()); + + for(unsigned i = 0; i < N; ++i) + { + ret[i] = static_cast(vec[i]); + } + return ret; +} +} // end namespace conv_internal + } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h index 3bac0a31f7..b14302e0c2 100644 --- a/src/kernels/gpu_reference_kernel/fp8_kern_types.h +++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h @@ -58,6 +58,6 @@ #define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE)) -#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX) -#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX) -#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX) +#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX) +#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX) +#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 24d7cd489e..b243b1234a 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -46,6 +46,8 @@ typedef float float_t; #endif #endif // __HIPCC_RTC__ +#include "stride_array.hpp" + // hcc seems need __device__ __host__ together to compile, and no extern "C" typedef union value_bf16_fp32_t { @@ -114,10 +116,27 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) return static_cast(val & 0xff); } -template +/// \todo remove template parameter 'bool ASSUME_PACKED' in a follow up PR +/// --amberhassaan +/// Notes (Amber): +/// - The following code used to assume that group (G) is an implicit +/// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not +/// true for non-packed case because group (G) dimension needs to have its stride +/// explicitly specified for address math to make sense. This is also how +/// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, +/// and nhwc should be nhwgc. Same follows for the 3D case. 
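+///
+/// - A worked example with assumed sizes: a packed NCHW input with N=2, C=8,
+///   H=4, W=4 and group=2 has descriptor (big-endian) strides {128, 16, 4, 1}.
+///   Splitting C into (G=2, c_per_group=4) yields the NGCHW view
+///   {128, 64, 16, 4, 1}, i.e. stride[G] = stride[N] / group. A hypothetical
+///   host-side sketch using the helpers added in conv_direct_naive_conv.hpp:
+///
+///     std::vector<std::size_t> nchw{128, 16, 4, 1};          // assumed N=2, C=8, H=4, W=4
+///     auto ngchw = conv_internal::SplitStrideCtoGC(2, nchw, 1); // {128, 64, 16, 4, 1}
+///     auto s5d   = conv_internal::MakeStrideArray<5>(ngchw);    // Strides5D {1, 4, 16, 64, 128}
+///
+///   MakeStrideArray sorts the strides so that the smallest one lands at
+///   index 0, which produces the little-endian order described in the next
+///   note.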
+/// +/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is reverse of how strides are +/// stored in tensor descriptors, which are big-endian. + +template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -148,18 +167,36 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(in) * k * ho * wo + - static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += + static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * ho * wo + + static_cast(ig) * k_per_group * ho * wo + + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(ig) * out_strides[3] + + static_cast(ik) * out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iho = tid / wo; int iwo = tid % wo; - double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -178,25 +215,58 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t f_idx = static_cast(ic) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iho) * wo + static_cast(iwo); - p_out[o_idx] = cast_to(value); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -227,19 +297,35 @@ inline __device__ void 
naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * hi * wi + - static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ic) * fy * fx; - p_out += - static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * hi * wi + + static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ic) * fy * fx; + + p_out += + static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3] + + static_cast(ic) * in_strides[2]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ic) * wei_strides[2]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ihi = tid / wi; int iwi = tid % wi; - double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) { @@ -264,26 +350,59 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(ik) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = static_cast(ik) * c_per_group * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(ihi) * in_strides[1] + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -315,18 +434,34 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * 
fx; + + p_out += + static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(ig) * out_strides[3] + static_cast(ik) * out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int ic = tid / (fx * fy); - double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -345,28 +480,64 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * hi * wi + - static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t o_idx = static_cast(in) * k * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * hi * wi + + static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * ho * wo + + static_cast(iho) * wo + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + - static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -405,21 +576,37 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; + + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; - for(int 
tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * out_strides[5] + + static_cast(ig) * out_strides[4] + + static_cast(ik) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid % wo; int iho = (tid / wo) % ho; int ido = tid / (ho * wo); - double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -444,30 +631,67 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(ic) * di * hi * wi + - static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t f_idx = static_cast(ic) * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + - static_cast(iwo); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -506,21 +730,37 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi + - static_cast(ic) * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ic) * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi + + static_cast(ic) * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ic) * fz * fy * fx; + + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4] + + static_cast(ic) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ic) * wei_strides[3]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[4]; + } - 
for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid % wi; int ihi = (tid / wi) % hi; int idi = tid / (hi * wi); - double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) { @@ -554,30 +794,67 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(ik) * do_ * ho * wo + - static_cast(cur_do) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * do_ * ho * wo + + static_cast(cur_do) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = + static_cast(ik) * c_per_group * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[3] + + static_cast(cur_do) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + - static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(idi) * in_strides[2] + + static_cast(ihi) * in_strides[1] + + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -615,20 +892,35 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; + + p_out += static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[4] + static_cast(ik) * out_strides[3]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int iz = (tid / (fx * fy)) % fz; int ic = tid / (fx * fy * fz); - double value = .0f; + acc_data_t value = 0; for(int 
in = 0; in < n; in++) { @@ -653,33 +945,73 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * di * hi * wi + - static_cast(ic) * di * hi * wi + - static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t o_idx = static_cast(in) * k * do_ * ho * wo + - static_cast(ido) * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * di * hi * wi + + static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * do_ * ho * wo + + static_cast(ido) * ho * wo + + static_cast(iho) * wo + + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(ic) * fz * fy * fx + static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } /***************************** nhwc *****************************/ // design block_size 256 -template +template inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -711,17 +1043,32 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, int in = (bid / ho) % n; int ig = bid / (n * ho); - p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group + - static_cast(iho) * wo * k; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; + + p_out += static_cast(in) * ho * wo * k + static_cast(iho) * wo * k + + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(ig) * out_strides[1]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid / k_per_group; int ik = tid % k_per_group; - double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -740,27 +1087,61 @@ inline 
__device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t f_idx = static_cast(ik) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iwo) * k + static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iwo) * k + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -792,17 +1173,32 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int in = (bid / hi) % n; int ig = bid / (n * hi); - p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; + + p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ihi) * in_strides[3] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid / c_per_group; int ic = tid % c_per_group; - double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -827,27 +1223,61 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + static_cast(ik); - size_t f_idx = static_cast(ik) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = static_cast(ik) * fy * fx * 
c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(iwi) * c + static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(iwi) * c + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(iwi) * in_strides[2] + static_cast(ic) * in_strides[0]; + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -879,18 +1309,33 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group + - static_cast(ik) * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group + + static_cast(ik) * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = tid / (c_per_group * fx); - double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -909,29 +1354,65 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t o_idx = static_cast(in) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + 
static_cast(ic); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -970,18 +1451,36 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, int in = (bid / do_) % n; int ig = bid / (n * do_); - p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ido) * ho * wo * k + - static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + // dim order NDHWGC + // replace C and K with G * C_per_G and G * K_per_G + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[1]; + + // Assumes that group G is the highest dimension in the layout + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ik = tid % k_per_group; int iwo = (tid / k_per_group) % wo; int iho = tid / (k_per_group * wo); - double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1005,30 +1504,69 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, { if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } 
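+                        // Note: 'value' above accumulates in acc_data_t
+                        // (double for the float/half/ushort instantiations,
+                        // int32_t for int8), so rounding to dst_data_t
+                        // happens only once, in the store that follows.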
} - size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + - static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template + +template inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1052,6 +1590,7 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int fx, int group) { + /* * need to compute total input pixel: `group * n * di * hi * wi * * c_per_group`. @@ -1067,18 +1606,34 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int in = (bid / di) % n; int ig = bid / (n * di); - p_in += static_cast(in) * di * hi * wi * c + static_cast(idi) * hi * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + + static_cast(idi) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; + + p_out += + static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(idi) * in_strides[4] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[1]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int iwi = (tid / c_per_group) % wi; int ihi = (tid / (c_per_group * wi)); - double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1111,32 +1666,69 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, { if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(cur_do) * ho * wo * k + - static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + - static_cast(ik); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_do) * ho * wo * k + + static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_do) * out_strides[4] + + static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + 
static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + - static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(ihi) * in_strides[3] + + static_cast(iwi) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1174,19 +1766,34 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + - static_cast(ik) * fz * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + + static_cast(ik) * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = (tid / (c_per_group * fx)) % fy; int iz = (tid / (c_per_group * fx * fy)); - double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -1211,374 +1818,340 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * di * hi * wi * c + - static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * do_ * ho * wo * k + - static_cast(ido) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * di * hi * wi * c + + static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t o_idx = static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - 
static_cast(ix) * c_per_group + static_cast(ic); - p_wei[f_idx] = cast_to(value); - } -} - -#define DEFINE_2D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } -#define DEFINE_2D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); -#define DEFINE_2D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; -#define DEFINE_3D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ 
- fz, \ - fy, \ - fx, \ - group); \ + p_wei[f_idx] = cast_to(value); + } } +} -#define DEFINE_3D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define DEFINE_2D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ } -#define DEFINE_3D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define 
DEFINE_3D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ } -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, float) - -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, 
int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, int8_t, int32_t, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, float) - -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, float) + +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, ushort, double, ushort) + +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, ushort, double, ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, float) + +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, ushort, double, 
ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort) + +/// \todo discuss whether we should split the kernels into separate files, or +/// figure out a mechanism to compile each kernel separately to reduce hipRTC +/// compilation times. --amberhassaan diff --git a/src/kernels/stride_array.hpp b/src/kernels/stride_array.hpp new file mode 100644 index 0000000000..32cb1f85b6 --- /dev/null +++ b/src/kernels/stride_array.hpp @@ -0,0 +1,86 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#ifdef __HIPCC_RTC__
+#ifndef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE
+#include <type_traits>
+#endif
+#endif // __HIPCC_RTC__
+
+/// \todo Uncomment when hip RTC accepts std::array -- amberhassaan
+// #include <array>
+// using StrideIndexType = int;
+// using Strides3D = std::array<StrideIndexType, 3>;
+// using Strides4D = std::array<StrideIndexType, 4>;
+// using Strides5D = std::array<StrideIndexType, 5>;
+// using Strides6D = std::array<StrideIndexType, 6>;
+template <typename T, unsigned N>
+class MyArray
+{
+    T data_[N] = {};
+
+public:
+    constexpr static const unsigned SIZE = N;
+    __host__ __device__ constexpr unsigned size() const { return N; }
+
+    __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; }
+
+    __host__ T& operator[](unsigned i) { return data_[i]; }
+
+    __host__ __device__ MyArray()                   = default;
+    __host__ __device__ MyArray(const MyArray&)     = default;
+    __host__ __device__ MyArray(MyArray&&) noexcept = default;
+    __host__ __device__ MyArray& operator=(const MyArray&) = default;
+    __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default;
+    __host__ __device__ ~MyArray()                              = default;
+};
+
+using StrideIndexType = size_t;
+using Strides5D       = MyArray<StrideIndexType, 5>;
+using Strides6D       = MyArray<StrideIndexType, 6>;
+
+template <typename StrideArray>
+__host__ __device__ void printStrideArray(const char* name, const StrideArray& sarr)
+{
+    printf("%s = [", name);
+    for(int i = 0; i < StrideArray::SIZE; ++i)
+    {
+        printf("%zu,", sarr[i]);
+    }
+    printf("]\n");
+}
+
+template <typename StrideArray>
+__host__ __device__ void printStrideArrays(const StrideArray& in_strides,
+                                           const StrideArray& wei_strides,
+                                           const StrideArray& out_strides)
+{
+    printStrideArray("in_strides", in_strides);
+    printStrideArray("wei_strides", wei_strides);
+    printStrideArray("out_strides", out_strides);
+}
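For orientation, a minimal host-side usage sketch of the helpers above (illustrative only, not part of the patch; the include path and stride values are invented, and it needs hipcc because of the __host__/__device__ attributes):

    #include <cstdio>
    #include "stride_array.hpp" // assumed include path

    int main()
    {
        Strides5D s; // MyArray<size_t, 5>; the data_[N] = {} member zero-initializes
        const size_t vals[5] = {1, 4, 16, 64, 256}; // made-up stride values
        for(unsigned i = 0; i < s.size(); ++i)
            s[i] = vals[i]; // the non-const operator[] is host-only, which is fine here
        printStrideArray("s", s); // prints: s = [1,4,16,64,256,]
        return 0;
    }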
diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp
index 5c468768fa..86a8a4161e 100644
--- a/src/solver/conv_direct_naive_conv.cpp
+++ b/src/solver/conv_direct_naive_conv.cpp
@@ -24,6 +24,7 @@
  *
  *******************************************************************************/
 
+#include "miopen/env.hpp"
 #include
 #include
 #include
@@ -105,10 +106,20 @@ bool IsOutputInt32(const ProblemDescription& problem)
            problem.GetOutDataType() == miopenInt32;
 }
 
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS);
+
 std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem)
 {
     std::ostringstream kernel_name;
-    kernel_name << "naive_conv_";
+    if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS()))
+    {
+        kernel_name << "naive_conv_packed_";
+    }
+    else
+    {
+        kernel_name << "naive_conv_nonpacked_";
+    }
+
     if(problem.direction.IsForward())
         kernel_name << "fwd_";
     else if(problem.direction.IsBackwardData())
@@ -244,5 +255,49 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx,
     return true;
 }
 
+/// Figure out the index of the C (channel) stride so we can expand it into
+/// (G, C_per_group). The return value G_stride_idx is the position of the G
+/// stride in the stride vector, such that (G_stride_idx - 1) is the index
+/// whose stride contains C as a multiplying factor.
+int conv_internal::GetGroupStrideIndex(const ProblemDescription& problem)
+{
+    int G_stride_idx = -1;
+    if(problem.IsLayoutDefault())
+    {
+        G_stride_idx = 1;
+    }
+    else
+    {
+        assert(problem.IsLayoutNHWC());
+        assert(problem.Is2d() || problem.Is3d());
+        //
+        // G_stride_idx = problem.Is2d() ? 3 : 4; // (wrong, see below)
+        // For NHWC, MIOpen stores strides in NCHW order, so we are interested
+        // in the index one past W's stride; that is the value of G_stride_idx.
+        G_stride_idx = problem.Is2d() ? 4 : 5;
+    }
+    assert(G_stride_idx != -1);
+    return G_stride_idx;
+}
+
+void conv_internal::DebugPrintTensorStrides(const TensorDescriptor& inDesc,
+                                            const TensorDescriptor& wDesc,
+                                            const TensorDescriptor& outDesc)
+{
+    auto printOneStrideVec = [](const char* name, const auto& vec) {
+        MIOPEN_LOG_I(name << " = [");
+        for(const size_t v : vec)
+        {
+            MIOPEN_LOG_I(v << ",");
+        }
+        MIOPEN_LOG_I("]\n");
+    };
+
+    // The lambda already appends " = [", so pass the bare descriptor names.
+    printOneStrideVec("inDesc", inDesc.GetStrides());
+    printOneStrideVec("wDesc", wDesc.GetStrides());
+    printOneStrideVec("outDesc", outDesc.GetStrides());
+}
+
 } // namespace solver
 } // namespace miopen

diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp
index f8af0ec2d1..1a28f8aae6 100644
--- a/src/solver/conv_direct_naive_conv_bwd.cpp
+++ b/src/solver/conv_direct_naive_conv_bwd.cpp
@@ -142,14 +142,27 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
     }();
     kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem);
 
+    int G_stride_idx = conv_internal::GetGroupStrideIndex(problem);
+
+    if(problem.Is2d())
+    {
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         const auto kern = kernels[0];
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
             decltype(auto) data_ctx = primitive_parameters.CastTo<conv::DataInvokeParams>();
             const auto& tensors     = data_ctx.tensors;
             float elapsed           = 0;
+            auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC(
+                group, tensors.inDesc.GetStrides(), G_stride_idx));
+            // For weights, we split K to (G, K_per_group), which is always index 0
+            auto wei_strides = conv_internal::MakeStrideArray<5>(
+                conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides()));
+            auto out_strides =
+                conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC(
+                    group, tensors.outDesc.GetStrides(), G_stride_idx));
+            /// \ref backward_tensors_reversed_why
             if(is_f8)
+            {
                 handle.Run(kern)(tensors.out,
                                  tensors.w,
                                  tensors.in,
@@ -172,10 +185,15 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
                                  problem.GetConv().attribute.fp8rounding_mode.Get() ==
                                      miopenF8RoundingModeStochastic,
                                  problem.GetConv().attribute.fp8rounding_mode.GetSeed());
+            }
             else
+            {
                 handle.Run(kern)(tensors.out,
                                  tensors.w,
                                  tensors.in,
+                                 out_strides,
+                                 wei_strides,
+                                 in_strides,
                                  hi,
                                  wi,
                                  n,
@@ -192,6 +210,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
                                  fy,
                                  fx,
                                  group);
+            }
 
             if(handle.IsProfilingEnabled())
                 elapsed += handle.GetKernelTime();
@@ -202,7 +221,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
             }
         };
     };
+    }
     else
+    {
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         const auto kern = kernels[0];
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
@@ -210,9 +231,26 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
             const auto& tensors = data_ctx.tensors;
             float elapsed       = 0;
 
+            auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC(
+                group, tensors.inDesc.GetStrides(), G_stride_idx));
+            // For weights, we split K to (G, K_per_group), which is always index 0
+            auto wei_strides = conv_internal::MakeStrideArray<6>(
+                conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides()));
+            auto out_strides
= + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); + + /// \anchor backward_tensors_reversed_why + /// \todo Someone made the silly decision of swapping in and + /// out pointers in ConvTensors for backward pass, so now I have to + /// pass out in place of in, out_strides in place of in_strides and + /// vice-versa --amberhassaan handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, di, hi, wi, @@ -245,6 +283,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 90d8feee31..a4656d929a 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -27,7 +27,6 @@ #include #include #include -#include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD) @@ -142,13 +141,26 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); + if(is_f8) { handle.Run(kern)(tensors.in, @@ -179,6 +191,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -206,7 +221,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -214,9 +231,20 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -249,6 +277,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; 
} diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 6fcf2f71d0..dfe1c342b0 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -129,14 +129,28 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, return false; }(); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); + if(is_f8) + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, @@ -159,10 +173,15 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, problem.GetConv().attribute.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic, problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } else + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -179,6 +198,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, fy, fx, group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); @@ -189,7 +209,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -197,9 +219,21 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); + handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -232,6 +266,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index c3b26a80a9..aa3dda788d 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -95,7 +95,8 @@ struct gpu_reference_kernel_base static std::vector get_image_size() { return {9, 14}; } - static std::vector get_channel_size() { return {3, 8}; } + // Warning: Channel size must be multiple of group size + static std::vector get_channel_size() { return {4, 8}; } static std::vector 
get_filter_depth() { return {1, 3}; } diff --git a/test/gtest/conv3d_test_case.hpp b/test/gtest/conv3d_test_case.hpp new file mode 100644 index 0000000000..242615077f --- /dev/null +++ b/test/gtest/conv3d_test_case.hpp @@ -0,0 +1,112 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include "get_handle.hpp" +#include + +#include "../driver/tensor_driver.hpp" +#include "conv_common.hpp" + +template +miopenDataType_t GetDataType(); + +template <> +miopenDataType_t GetDataType() +{ + return miopenFloat; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenHalf; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenInt8; +} + +struct Conv3DTestCase +{ + size_t G; + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + size_t k; + size_t z; + size_t y; + size_t x; + size_t pad_x; + size_t pad_y; + size_t pad_z; + size_t stride_x; + size_t stride_y; + size_t stride_z; + size_t dilation_x; + size_t dilation_y; + size_t dilation_z; + miopenConvolutionMode_t conv_mode; + friend std::ostream& operator<<(std::ostream& os, const Conv3DTestCase& tc) + { + return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D + << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z + << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z + << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z + << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x + << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y + << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; + } + + std::vector GetInput() { return {N, C, D, H, W}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {k, C / G, z, y, x}; + } + + miopen::ConvolutionDescriptor GetConv() + { + return miopen::ConvolutionDescriptor{ + 3, + miopenConvolution, + miopenPaddingDefault, + {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, + {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, + {static_cast(dilation_z), + static_cast(dilation_y), + static_cast(dilation_x)}, + {0, 0, 0}, + static_cast(G), + 1.0}; + } +}; diff --git a/test/gtest/group_conv3d_bwd.cpp b/test/gtest/group_conv3d_bwd.cpp index 
e53a690021..a9bffceff1 100644 --- a/test/gtest/group_conv3d_bwd.cpp +++ b/test/gtest/group_conv3d_bwd.cpp @@ -44,7 +44,7 @@ void SolverBwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_bwd.hpp b/test/gtest/group_conv3d_bwd.hpp index 410d71e6d0..71702c5808 100644 --- a/test/gtest/group_conv3d_bwd.hpp +++ b/test/gtest/group_conv3d_bwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -133,7 +53,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvBwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -188,7 +108,7 @@ struct ConvBwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_fwd.cpp b/test/gtest/group_conv3d_fwd.cpp index 2b52a1b43a..18d54355e8 100644 --- 
a/test/gtest/group_conv3d_fwd.cpp +++ b/test/gtest/group_conv3d_fwd.cpp @@ -44,7 +44,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, Data_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_fwd.hpp b/test/gtest/group_conv3d_fwd.hpp index 983f897d78..c8767399a7 100644 --- a/test/gtest/group_conv3d_fwd.hpp +++ b/test/gtest/group_conv3d_fwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -139,7 +59,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvFwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -195,7 +115,7 @@ struct ConvFwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_wrw.cpp b/test/gtest/group_conv3d_wrw.cpp index 13e88da5ad..977a06220a 100644 --- a/test/gtest/group_conv3d_wrw.cpp +++ 
b/test/gtest/group_conv3d_wrw.cpp @@ -44,7 +44,7 @@ void SolverWrw(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, // dy const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { diff --git a/test/gtest/group_conv3d_wrw.hpp b/test/gtest/group_conv3d_wrw.hpp index 76d8ae5d90..bf5824b4fa 100644 --- a/test/gtest/group_conv3d_wrw.hpp +++ b/test/gtest/group_conv3d_wrw.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -135,7 +55,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvWrwSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -191,7 +111,7 @@ struct ConvWrwSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_solver.hpp b/test/gtest/group_solver.hpp index 6fe02e00da..3d9ebddca3 100644 --- a/test/gtest/group_solver.hpp +++ b/test/gtest/group_solver.hpp @@ -80,7 +80,11 @@ struct ConvTestCase } std::vector GetInput() { 
return {N, C, D, H, W}; }
-    std::vector<size_t> GetWeights() { return {k, C, y, x}; }
+    std::vector<size_t> GetWeights()
+    {
+        EXPECT_EQUAL(C % G, 0);
+        return {k, C / G, y, x};
+    }
 
     miopen::ConvolutionDescriptor GetConv()
     {
         return

From 9c713bbc633a1d80f59e6475a2f21a6279c55367 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:45:19 -0700
Subject: [PATCH 25/36] [Doc] Bump rocm-docs-core from 0.24.2 to 0.25.0 in
 /docs/.sphinx (#2434)

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.24.2 to 0.25.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.24.2...v0.25.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/.sphinx/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt
index 6f10fcce12..f358d19826 100644
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -92,7 +92,7 @@ requests==2.31.0
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.24.2
+rocm-docs-core==0.25.0
     # via -r requirements.in
 smmap==5.0.0
     # via gitdb

From 05970fab9f9f6df456723ad6e1d6e33489eae234 Mon Sep 17 00:00:00 2001
From: xinlipn
Date: Sat, 7 Oct 2023 21:11:02 -0700
Subject: [PATCH 26/36] Fix weight tensor initialization to replace old PR1950
 (#2436)

---
 test/gtest/solver_f8.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/gtest/solver_f8.hpp b/test/gtest/solver_f8.hpp
index 34a10a4cfe..bf77347ef8 100644
--- a/test/gtest/solver_f8.hpp
+++ b/test/gtest/solver_f8.hpp
@@ -160,7 +160,7 @@ struct ConvFwdSolverTest
         test_skipped = false;
         std::tie(algo, conv_config) = GetParam();
         input = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W};
-        weights = tensor{conv_config.k, conv_config.C, conv_config.x, conv_config.y};
+        weights = tensor{conv_config.k, conv_config.C, conv_config.y, conv_config.x};
         auto gen_fp8_value = [=](auto...) {
             const auto tmp = float8(scalar_gen_random_float{-0.5, 0.5}());

From 6f5cb6851235f1943d6ec44768f5b185e2bf4e56 Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Mon, 9 Oct 2023 18:21:40 -0500
Subject: [PATCH 27/36] Add typecast to config key (#2413)

* add typecast value to config key, as optional arg to fdb_key
* fix clang-format issue
* Save space in db key and optimize code. Do not print casting value when
  casting is not actually necessary.
Co-authored-by: Artem Tamazov
* do not print casting to confkey when unnecessary, code cleanup, datatype rename
* move GetDataTypeName to problem_description_base.hpp, organize includes
* fix missing header

---------

Co-authored-by: Jun Liu
Co-authored-by: Artem Tamazov
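In effect, the conf key gains at most one optional segment (a sketch of the BuildConfKey hunk below; in_cast, wei_cast, out_cast and ss are stand-in names, not the real members):

    std::ostringstream optional;
    if(in_cast)  optional << "ci" << GetDataTypeName(*in_cast);  // e.g. "ciFP8"
    if(wei_cast) optional << "cw" << GetDataTypeName(*wei_cast); // e.g. "cwFP8"
    if(out_cast) optional << "co" << GetDataTypeName(*out_cast);
    if(!optional.str().empty())
        ss << 'x' << optional.str(); // keys without cast types are unchanged

so existing keys stay valid, and a key only grows when tensor casting is actually in play.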
---
 src/activ/problem_description.cpp             |  2 ++
 src/conv/problem_description.cpp              | 24 +++++++++++++--
 .../miopen/activ/problem_description.hpp      |  2 +-
 .../miopen/conv/problem_description.hpp       | 29 ++++---------------
 .../miopen/pooling/problem_description.hpp    |  1 +
 .../miopen/problem_description_base.hpp       | 20 +++++++++++++
 src/pooling/problem_description.cpp           |  2 ++
 7 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/src/activ/problem_description.cpp b/src/activ/problem_description.cpp
index b1e21a76e8..9bc484259d 100644
--- a/src/activ/problem_description.cpp
+++ b/src/activ/problem_description.cpp
@@ -68,6 +68,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
     ss << ((packed) ? "11" : "10"); // + lite bit
     ss << xDesc.GetType();
+    if(const auto ct = xDesc.GetCastType())
+        ss << GetDataTypeName(*ct);
     ss << activDesc.GetMode();
     ss << read_unit;
     ss << MAP_RD;

diff --git a/src/conv/problem_description.cpp b/src/conv/problem_description.cpp
index 76c47cbcd9..40fdd2f4c9 100644
--- a/src/conv/problem_description.cpp
+++ b/src/conv/problem_description.cpp
@@ -127,6 +127,19 @@ void ProblemDescription::BuildConfKey(std::string& conf_key) const
         ss << 'x' << GetOutLayout();
     }
     ss << 'x' << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+
+    std::ostringstream optional;
+    if(const auto ct = GetInCastType())
+        optional << "ci" << GetDataTypeName(*ct);
+    if(const auto ct = GetWeightsCastType())
+        optional << "cw" << GetDataTypeName(*ct);
+    if(const auto ct = GetOutCastType())
+        optional << "co" << GetDataTypeName(*ct);
+    if(!optional.str().empty())
+    {
+        ss << 'x' << optional.str();
+    }
+
     ss << 'x' << PrintDHW('x', GetSpatialDims(), GetPadD(), GetPadH(), GetPadW());
     ss << 'x'
        << PrintDHW(
@@ -175,11 +188,18 @@ void ProblemDescription::Serialize(std::ostream& stream) const
     {
         // Group count > 1 identifies Group/Depthwise modes.
         if(GetGroupCount() != 1)
-            optional << 'g' << GetGroupCount();
+            optional << "_g" << GetGroupCount();
+
+        if(const auto ct = GetInCastType())
+            optional << "_ci" << GetDataTypeName(*ct);
+        if(const auto ct = GetWeightsCastType())
+            optional << "_cw" << GetDataTypeName(*ct);
+        if(const auto ct = GetOutCastType())
+            optional << "_co" << GetDataTypeName(*ct);
     }
     if(!optional.str().empty())
     {
-        stream << '_' << optional.str();
+        stream << optional.str();
     }
 }

diff --git a/src/include/miopen/activ/problem_description.hpp b/src/include/miopen/activ/problem_description.hpp
index a22a1ba0d8..d5c09850c3 100644
--- a/src/include/miopen/activ/problem_description.hpp
+++ b/src/include/miopen/activ/problem_description.hpp
@@ -27,8 +27,8 @@
 #pragma once
 
 #include
-#include
 #include
+#include
 
 #include

diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index 438e1d306a..d6b735291e 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -26,16 +26,17 @@
 #pragma once
 
+#include
 #include
-#include
 #include
+
+#include
+#include
+#include
+
 #if MIOPEN_ENABLE_SQLITE
 #include
 #endif
-#include
-#include
-
-#include
 
 namespace miopen {
 
@@ -44,24 +45,6 @@ struct ExecutionContext;
 std::string EncodeDataTypesForKey(miopenDataType_t in, miopenDataType_t weights, miopenDataType_t out);
 
-inline std::string GetDataTypeName(miopenDataType_t data_type)
-{
-    switch(data_type)
-    {
-    case miopenFloat: return "FP32";
-    case miopenHalf: return "FP16";
-    case miopenInt8: return "INT8";
-    case miopenInt8x4: return "INT8x4";
-    case miopenInt32: return "INT32";
-    case miopenBFloat16: return "BF16";
-    case miopenDouble: return "FP64";
-    case miopenFloat8: return "FP8";
-    case miopenBFloat8: return "BFP8";
-    }
-
-    return "Unknown(" + std::to_string(data_type) + ")";
-}
-
 template <class T>
 constexpr auto GetDHW(unsigned spatial_dims, const std::vector<T>& data)
 {

diff --git a/src/include/miopen/pooling/problem_description.hpp b/src/include/miopen/pooling/problem_description.hpp
index 962c73c7dc..dc3e1c25b6 100644
--- a/src/include/miopen/pooling/problem_description.hpp
+++ b/src/include/miopen/pooling/problem_description.hpp
@@ -26,6 +26,7 @@
 #pragma once
 
+#include
 #include
 #include

diff --git a/src/include/miopen/problem_description_base.hpp b/src/include/miopen/problem_description_base.hpp
index 3ca8a178b1..a3d51f120e 100644
--- a/src/include/miopen/problem_description_base.hpp
+++ b/src/include/miopen/problem_description_base.hpp
@@ -26,8 +26,28 @@
 #pragma once
 
+#include
+
 namespace miopen {
 
+inline std::string GetDataTypeName(miopenDataType_t data_type)
+{
+    switch(data_type)
+    {
+    case miopenFloat: return "FP32";
+    case miopenHalf: return "FP16";
+    case miopenInt8: return "INT8";
+    case miopenInt8x4: return "INT8x4";
+    case miopenInt32: return "INT32";
+    case miopenBFloat16: return "BF16";
+    case miopenDouble: return "FP64";
+    case miopenFloat8: return "FP8";
+    case miopenBFloat8: return "BF8";
+    }
+
+    return "Unknown(" + std::to_string(data_type) + ")";
+}
+
 struct ProblemDescriptionBase
 {
 };

diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp
index 0ee5469684..8e171a4ac0 100644
--- a/src/pooling/problem_description.cpp
+++ b/src/pooling/problem_description.cpp
@@ -61,6 +61,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
     ss << "m" + std::to_string(pooling_method);
     ss << "_dt" << xDesc.GetType();
+    if(const auto ct = xDesc.GetCastType())
+        ss << "_dct" << GetDataTypeName(*ct);
     ss << "_ker" << get_vect_config(pooling.lens);
     ss << "_str" << get_vect_config(pooling.strides);
     ss << "_pad" << get_vect_config(pooling.pads);

From 8c4239d4b06361729ba9359d2095cc639841ee9c Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Mon, 9 Oct 2023 18:22:16 -0500
Subject: [PATCH 28/36] [Bugfix] Add cast swapping for swapped gemm inputs. (#2443)

* add swapping for cast types when swapping A+B for gemm
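The invariant being restored is easy to state in isolation: when a row-major GEMM is re-expressed as a column-major GEMM by exchanging A and B, every per-matrix attribute must travel with its matrix. A minimal sketch (the Desc struct and SwapAB helper here are hypothetical stand-ins for the GemmDescriptor logic in the hunks below):

    #include <utility>

    struct Desc // hypothetical, for illustration only
    {
        bool isColMajor, transA, transB;
        int m, n, lda, ldb;
        int a_cast_type, b_cast_type; // per-matrix cast attributes
    };

    void SwapAB(Desc& d, const void*& A, const void*& B)
    {
        d.isColMajor = !d.isColMajor;
        std::swap(A, B);
        std::swap(d.transA, d.transB);
        std::swap(d.m, d.n);
        std::swap(d.lda, d.ldb);
        std::swap(d.a_cast_type, d.b_cast_type); // previously missing; the fix below
    }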
---
 src/gemm_v2.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index 19e302f166..1750f625da 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -413,6 +413,7 @@ miopenStatus_t CallGemm(const Handle& handle,
         gemm_desc.isColMajor = !gemm_desc.isColMajor;
         std::swap(A, B);
         std::swap(a_offset, b_offset);
+        std::swap(gemm_desc.a_cast_type, gemm_desc.b_cast_type);
         std::swap(gemm_desc.transA, gemm_desc.transB);
         std::swap(gemm_desc.m, gemm_desc.n);
         std::swap(gemm_desc.lda, gemm_desc.ldb);
@@ -665,6 +666,7 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
         gemm_desc.isColMajor = !gemm_desc.isColMajor;
         std::swap(A, B);
         std::swap(a_offset, b_offset);
+        std::swap(gemm_desc.a_cast_type, gemm_desc.b_cast_type);
         std::swap(gemm_desc.transA, gemm_desc.transB);
         std::swap(gemm_desc.m, gemm_desc.n);
         std::swap(gemm_desc.lda, gemm_desc.ldb);
@@ -938,6 +940,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
         gemm_desc.isColMajor = !gemm_desc.isColMajor;
         std::swap(A, B);
         std::swap(a_offset, b_offset);
+        std::swap(gemm_desc.a_cast_type, gemm_desc.b_cast_type);
         std::swap(gemm_desc.transA, gemm_desc.transB);
         std::swap(gemm_desc.m, gemm_desc.n);
         std::swap(gemm_desc.lda, gemm_desc.ldb);

From 1d6db535f44c33624417fef65ea3718d25ae4c48 Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Wed, 11 Oct 2023 01:10:44 -0500
Subject: [PATCH 29/36] [Bugfix] Kernel name fix, compilation err fix (#2446)

---
 src/kernels/gpu_reference_kernel/fp8_kern_types.h   | 6 +++---
 src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h
index b14302e0c2..d714a0514c 100644
--- a/src/kernels/gpu_reference_kernel/fp8_kern_types.h
+++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h
@@ -58,6 +58,6 @@
 
 #define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE))
 
-#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX)
-#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX)
-#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX)
+#define FWD_KERNEL_NAME CAT(naive_conv_nonpacked_fwd_nchw_, KERNEL_NAME_SUFFIX)
+#define BWD_KERNEL_NAME CAT(naive_conv_nonpacked_bwd_nchw_, KERNEL_NAME_SUFFIX)
+#define WRW_KERNEL_NAME CAT(naive_conv_nonpacked_wrw_nchw_, KERNEL_NAME_SUFFIX)

diff --git a/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp
index e6b2945beb..3b4eabecfb 100644
--- a/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp
+++ b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp
@@ -26,7 +26,7 @@
 #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include
 #include
-#include
+// #include
 #include
 #endif

From b20e20f72a15b6f16efbd4699726fbed212888e6 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 10 Oct 2023 23:13:05 -0700
Subject: [PATCH 30/36] Bump gitpython from 3.1.35 to 3.1.37 in
/docs/.sphinx (#2445) Bumps [gitpython](https://github.com/gitpython-developers/GitPython) from 3.1.35 to 3.1.37. - [Release notes](https://github.com/gitpython-developers/GitPython/releases) - [Changelog](https://github.com/gitpython-developers/GitPython/blob/main/CHANGES) - [Commits](https://github.com/gitpython-developers/GitPython/compare/3.1.35...3.1.37) --- updated-dependencies: - dependency-name: gitpython dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index f358d19826..eb52b0503e 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -40,7 +40,7 @@ fastjsonschema==2.16.3 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests From b438fd95f0f978a6b9aca551dfc8cf974b19d3b4 Mon Sep 17 00:00:00 2001 From: Chris Erb Date: Wed, 11 Oct 2023 01:20:17 -0500 Subject: [PATCH 31/36] Add MIOPEN_BETA_API defines around f8 (#2430) --------- Co-authored-by: JD --- CMakeLists.txt | 3 +++ driver/layernorm_driver.hpp | 6 +++--- fin | 2 +- include/miopen/miopen.h | 17 +++++++++++++++-- src/include/miopen/layernorm.hpp | 4 ++-- src/layer_norm.cpp | 4 ++-- src/solver/batchnorm/backward_ck.cpp | 6 +++--- 7 files changed, 29 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ac8484172b..aebd984989 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,9 @@ include(ROCMCreatePackage) include(CheckCXXCompilerFlag) include(ROCMHeaderWrapper) +# Build library with Beta APIs +add_definitions("-DMIOPEN_BETA_API=1") + set(MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK On CACHE BOOL "Enable AI-based fallback for Immediate Mode") set(MIOPEN_ENABLE_AI_KERNEL_TUNING On CACHE BOOL "Enable AI heuristic for kernel tuning") set(MIOPEN_ENABLE_SQLITE On CACHE BOOL "") diff --git a/driver/layernorm_driver.hpp b/driver/layernorm_driver.hpp index 8251472625..e43f3f2f37 100644 --- a/driver/layernorm_driver.hpp +++ b/driver/layernorm_driver.hpp @@ -255,19 +255,19 @@ int LayerNormDriver::AllocateBuffersAndCopy() for(int i = 0; i < in_sz; i++) { - in[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); } status = in_dev->ToGPU(q, in.data()); for(int i = 0; i < weight_sz; i++) { - weight[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + weight[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); } status = weight_dev->ToGPU(q, weight.data()); for(int i = 0; i < bias_sz; i++) { - bias[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + bias[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); } status = bias_dev->ToGPU(q, bias.data()); diff --git a/fin b/fin index b2f3f4db3c..26b5c32864 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit b2f3f4db3c3d7dd757e6d9e68719a780d8114dfa +Subproject commit 26b5c328642a6af5041539ceae36b9340829384b diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a59181acf3..abbec2599c 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -112,11 +112,13 @@ typedef enum miopenStatusVersionMismatch = 10, /*!< Version mismatch of the supplied binary data argment. 
*/ } miopenStatus_t; +#ifdef MIOPEN_BETA_API typedef enum { miopenF8RoundingModeStandard = 0, miopenF8RoundingModeStochastic = 1, } miopenF8RoundingMode_t; +#endif /*! @brief Get character string for an error code. * @@ -354,9 +356,14 @@ typedef enum 4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */ miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction) (Partially supported) */ - miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ + miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ +#ifdef MIOPEN_BETA_API miopenFloat8 = 7, - miopenBFloat8 = 8 + miopenBFloat8 = 8, +#else +// miopenReserved1 = 7, +// miopenReserved2 = 8, +#endif } miopenDataType_t; /*! @ingroup tensor @@ -601,11 +608,15 @@ typedef enum MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC = 1, /*!< Restrict MIOpen convolutions to kernels which produce numerically deterministic results. 0 - disabled (default), 1 - enabled >*/ +#ifdef MIOPEN_BETA_API MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE = 2, /*!*/ +#else +// miopenReserved1 = 2, +#endif } miopenConvolutionAttrib_t; /** @addtogroup tensor @@ -723,6 +734,7 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t const int* dimsA, const int* stridesA); +#ifdef MIOPEN_BETA_API /*! @brief Set the tensor cast type * * For tensors where the cast_type attribute is set, the tensor elements would be converted to the @@ -734,6 +746,7 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t */ MIOPEN_EXPORT miopenStatus_t miopenSetTensorCastType(miopenTensorDescriptor_t tensorDesc, miopenDataType_t cast_type); +#endif /*! @brief Set shape of N-dimensional tensor * diff --git a/src/include/miopen/layernorm.hpp b/src/include/miopen/layernorm.hpp index 8ec2d96055..f897e79eea 100644 --- a/src/include/miopen/layernorm.hpp +++ b/src/include/miopen/layernorm.hpp @@ -49,8 +49,8 @@ miopenStatus_t LayerNormForward(const Handle& handle, const TensorDescriptor& rstdDesc, Data_t rstd, miopenLayerNormMode_t mode, - const float epsilon, - const int32_t normalized_dim); + float epsilon, + int32_t normalized_dim); } // namespace miopen #endif // _MIOPEN_LAYERNORM_HPP_ diff --git a/src/layer_norm.cpp b/src/layer_norm.cpp index 3d52bc771f..33030887ee 100644 --- a/src/layer_norm.cpp +++ b/src/layer_norm.cpp @@ -48,8 +48,8 @@ miopenStatus_t LayerNormForward(const Handle& handle, const TensorDescriptor& rstdDesc, Data_t rstd, miopenLayerNormMode_t mode, - const float epsilon, - const int32_t normalized_dim) + float epsilon, + int32_t normalized_dim) { if(x == nullptr || y == nullptr) { diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index fba8724990..6fecc24049 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -156,11 +156,11 @@ static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& pr #endif -bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, +bool BnCKBwdBackward::IsApplicable(const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = ctx; + std::ignore = context; std::ignore = fdesc_problem; return false; #else @@ -168,7 +168,7 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, return false; if(!bn_problem.IsLayoutNHWC()) return false; - if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) + 
if(!ck_utility::is_ck_supported_hardware(context.GetStream())) return false; if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) return false; From b45e54d6c6737376d2820cce093dda41244b9380 Mon Sep 17 00:00:00 2001 From: Artem Tamazov Date: Wed, 11 Oct 2023 09:56:45 +0300 Subject: [PATCH 32/36] Remove INT8x4 support (#2441) --- docs/datatypes.md | 3 +- include/miopen/miopen.h | 11 ++-- src/check_numerics.cpp | 2 +- src/convolution.cpp | 2 +- src/gemm_v2.cpp | 52 ++++++------------- src/hip/batched_transpose_sol.cpp | 6 ++- src/include/miopen/datatype.hpp | 12 ++--- src/include/miopen/tensor.hpp | 4 +- src/include/miopen/visit_float.hpp | 2 +- src/kernels/MIOpenIm2d2Col.cl | 6 --- src/kernels/MIOpenIm3d2Col.cl | 6 --- .../MIOpenSubTensorOpWithScalarKernel.cl | 6 +-- .../MIOpenSubTensorOpWithSubTensorKernel.cl | 6 +-- .../MIOpenSubTensorOpWithTransformKernel.cl | 6 +-- src/kernels/MIOpenUtilKernels4.cl | 6 --- src/ocl/convolutionocl.cpp | 8 +-- src/ocl/tensorocl.cpp | 42 ++++++--------- src/ocl/utilocl.cpp | 33 +++--------- src/pooling_api.cpp | 2 +- src/reducetensor.cpp | 7 ++- src/reducetensor_api.cpp | 2 +- src/solver/batchnorm/forward_inference_ck.cpp | 4 +- .../conv_ck_igemm_fwd_bias_activ_fused.cpp | 8 +-- src/solver/conv_direct_naive_conv.cpp | 2 +- ...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 8 +-- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 8 +-- ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 8 +-- ...conv_hip_implicit_gemm_bwd_data_xdlops.cpp | 8 +-- .../conv_hip_implicit_gemm_fwd_xdlops.cpp | 8 +-- ...v_hip_implicit_gemm_grouped_fwd_xdlops.cpp | 8 +-- src/solver/gemm.cpp | 50 ++++++------------ src/solver/mlir_common.cpp | 2 +- src/tensor.cpp | 2 +- test/conv_common.hpp | 33 +++++------- test/driver.hpp | 4 +- test/gpu_reference_kernel.cpp | 2 - test/gtest/conv_embed_db.cpp | 4 +- test/gtest/conv_hip_igemm_xdlops.cpp | 4 +- test/gtest/conv_igemm_dynamic.cpp | 4 +- test/gtest/conv_igemm_mlir.cpp | 4 +- test/gtest/conv_igemm_mlir_xdlops.cpp | 4 +- test/gtest/conv_trans.cpp | 4 +- test/gtest/db_sync.cpp | 2 +- test/tensor_holder.hpp | 7 +-- test/tensor_transform.cpp | 2 +- 45 files changed, 151 insertions(+), 263 deletions(-) diff --git a/docs/datatypes.md b/docs/datatypes.md index 1a46e5fd62..f5d8f28931 100644 --- a/docs/datatypes.md +++ b/docs/datatypes.md @@ -10,7 +10,7 @@ typedef enum { miopenFloat = 1, miopenInt32 = 2, miopenInt8 = 3, - miopenInt8x4 = 4, + /* Value 4 is reserved. */ miopenBFloat16 = 5, } miopenDataType_t; ``` @@ -22,7 +22,6 @@ Type descriptions: * `miopenFloat` - 32-bit floating point * `miopenInt32` - 32-bit integer, used primarily for `int8` convolution outputs * `miopenInt8` - 8-bit integer, currently only supported by `int8` convolution forward path, tensor set, tensor copy, tensor cast, tensor transform, tensor transpose, and im2col. - * `miopenInt8x4` - 8-bit 4 element vector type used primarily with `int8` convolutions forward path. * `miopenBFloat16` - brain float fp-16 (8-bit exponent, 7-bit fraction), currently only supported by convolutions, tensor set, and tensor copy. 
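The datatypes.md hunk above summarizes the user-visible effect of this patch: `miopenInt8` is now the only 8-bit integer descriptor type, and enum value 4 is reserved. As a minimal migration sketch (illustrative only, not part of the patch itself; it assumes the long-standing `miopenCreateTensorDescriptor` / `miopenSet4dTensorDescriptor` / `miopenDestroyTensorDescriptor` entry points of miopen.h, and the NCHW shape is made up):

```c
#include <miopen/miopen.h>

/* Sketch: code that previously described vectorized int8 data with
 * miopenInt8x4 now describes plain int8 data with miopenInt8.
 * Status checks are omitted for brevity. */
void describe_int8_input(void)
{
    miopenTensorDescriptor_t xDesc;
    miopenCreateTensorDescriptor(&xDesc);
    /* N=16, C=64, H=28, W=28 -- illustrative shape */
    miopenSet4dTensorDescriptor(xDesc, miopenInt8, 16, 64, 28, 28);
    /* ... use xDesc on the int8 convolution forward path ... */
    miopenDestroyTensorDescriptor(xDesc);
}
```

Note that the int8 GEMM path keeps its `assert(gemm_desc.k % 4 == 0)` check (see the gemm_v2.cpp hunks below), so that alignment requirement survives even though the vectorized type is gone.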
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index abbec2599c..0c55b3becd 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -348,12 +348,11 @@ MIOPEN_DECLARE_OBJECT(miopenReduceTensorDescriptor); */ typedef enum { - miopenHalf = 0, /*!< 16-bit floating point (Fully supported) */ - miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */ - miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */ - miopenInt8 = 3, /*!< 8-bit int point (Partially supported) */ - miopenInt8x4 = - 4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */ + miopenHalf = 0, /*!< 16-bit floating point (Fully supported) */ + miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */ + miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */ + miopenInt8 = 3, /*!< 8-bit int point (Partially supported) */ + miopenInt8x4 = 4, /*!< Pack of four Int8 in NCHW_VECT_C format (Support discontinued) */ miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction) (Partially supported) */ miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ diff --git a/src/check_numerics.cpp b/src/check_numerics.cpp index b0882995e9..50cb0af4af 100644 --- a/src/check_numerics.cpp +++ b/src/check_numerics.cpp @@ -64,7 +64,7 @@ std::string GetKernelName(miopenDataType_t data_type) case miopenBFloat8: return {"check_numerics_bf8"}; case miopenInt32: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenDouble: default: return {""}; } diff --git a/src/convolution.cpp b/src/convolution.cpp index 403ff777cd..ac7c28fdc4 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -358,7 +358,7 @@ ConvolutionDescriptor::GetForwardOutputTensorWithLayout(const TensorDescriptor& std::vector out_strides; tensor_layout_to_strides( out_lens, default_layout, yLayout, xDesc.GetVectorLength(), out_strides); - return {(xDesc.GetType() == miopenInt8 || xDesc.GetType() == miopenInt8x4 + return {(xDesc.GetType() == miopenInt8 ? (yType) : xDesc.GetType()), // TODO: This function overrides the output type with // essentially the input which is incorrect. diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index 1750f625da..fad06870ce 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -63,10 +63,6 @@ /// "disabled expansion of recursive macro" injected by rocblas headers. 
#define AVOID_ROCBLAS_WRAPPERS_204 (MIOPEN_ROCBLAS_VERSION_FLAT >= 2004000) -/// Maintain API compatibility with various rocBLAS version -#define USE_GEMM_FLAGS_PACK_INT8X4 \ - ((MIOPEN_ROCBLAS_VERSION_FLAT >= 2038000) && (MIOPEN_ROCBLAS_VERSION_FLAT < 4000000)) - /// Maintain API compatibility for versions not supporting FP16 alternate implementations #define USE_GEMM_FLAGS_FP16_ALT_IMPL (MIOPEN_ROCBLAS_VERSION_FLAT >= 2043000) /// Some 2.42 versions have rocblas_gemm_flags_fp16_alt_impl, but @@ -110,7 +106,7 @@ static inline rocblas_datatype rocBlasComputeType(const miopen::GemmDescriptor& { // Complex compute types are only supported in newer version of the API assert(desc.dataType == desc.a_cast_type && desc.dataType == desc.b_cast_type); - if(desc.dataType == miopenInt8 || desc.dataType == miopenInt8x4) + if(desc.dataType == miopenInt8) return rocblas_datatype::rocblas_datatype_i32_r; else return rocblas_datatype::rocblas_datatype_f32_r; @@ -441,7 +437,6 @@ miopenStatus_t CallGemm(const Handle& handle, switch(gemm_desc.dataType) { - case miopenInt8x4: case miopenInt8: { assert(gemm_desc.k % 4 == 0); @@ -473,12 +468,7 @@ miopenStatus_t CallGemm(const Handle& handle, rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, -#if USE_GEMM_FLAGS_PACK_INT8X4 - rocblas_gemm_flags_pack_int8x4 -#else - 0 -#endif - ); + 0); } break; case miopenInt32: break; @@ -622,9 +612,9 @@ miopenStatus_t CallGemm(const Handle& handle, }; break; + case miopenInt8x4: case miopenDouble: { - MIOPEN_THROW(miopenStatusBadParm, - "miopenDouble data type not supported by MIOpenGEMM."); + MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type."); }; break; } @@ -695,7 +685,6 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, switch(gemm_desc.dataType) { - case miopenInt8x4: case miopenInt8: { assert(gemm_desc.k % 4 == 0); @@ -731,12 +720,7 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, -#if USE_GEMM_FLAGS_PACK_INT8X4 - rocblas_gemm_flags_pack_int8x4 -#else - 0 -#endif - ); + 0); } break; case miopenInt32: break; @@ -895,10 +879,10 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, break; } + case miopenInt8x4: case miopenDouble: { - MIOPEN_THROW(miopenStatusBadParm, - "miopenDouble data type not supported by MIOpenGEMM."); - } + MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type."); + }; break; } @@ -971,7 +955,6 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, switch(gemm_desc.dataType) { - case miopenInt8x4: case miopenInt8: { assert(gemm_desc.k % 4 == 0); @@ -1005,12 +988,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, -#if USE_GEMM_FLAGS_PACK_INT8X4 - rocblas_gemm_flags_pack_int8x4 -#else - 0 -#endif - ); + 0); } } break; @@ -1166,10 +1144,10 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, break; } + case miopenInt8x4: case miopenDouble: { - MIOPEN_THROW(miopenStatusBadParm, - "miopenDouble data type not supported by MIOpenGEMM."); - } + MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type."); + }; break; } @@ -1199,7 +1177,7 @@ GemmDescriptor CreateGemmDescriptorConvFwd(const TensorDescriptor& wDesc, { #ifndef NDEBUG assert(wDesc.GetType() == xDesc.GetType()); - 
if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4) + if(wDesc.GetType() != miopenInt8) assert(wDesc.GetType() == yDesc.GetType()); #endif @@ -1354,7 +1332,7 @@ GemmDescriptor CreateGemmDescriptorConvCNHWFwd(const TensorDescriptor& wDesc, { #ifndef NDEBUG assert(wDesc.GetType() == xDesc.GetType()); - if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4) + if(wDesc.GetType() != miopenInt8) assert(wDesc.GetType() == yDesc.GetType()); #endif @@ -1458,7 +1436,7 @@ GemmDescriptor CreateGemmStridedBatchedDescriptorConv1x1Fwd(const TensorDescript { #ifndef NDEBUG assert(wDesc.GetType() == xDesc.GetType()); - if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4) + if(wDesc.GetType() != miopenInt8) assert(wDesc.GetType() == yDesc.GetType()); #else (void)yDesc; diff --git a/src/hip/batched_transpose_sol.cpp b/src/hip/batched_transpose_sol.cpp index 4ffcf34b77..01349775ca 100644 --- a/src/hip/batched_transpose_sol.cpp +++ b/src/hip/batched_transpose_sol.cpp @@ -322,7 +322,11 @@ BatchedTransposeSolution::BatchedTransposeSolution(const ExecutionContext& ctx, uint32_t width_) : data_type(data_type_), batch(batch_), height(height_), width(width_) { - if(data_type == miopenInt8x4 || data_type == miopenDouble) + if(!(data_type == miopenHalf // + || data_type == miopenFloat // + || data_type == miopenInt32 // + || data_type == miopenInt8 // + || data_type == miopenBFloat16)) MIOPEN_THROW("These data types are not supported"); num_cu = ctx.GetStream().GetMaxComputeUnits(); std::size_t data_size = miopen::GetTypeSize(data_type); diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp index 485bdb3d67..29057e3de9 100644 --- a/src/include/miopen/datatype.hpp +++ b/src/include/miopen/datatype.hpp @@ -53,7 +53,10 @@ inline std::string GetDataType(miopenDataType_t type) type_str = "bfloat16"; } break; - case miopenInt8x4: + case miopenInt8x4: { + type_str = "UNSUPPORTED_TYPE"; + } + break; case miopenInt8: { type_str = "int8_t"; } break; @@ -137,7 +140,6 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type) int use_fp16x8 = 0; int use_fp32 = 0; int use_int8 = 0; - int use_int8x4 = 0; int use_int32 = 0; int use_bfp16 = 0; int use_fp64 = 0; @@ -150,15 +152,14 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type) case miopenHalf: use_fp16 = 1; break; case miopenFloat: use_fp32 = 1; break; case miopenInt8: use_int8 = 1; break; - case miopenInt8x4: use_int8x4 = 1; break; case miopenBFloat16: use_bfp16 = 1; break; case miopenInt32: use_int32 = 1; break; case miopenDouble: use_fp64 = 1; break; case miopenFloat8: use_fp8 = 1; break; case miopenBFloat8: use_bfp8 = 1; break; + case miopenInt8x4: // fallthrough default: - MIOPEN_THROW( - "Only float, half, bfloat16, int8, int8x4, float8, bfloat8 data type is supported."); + MIOPEN_THROW("Only float, half, bfloat16, int8, float8, bfloat8 data types are supported."); break; } @@ -168,7 +169,6 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type) {"MIOPEN_USE_FP16x8", use_fp16x8}, {"MIOPEN_USE_FP32", use_fp32}, {"MIOPEN_USE_INT8", use_int8}, - {"MIOPEN_USE_INT8x4", use_int8x4}, {"MIOPEN_USE_BFP16", use_bfp16}, {"MIOPEN_USE_INT32", use_int32}, {"MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16}, diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index b8d72df67c..a133ab4adc 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -101,13 +101,13 @@ inline std::size_t GetTypeSize(miopenDataType_t d) case miopenFloat: return 4;
case miopenHalf: case miopenBFloat16: return 2; - case miopenInt8x4: + case miopenInt8x4: break; case miopenInt8: case miopenFloat8: case miopenBFloat8: return 1; case miopenDouble: return 8; } - MIOPEN_THROW("Unknown data type"); + MIOPEN_THROW("Unknown or unsupported data type"); } template diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp index d26afba9dd..35e1ae6ae7 100644 --- a/src/include/miopen/visit_float.hpp +++ b/src/include/miopen/visit_float.hpp @@ -79,7 +79,6 @@ void visit_float(miopenDataType_t t, F f) } case miopenFloat8: case miopenBFloat8: - case miopenInt8x4: case miopenInt8: { f(as_float{}); break; @@ -92,6 +91,7 @@ void visit_float(miopenDataType_t t, F f) f(as_float{}); break; } + case miopenInt8x4: MIOPEN_THROW("miopenInt8x4: Support discontinued."); } } diff --git a/src/kernels/MIOpenIm2d2Col.cl b/src/kernels/MIOpenIm2d2Col.cl index 7b1522db6f..852ccff955 100644 --- a/src/kernels/MIOpenIm2d2Col.cl +++ b/src/kernels/MIOpenIm2d2Col.cl @@ -40,10 +40,6 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif @@ -58,8 +54,6 @@ #if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; -#elif MIOPEN_USE_INT8x4 -typedef uint data_t; #elif MIOPEN_USE_INT32 typedef int data_t; #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) diff --git a/src/kernels/MIOpenIm3d2Col.cl b/src/kernels/MIOpenIm3d2Col.cl index 7ccb8363b7..5ee437a068 100644 --- a/src/kernels/MIOpenIm3d2Col.cl +++ b/src/kernels/MIOpenIm3d2Col.cl @@ -40,18 +40,12 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif #if MIOPEN_USE_INT8 typedef char data_t; -#elif MIOPEN_USE_INT8x4 -typedef uint data_t; #elif MIOPEN_USE_INT32 typedef int data_t; #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) diff --git a/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl b/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl index 87fd82a7fd..dc8c96fc60 100644 --- a/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl +++ b/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl @@ -46,17 +46,13 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif #include "float_types.h" -#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1 +#if MIOPEN_USE_INT8 == 1 #define _FLOAT char #endif diff --git a/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl b/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl index d0d1762a10..1de4843712 100644 --- a/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl +++ b/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl @@ -46,11 +46,7 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - -#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1 +#if MIOPEN_USE_INT8 == 1 #define _FLOAT char #ifndef FLT_MAX #define MAX_VAL 127 /* max value */ diff --git a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl index bc17f2ecf4..bd06f89626 100644 --- a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl +++ b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl @@ -43,11 +43,7 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - -#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1 +#if MIOPEN_USE_INT8 == 1 #define _FLOAT char #ifndef FLT_MAX 
#define MAX_VAL 127 /* max value */ diff --git a/src/kernels/MIOpenUtilKernels4.cl b/src/kernels/MIOpenUtilKernels4.cl index 861e563012..09e6b73cff 100644 --- a/src/kernels/MIOpenUtilKernels4.cl +++ b/src/kernels/MIOpenUtilKernels4.cl @@ -40,10 +40,6 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif @@ -58,8 +54,6 @@ #if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; -#elif MIOPEN_USE_INT8x4 -typedef uint data_t; #elif MIOPEN_USE_INT32 typedef int data_t; #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 8c042e3e7b..94b083577d 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -296,8 +296,7 @@ void ValidateConvTensors(const ConvTensors& tensors) tensors.xDesc.GetSize() != tensors.wDesc.GetSize(); const auto trivial_tensor_types_not_matched = - tensors.xDesc.GetType() != tensors.yDesc.GetType() && - tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4; + tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8; // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { // MIOPEN_THROW(miopenStatusBadParm); @@ -426,11 +425,6 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, ValidateConvTensors(tensors); ValidateAlphaBeta(alpha, beta); - if(algo != miopenConvolutionFwdAlgoGEMM && xDesc.GetType() == miopenInt8x4) - { - MIOPEN_THROW(miopenStatusBadParm); - } - ConvForwardCheckNumerics(handle, tensors, [&]() { ValidateGroupCount(xDesc, wDesc, *this); diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp index 9c7bff6992..1e1f8b1df9 100644 --- a/src/ocl/tensorocl.cpp +++ b/src/ocl/tensorocl.cpp @@ -1597,10 +1597,13 @@ void ScaleTensor(const Handle& handle, assert(yDim_flat > 0 && yDim_flat <= 5); const miopenDataType_t dataType = yDesc_flat.GetType(); - if(dataType == miopenInt8 || dataType == miopenInt8x4 || dataType == miopenBFloat16) + + if(!(dataType == miopenHalf // + || dataType == miopenFloat // + || dataType == miopenInt32 // + || dataType == miopenDouble)) { - MIOPEN_THROW(miopenStatusBadParm, - "Tensor scale operation is not supported for int8, int8x4, and bfloat16."); + MIOPEN_THROW(miopenStatusBadParm, "ScaleTensor: unsupported data type."); } std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; @@ -1941,8 +1944,7 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio case miopenDouble: // TODO MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); - case miopenInt8x4: - MIOPEN_THROW(miopenStatusBadParm, "miopenInt8x4 data type not supported in cast tensor."); + case miopenInt8x4: // fallthrough default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc."); } } @@ -2237,24 +2239,6 @@ void TransformTensor(const Handle& handle, } } } - else if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8x4 && x_len.size() >= 3) - { - if(x_len[1] <= (y_len[1] - 4) || y_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid y channel size"); - } - - transpose_NCHW2Vec(handle, x_len, x, y, 4, false, true, alpha, beta); - } - else if(xDesc.GetType() == miopenInt8x4 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3) - { - if(y_len[1] <= (x_len[1] - 4) || x_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid x channel size"); - } - - transpose_NCHW2Vec(handle, y_len, x, y, 4, 
false, false, alpha, beta); - } else { auto x_y_len = boost::combine(x_len, y_len); @@ -2294,12 +2278,20 @@ void TransformTensor(const Handle& handle, const miopenDataType_t dataTypex = xDesc_flat.GetType(); const miopenDataType_t dataTypey = yDesc_flat.GetType(); - if(dataTypex == miopenInt8 || dataTypex == miopenInt8x4) + if(!(dataTypex == miopenHalf // + || dataTypex == miopenFloat // + || dataTypex == miopenInt32 // + || dataTypex == miopenBFloat16 // + || dataTypex == miopenDouble)) { MIOPEN_THROW("Tensor x is an unsupported data type"); } - if(dataTypey == miopenInt8 || dataTypey == miopenInt8x4) + if(!(dataTypey == miopenHalf // + || dataTypey == miopenFloat // + || dataTypey == miopenInt32 // + || dataTypey == miopenBFloat16 // + || dataTypey == miopenDouble)) { MIOPEN_THROW("Tensor y is an unsupported data type"); } diff --git a/src/ocl/utilocl.cpp b/src/ocl/utilocl.cpp index 0a88efb2f1..d536e819e6 100644 --- a/src/ocl/utilocl.cpp +++ b/src/ocl/utilocl.cpp @@ -80,8 +80,8 @@ float Im2d2ColGPU(const Handle& handle, int data_size_bound = c * in_h * in_w; - int data_size_bound_pack = type == miopenInt8x4 ? data_size_bound * 4 : data_size_bound; - int im_offset_pack = type == miopenInt8x4 ? im_offset / 4 : im_offset; + int data_size_bound_pack = data_size_bound; + int im_offset_pack = im_offset; if(!kernels.empty()) { @@ -105,7 +105,7 @@ float Im2d2ColGPU(const Handle& handle, } else { - const int c_pack = type == miopenInt8x4 ? c / 4 : c; + const int c_pack = c; std::string params; int num_ch_per_wg; @@ -331,9 +331,8 @@ float Im3d2ColGPU(const Handle& handle, auto&& kernels = handle.GetKernels("miopenIm3d2Col", network_config); - // int8x4 vectorize-c format - int im_offset_pack = type == miopenInt8x4 ? im_offset / 4 : im_offset; - int im_c_pack = type == miopenInt8x4 ? im_c / 4 : im_c; + int im_offset_pack = im_offset; + int im_c_pack = im_c; if(!kernels.empty()) { @@ -772,13 +771,6 @@ float transpose_NCHW2CNHW(const Handle& handle, std::string params = GetDataTypeKernelParams(type); - if(type == miopenInt8x4) - { - c /= 4; - in_offset /= 4; - out_offset /= 4; - } - if(h_stride == 1 && w_stride == 1 && type == miopenFloat) { kernel_name += "_V1"; @@ -910,13 +902,6 @@ float transpose_CNHW2NCHW(const Handle& handle, std::string params = GetDataTypeKernelParams(type); - if(type == miopenInt8x4) - { - c /= 4; - in_offset /= 4; - out_offset /= 4; - } - if(h_stride == 1 && w_stride == 1 && type == miopenFloat) { kernel_name += "_V1"; @@ -1170,14 +1155,8 @@ float transpose_packed_MN2NM(const Handle& handle, auto&& kernels = handle.GetKernels(kernel_name, network_config); std::string params = GetDataTypeKernelParams(type); - if(type == miopenInt8x4) - { - m /= 4; - in_offset /= 4; - out_offset /= 4; - } - if(!(type == miopenInt8x4 || type == miopenInt8)) + if(type != miopenInt8) { MIOPEN_THROW("transpose_packed_MN2NM is only meant for int8 variants."); } diff --git a/src/pooling_api.cpp b/src/pooling_api.cpp index ef526804cf..bf318f7b78 100644 --- a/src/pooling_api.cpp +++ b/src/pooling_api.cpp @@ -50,7 +50,7 @@ inline void Pooling_logging_cmd(const miopenPoolingDescriptor_t poolDesc, case miopenFloat: ss << "pool"; break; case miopenInt32: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued.
case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp index 6cf29c2f64..9cabbc03f6 100644 --- a/src/reducetensor.cpp +++ b/src/reducetensor.cpp @@ -211,11 +211,10 @@ inline int GetDataTypeSize(miopenDataType_t t) case miopenFloat8: case miopenBFloat8: case miopenInt8: return (1); - case miopenInt8x4: return (4); + case miopenInt8x4: return (4); // Support discontinued. case miopenBFloat16: return (2); case miopenInt32: return (4); - default: - MIOPEN_THROW("Only float, half, double, bfloat16, int8, int8x4 data type is supported."); + default: MIOPEN_THROW("Only float, half, double, bfloat16, int8 data types are supported."); }; }; @@ -269,7 +268,7 @@ inline int GetDataTypeId(miopenDataType_t t) case miopenBFloat16: return (static_cast('B')); case miopenDouble: return (static_cast('D')); case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: case miopenInt32: return (static_cast('O')); diff --git a/src/reducetensor_api.cpp b/src/reducetensor_api.cpp index 685b2fb430..10b1ff73bf 100644 --- a/src/reducetensor_api.cpp +++ b/src/reducetensor_api.cpp @@ -43,7 +43,7 @@ static void LogCmdRedux(const miopen::ReduceTensorDescriptor reduceTensorDesc, ss << "reducefp16"; else if(aDesc.GetType() == miopenBFloat16) ss << "reducebfp16"; - else if(aDesc.GetType() == miopenInt8 || aDesc.GetType() == miopenInt8x4) + else if(aDesc.GetType() == miopenInt8) ss << "reduceint8"; else if(aDesc.GetType() == miopenDouble) ss << "reducefp64"; diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 5a7918cc64..ff17432a62 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -200,7 +200,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context, return (CheckCKApplicability(bn_problem) != -1); case miopenInt32: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); @@ -244,7 +244,7 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context, break; case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp index 9520a79a90..e2df6f8097 100644 --- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp +++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp @@ -287,7 +287,7 @@ void PerformanceConfigConvCKIgemmFwdBiasActivFused::HeuristicInit( case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); @@ -342,7 +342,7 @@ bool PerformanceConfigConvCKIgemmFwdBiasActivFused::IsValid( case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); @@ -435,7 +435,7 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx, case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); @@ -469,7 +469,7 @@ ConvSolution ConvCKIgemmFwdBiasActivFused::GetSolution( case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 86a8a4161e..992b196b45 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -49,7 +49,7 @@ bool ConvDirectNaiveConvIsAssemblyKernel(const ExecutionContext& ctx, { const auto device_name = ctx.GetStream().GetDeviceName(); return (device_name == "gfx906" || device_name == "gfx908") && ctx.rmv.IsV3() && - problem.IsLayoutDefault() && (!problem.IsInt8()); + problem.IsLayoutDefault() && (problem.IsFp16() || problem.IsFp32() || problem.IsBfp16()); } // Check tensor data type respectively diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp index 2602c54320..94983c7f0e 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp @@ -210,7 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -253,7 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -324,7 +324,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -352,7 +352,7 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index f0623c642d..a21c9ba300 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -210,7 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: case miopenBFloat16: @@ -253,7 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenFloat8: case miopenBFloat8: case miopenBFloat16: @@ -322,7 +322,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: case miopenBFloat16: @@ -350,7 +350,7 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 3c94374b4e..6225410599 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -206,7 +206,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -249,7 +249,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -320,7 +320,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -348,7 +348,7 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::WrWInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp index 57c63267d2..5aa17e75bf 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp @@ -172,7 +172,7 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit( case miopenBFloat8: case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -214,7 +214,7 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid( case miopenBFloat8: case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -294,7 +294,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable( case miopenBFloat8: case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -318,7 +318,7 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution( problem, config.kernel_id); case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp index 73907eb788..7a9c322bc7 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp @@ -172,7 +172,7 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit( case miopenFloat8: case miopenBFloat8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -215,7 +215,7 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid( case miopenFloat8: case miopenBFloat8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -294,7 +294,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable( case miopenFloat8: case miopenBFloat8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -320,7 +320,7 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp index c2d0b83141..0a8b6eeb03 100644 --- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp @@ -198,7 +198,7 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -240,7 +240,7 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -311,7 +311,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -339,7 +339,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp index 7ad238e7fa..6ae4f5dde7 100644 --- a/src/solver/gemm.cpp +++ b/src/solver/gemm.cpp @@ -87,13 +87,15 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx, const auto& xDesc = problem.GetIn(); const auto& wDesc = problem.GetWeights(); const auto& yDesc = problem.GetOut(); - if(xDesc.GetType() == miopenInt8x4 || xDesc.GetType() == miopenInt8) - { - // rocBlas needs the output to be int32 always - if(yDesc.GetType() != miopenFloat && yDesc.GetType() != miopenInt32 && - yDesc.GetType() != miopenInt8x4) - return false; - } + + // rocBlas needs the output to be 32-bit always + if(xDesc.GetType() == miopenInt8x4) + return false; + if(xDesc.GetType() == miopenInt8 // + && (yDesc.GetType() != miopenFloat // + && yDesc.GetType() != miopenInt32)) + return false; + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); if(problem.IsTensorsCasted()) { @@ -185,8 +187,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio n_transpose_packed_MN2NM = 1; n_gemm_strided_batched = conv.group_count; n_transpose_CNHW2NCHW = 1; - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) n_CastTensor = 1; } // 1x1_stride=1 with GEMM and zero workspace @@ -205,8 +206,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio n_gemm_strided_batched = conv.group_count; n_gemm_runs = in_n; } - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) n_CastTensor = 1; } else // not 1x1 @@ -216,8 +216,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio n_transpose_packed_MN2NM = in_n; n_gemm_strided_batched = conv.group_count; n_gemm_runs = in_n; - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) n_CastTensor = 1; } @@ -410,7 +409,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, x_t_size *= 2; } - if(wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) + if(wDesc.GetType() == miopenInt8) { const auto xts = GetTypeSize(xDesc.GetType()); if(xts > 0) @@ -494,8 +493,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, if(handle.IsProfilingEnabled()) time_gemm += handle.GetKernelTime(); - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) { TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); @@ -782,7 +780,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, auto solution = ConvSolution{miopenStatusSuccess}; const auto group_count = conv.group_count; - const auto lowp_quant = conv.lowp_quant; if(group_count > 1) { @@ -881,14 +878,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, } } - if(wDesc.GetType() == miopenInt8x4 && yDesc.GetType() != miopenInt32) - { - TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); - CastTensor(handle, &lowp_quant, ygemmDesc, y, yDesc, y, 0, 0); - if(handle.IsProfilingEnabled()) 
- time_gemm += handle.GetKernelTime(); - } - if(handle.IsProfilingEnabled()) { handle.ResetKernelTime(); @@ -967,14 +956,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, if(handle.IsProfilingEnabled()) time += handle.GetKernelTime(); - if(wDesc.GetType() == miopenInt8x4 && yDesc.GetType() != miopenInt32) - { - TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); - CastTensor(handle, &lowp_quant, ygemmDesc, y, yDesc, y, 0, 0); - if(handle.IsProfilingEnabled()) - time += handle.GetKernelTime(); - } - if(handle.IsProfilingEnabled()) { handle.ResetKernelTime(); @@ -1268,8 +1249,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, } } - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) { TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp index 4101db86f9..4e41b8944a 100644 --- a/src/solver/mlir_common.cpp +++ b/src/solver/mlir_common.cpp @@ -57,7 +57,7 @@ static const char* DTypeName(miopenDataType_t ty) case miopenBFloat16: return "bf16"; case miopenInt32: return "i32"; case miopenInt8: return "i8"; - case miopenInt8x4: return "i8x4"; + case miopenInt8x4: return "i8x4"; // Support discontinued. case miopenFloat8: return "fp8"; case miopenBFloat8: return "bfp8"; } diff --git a/src/tensor.cpp b/src/tensor.cpp index ca4f1afc7a..df0d7c2819 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -50,7 +50,7 @@ bool IsDataTypeSupported(miopenDataType_t t) case miopenFloat8: case miopenBFloat8: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: return true; } diff --git a/test/conv_common.hpp b/test/conv_common.hpp index 99f964a0f6..e3fa9766e6 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -217,12 +217,7 @@ tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, ? input.desc.GetLayout(miopen::tensor_layout_get_default(input.desc.GetSize())) : out_layout; return tensor{filter.GetForwardOutputTensorWithLayout( - input.desc, - weights.desc, - yLayout, - weights.desc.GetType() == miopenInt8x4 - ? (std::is_same{} ? miopenInt32 : miopenFloat) - : miopen_type{})}; + input.desc, weights.desc, yLayout, miopen_type{})}; } enum class ConvApi @@ -572,8 +567,7 @@ struct verify_forward_conv : conv_base auto rout = ref_conv_fwd(input, weights, out, filter); if(filter.mode != miopenTranspose) { - bool is_int8 = - weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4; + bool is_int8 = weights.desc.GetType() == miopenInt8; bool is_vect_c = weights.desc.GetVectorLength() > 1; rout.par_for_each([&](auto... is) { if(is_int8 && !is_vect_c) @@ -774,7 +768,7 @@ struct verify_forward_conv : conv_base break; case ConvApi::Find_1_0: case ConvApi::Find_2_0: - if(weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4) + if(weights.desc.GetType() == miopenInt8) { bool is_transform = (input.desc.GetLengths()[1] % 4 != 0 || is_vect); @@ -786,10 +780,8 @@ struct verify_forward_conv : conv_base in_len[1] = ((in_len[1] + 3) / 4) * 4; wei_len[1] = ((wei_len[1] + 3) / 4) * 4; - miopen::TensorDescriptor input_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, - in_len); - miopen::TensorDescriptor weight_vpad_desc(is_vect ? 
miopenInt8x4 : miopenInt8, - wei_len); + miopen::TensorDescriptor input_vpad_desc(miopenInt8, in_len); + miopen::TensorDescriptor weight_vpad_desc(miopenInt8, wei_len); auto input_vpad = tensor{in_len}; auto weights_vpad = tensor{wei_len}; @@ -1738,8 +1730,8 @@ struct verify_forward_conv_int8 : conv_base in_len[1] = ((in_len[1] + 3) / 4) * 4; wei_len[1] = ((wei_len[1] + 3) / 4) * 4; - miopen::TensorDescriptor input_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, in_len); - miopen::TensorDescriptor weight_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, wei_len); + miopen::TensorDescriptor input_vpad_desc(miopenInt8, in_len); + miopen::TensorDescriptor weight_vpad_desc(miopenInt8, wei_len); auto input_vpad = tensor{in_len}; auto weights_vpad = tensor{wei_len}; @@ -2046,7 +2038,7 @@ struct conv_driver : test_driver filter.spatialDim = get_spatial_dim(); else filter.spatialDim = filter_dims.size(); - bool is_int8 = (input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4); + bool is_int8 = (input.desc.GetType() == miopenInt8); filter.mode = cmode_lookup[miopen::ToUpper(conv_mode)]; filter.paddingMode = pmode_lookup[miopen::ToUpper(pad_mode)]; @@ -2360,8 +2352,7 @@ struct conv_driver : test_driver bool skip_backward_weights = is_int8; #if TEST_DIRECT_SUPPORTED_CONFIG_ONLY - if(input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4 || - input.desc.GetType() == miopenBFloat16) + if(input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenBFloat16) { show_command(); std::cout << "Direct path doesn't support Int8 or BFloat16 type." << std::endl; @@ -2405,7 +2396,8 @@ struct conv_driver : test_driver size_t total_mem; if(is_int8) { - // TODO: Tout here was float which should have been int32 + /// \todo Properly construct the `output` tensor descriptor + /// and get rid of this special "int8" stuff. auto output_int8 = get_output_tensor(filter, input, weights, out_layout); const auto problem = ConvProblemDescription{input.desc, @@ -2422,6 +2414,9 @@ struct conv_driver : test_driver } else { + /// \todo Take into account `skip_forward`, `skip_backward_data`, + /// `skip_backward_weights` and use this path to compute `total_mem` for int8 + /// variations. 
const auto fwd_problem = miopen::conv::ProblemDescription{ input.desc, weights.desc, diff --git a/test/driver.hpp b/test/driver.hpp index 0a8e2d3080..fd83dd1fc5 100644 --- a/test/driver.hpp +++ b/test/driver.hpp @@ -274,7 +274,7 @@ struct test_driver { case miopenHalf: ss << "--half "; break; case miopenBFloat16: ss << "--bfloat16 "; break; - case miopenInt8x4: + case miopenInt8x4: ss << "--UNSUPPORTED_TYPE "; break; case miopenInt8: ss << "--int8 "; break; case miopenInt32: ss << "--int32 "; break; case miopenFloat: ss << "--float "; break; @@ -303,7 +303,7 @@ struct test_driver { case miopenHalf: ret.emplace_back("--half"); break; case miopenBFloat16: ret.emplace_back("--bf16"); break; - case miopenInt8x4: + case miopenInt8x4: ret.emplace_back("--UNSUPPORTED_TYPE"); break; case miopenInt8: ret.emplace_back("--int8"); break; case miopenInt32: ret.emplace_back("--int32"); break; case miopenFloat: ret.emplace_back("--float"); break; diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index aa3dda788d..be8f3f8430 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -303,8 +303,6 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "int32"; if(type == miopenInt8) return "int8"; - if(type == miopenInt8x4) - return "int8x4"; if(type == miopenBFloat16) return "bf16"; return "n/a"; diff --git a/test/gtest/conv_embed_db.cpp b/test/gtest/conv_embed_db.cpp index b69fde1b5e..4672bb0404 100644 --- a/test/gtest/conv_embed_db.cpp +++ b/test/gtest/conv_embed_db.cpp @@ -73,12 +73,12 @@ void Run2dDriver(miopenDataType_t prec) case miopenHalf: params = ConfigWithHalf::GetParam(); break; case miopenInt8: params = ConfigWithInt8::GetParam(); break; case miopenBFloat16: params = ConfigWithBFloat16::GetParam(); break; - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenFloat8: case miopenBFloat8: case miopenDouble: - FAIL() << "miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type " + FAIL() << "miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type " "not supported by conv_embed_db test"; default: params = ConfigWithFloat::GetParam(); diff --git a/test/gtest/conv_hip_igemm_xdlops.cpp b/test/gtest/conv_hip_igemm_xdlops.cpp index 508624c847..3407446557 100644 --- a/test/gtest/conv_hip_igemm_xdlops.cpp +++ b/test/gtest/conv_hip_igemm_xdlops.cpp @@ -65,10 +65,10 @@ void Run2dDriver(miopenDataType_t prec) case miopenHalf: case miopenBFloat16: case miopenFloat: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenDouble: - FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, " + FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt32, " "miopenDouble data " "type not supported by " "test_conv_hip_igemm_xdlops test"; diff --git a/test/gtest/conv_igemm_dynamic.cpp b/test/gtest/conv_igemm_dynamic.cpp index 25a4e179c5..59341ac8a0 100644 --- a/test/gtest/conv_igemm_dynamic.cpp +++ b/test/gtest/conv_igemm_dynamic.cpp @@ -68,12 +68,12 @@ void Run2dDriver(miopenDataType_t prec) case miopenHalf: case miopenInt8: case miopenBFloat16: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued.
case miopenInt32: case miopenDouble: case miopenFloat8: case miopenBFloat8: - FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, " + FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, " "miopenDouble, miopenFloat8, miopenBFloat8 " "data type not supported by conv_igemm_dynamic test"; diff --git a/test/gtest/conv_igemm_mlir.cpp b/test/gtest/conv_igemm_mlir.cpp index d5fc0d426c..29d240645c 100644 --- a/test/gtest/conv_igemm_mlir.cpp +++ b/test/gtest/conv_igemm_mlir.cpp @@ -81,13 +81,13 @@ void Run2dDriver(miopenDataType_t prec) case miopenInt8: params = ConfigWithInt8::GetParam(); break; case miopenFloat: params = ConfigWithFloat::GetParam(); break; case miopenBFloat16: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenFloat8: case miopenBFloat8: case miopenDouble: MIOPEN_THROW(miopenStatusBadParm, - "miopenBFloat16, miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, " + "miopenBFloat16, miopenInt32, miopenFloat8, miopenBFloat8, " "miopenDouble data type not supported by conv_igemm_mlir test"); default: params = ConfigWithFloat::GetParam(); diff --git a/test/gtest/conv_igemm_mlir_xdlops.cpp b/test/gtest/conv_igemm_mlir_xdlops.cpp index 19913093c0..e2c5a3ed8f 100644 --- a/test/gtest/conv_igemm_mlir_xdlops.cpp +++ b/test/gtest/conv_igemm_mlir_xdlops.cpp @@ -53,13 +53,13 @@ void Run2dDriver(miopenDataType_t prec) case miopenInt8: params = ConfigWithInt8::GetParam(); break; case miopenBFloat16: case miopenFloat: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenDouble: case miopenFloat8: case miopenBFloat8: MIOPEN_THROW(miopenStatusBadParm, - "miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, miopenDouble data " + "miopenBFloat16, miopenFloat, miopenInt32, miopenDouble data " "type not supported by " "conv_igemm_mlir_xdlops test"); diff --git a/test/gtest/conv_trans.cpp b/test/gtest/conv_trans.cpp index 20015336a0..185a459947 100644 --- a/test/gtest/conv_trans.cpp +++ b/test/gtest/conv_trans.cpp @@ -55,10 +55,10 @@ void Run2dDriver(miopenDataType_t prec) case miopenBFloat8: case miopenInt8: case miopenBFloat16: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenDouble: - FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, miopenDouble " + FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, miopenDouble " "data type not supported by " "conv_trans test"; diff --git a/test/gtest/db_sync.cpp b/test/gtest/db_sync.cpp index d7e59468a2..13b647ceac 100644 --- a/test/gtest/db_sync.cpp +++ b/test/gtest/db_sync.cpp @@ -73,7 +73,7 @@ miopenDataType_t GetDataTypeFromString(const std::string& data_type) else if(data_type == "INT8") return miopenInt8; else if(data_type == "INT8x4") - return miopenInt8x4; + return miopenInt8x4; // Support discontinued. Maintain compatibility with old databases. 
else if(data_type == "INT32") return miopenInt32; else if(data_type == "BF16") diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp index 0b05a1e5e8..3fda3b5cf7 100644 --- a/test/tensor_holder.hpp +++ b/test/tensor_holder.hpp @@ -195,9 +195,10 @@ struct tensor tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) { - assert(desc.GetType() == miopen_type{} || - ((miopen_type{} == miopenInt8 || miopen_type{} == miopenInt8x4) && - (desc.GetType() == miopenFloat || desc.GetType() == miopenInt32))); + assert(desc.GetType() == miopen_type{} // + || (miopen_type{} == miopenInt8 // + && (desc.GetType() == miopenFloat // + || desc.GetType() == miopenInt32))); data.resize(desc.GetElementSpace()); } diff --git a/test/tensor_transform.cpp b/test/tensor_transform.cpp index a0585853a0..e87420bb7a 100644 --- a/test/tensor_transform.cpp +++ b/test/tensor_transform.cpp @@ -422,7 +422,7 @@ struct tensor_transform_driver : test_driver } // Test tensor scale addition - if(miopen_type{} == miopenInt8 || miopen_type{} == miopenInt8x4) + if(miopen_type{} == miopenInt8) return; super_src = tensor{superLens_src}.generate(tensor_elem_gen_integer{max_value}); From defb1b04fc74945227188c5de5d243215ade6a22 Mon Sep 17 00:00:00 2001 From: amberhassaan Date: Thu, 12 Oct 2023 00:05:34 -0400 Subject: [PATCH 33/36] Test non-packed inputs with naive reference convolution kernels (#2394) --- driver/random.hpp | 1 + fin | 2 +- src/include/miopen/convolution.hpp | 4 + .../gpu_reference_kernel/naive_conv.cpp | 6 +- src/ocl/convolutionocl.cpp | 123 ++++-- src/solver/conv_direct_naive_conv.cpp | 4 + src/solver/conv_direct_naive_conv_bwd.cpp | 8 +- src/solver/conv_direct_naive_conv_fwd.cpp | 8 +- src/solver/conv_direct_naive_conv_wrw.cpp | 8 +- test/gpu_reference_kernel.cpp | 381 +++++------------- test/gtest/conv_api_strided_tensors.cpp | 8 +- 11 files changed, 221 insertions(+), 332 deletions(-) diff --git a/driver/random.hpp b/driver/random.hpp index 6398048dde..b3be81f56e 100644 --- a/driver/random.hpp +++ b/driver/random.hpp @@ -91,6 +91,7 @@ inline T gen_0_to_B(T B) template inline T gen_A_to_B(T A, T B) { + assert(B > A); return gen_0_to_B(B - A) + A; } diff --git a/fin b/fin index 26b5c32864..afc1a8d87e 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 26b5c328642a6af5041539ceae36b9340829384b +Subproject commit afc1a8d87e6d00c82903942007bb370ee1f6c760 diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp index bac0133106..35c494eab2 100644 --- a/src/include/miopen/convolution.hpp +++ b/src/include/miopen/convolution.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -404,6 +405,9 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor friend void to_json(nlohmann::json& json, const ConvolutionDescriptor& conv); friend void from_json(const nlohmann::json& json, ConvolutionDescriptor& conv); + +private: + void ValidateTensors(const ConvTensors& conv_tensors) const; }; void ConvolutionBackwardBias(const Handle& handle, diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index b243b1234a..125eff94f3 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -126,9 +126,9 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, /// and nhwc should be nhwgc. Same follows for the 3D case. 
/// -/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is -/// at index 3 while stride for C is at index 0. This is reverse of how strides are -/// stored in tensor descriptors, which are big-endian. +/// - strides here are stored right to left, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is different from how the +/// tensor descriptors store strides, which is always NCHW order, left-to-right. template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 94b083577d..d66186577c 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -287,30 +287,6 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle, namespace { -void ValidateConvTensors(const ConvTensors& tensors) -{ - const auto invalid_buffers = - tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr; - - const auto tensor_sizes_not_matched = tensors.xDesc.GetSize() != tensors.yDesc.GetSize() || - tensors.xDesc.GetSize() != tensors.wDesc.GetSize(); - - const auto trivial_tensor_types_not_matched = - tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8; - - // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { - // MIOPEN_THROW(miopenStatusBadParm); - //} - - const auto x_tensor_invalid = tensors.xDesc.GetSize() < 3; - - const auto bad_parameters = invalid_buffers || tensor_sizes_not_matched || - trivial_tensor_types_not_matched || x_tensor_invalid; - - if(bad_parameters) - MIOPEN_THROW(miopenStatusBadParm); -} - void ValidateAlphaBeta(const void* alpha, const void* beta) { if(!float_equal(*(static_cast(alpha)), 1.0) || @@ -401,6 +377,88 @@ static void ConvForwardCheckNumerics(const Handle& handle, } } +void ConvolutionDescriptor::ValidateTensors(const ConvTensors& tensors) const +{ + + // Group stride in current TensorDescriptor is implicit. When invoking kernels, + // we need to add the group dimension G and compute its stride. We want the stride + // left of C to be a multiple of group count G. e.g. for NCHW, the stride for N + // should be a multiple of G so that we can compute the strides for NGCHW + auto bad_group_stride = [this](const TensorDescriptor& td) { + auto l = td.GetLayout_t(); + int g_stride_index = -1; + if(l == miopenTensorNCHW || l == miopenTensorNCDHW) + { + g_stride_index = 0; // stride index for N; + } + else if(l == miopenTensorNHWC || l == miopenTensorNDHWC) + { + // stride index for W. Normally this would be 2nd-last stride but we store + // strides in NCHW order for some weird reason. 
+ g_stride_index = td.GetStrides().size() - 1; + } + else + { + MIOPEN_THROW(miopenStatusInternalError, "Layout not supported for grouped convolution"); + } + + if(g_stride_index != -1) + { + return (td.GetStrides()[g_stride_index] % this->group_count) != 0; + } + + return false; + }; + + // invalid_buffers + if(tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "One of the convolution tensors is null"); + } + + // x_tensor_invalid = + if(tensors.xDesc.GetSize() < 3) + { + MIOPEN_THROW(miopenStatusBadParm, "input tensor's number of dimensions is wrong"); + } + + // tensor_sizes_not_matched = + if(tensors.xDesc.GetSize() != tensors.yDesc.GetSize() || + tensors.xDesc.GetSize() != tensors.wDesc.GetSize()) + { + MIOPEN_THROW(miopenStatusBadParm, + "number of dimensions mismatch between input, output and weights tensors"); + } + + // trivial_tensor_types_not_matched = + if(tensors.xDesc.GetType() != tensors.yDesc.GetType() && + tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4) + { + MIOPEN_THROW(miopenStatusBadParm, "input/output tensor data types do not match"); + } + + // check for bad_group_stride. This applies for input and output only. There + // is no check for weight tensor currently. + // no need to check for group_count == 1 + + if((this->group_count > 1) && bad_group_stride(tensors.xDesc)) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Invalid input tensor strides. Channel stride must be a multiple of group count"); + } + if((this->group_count > 1) && bad_group_stride(tensors.yDesc)) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Invalid output tensor strides. Channel stride must be a multiple of group count"); + } + + // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { + // MIOPEN_THROW(miopenStatusBadParm); + //} +} + void ConvolutionDescriptor::ConvolutionForward(Handle& handle, const void* alpha, const TensorDescriptor& xDesc, @@ -416,13 +474,8 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); - if(!(xDesc.IsPacked() && wDesc.IsPacked() && yDesc.IsPacked())) - { - MIOPEN_THROW(miopenStatusNotImplemented, "Only fully packed tensors are supported"); - } - const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); ValidateAlphaBeta(alpha, beta); ConvForwardCheckNumerics(handle, tensors, [&]() { @@ -735,7 +788,7 @@ void ConvolutionDescriptor::ConvolutionForwardImmediate(Handle& handle, MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); if(!solver_id.IsValid()) MIOPEN_THROW(miopenStatusBadParm); @@ -871,7 +924,7 @@ void ConvolutionDescriptor::ConvolutionBackwardData(Handle& handle, auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); ValidateAlphaBeta(alpha, beta); ConvBwdCheckNumerics(handle, tensors, beta, [&]() { @@ -937,7 +990,7 @@ void ConvolutionDescriptor::ConvolutionBackwardImmediate(Handle& handle, MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); static const float beta = 0.0f; ConvBwdCheckNumerics(handle, tensors, &beta, [&]() { @@ 
-1071,7 +1124,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(const Handle& handle, { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); decltype(auto) tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); ValidateAlphaBeta(alpha, beta); if(xDesc.GetType() == miopenInt8) @@ -1134,7 +1187,7 @@ void ConvolutionDescriptor::ConvolutionWrwImmediate(Handle& handle, { MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); auto tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); if(xDesc.GetType() == miopenInt8) MIOPEN_THROW(miopenStatusBadParm); diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 992b196b45..f87511f911 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -111,11 +111,15 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; + + /// \todo remove packed reference convolution kernels --amberhassaan +#ifndef NDEBUG // enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } else +#endif { kernel_name << "naive_conv_nonpacked_"; } diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index 1a28f8aae6..dea91c9ecf 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -134,12 +134,8 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index a4656d929a..5bc25a2367 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -122,12 +122,6 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, KernelInfo kernel; kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -139,6 +133,8 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index dfe1c342b0..a8c4d40e0b 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -121,13 +121,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); + const auto is_f8 = 
(kernel.kernel_file == "fp8_naive_conv.cpp"); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index be8f3f8430..e166781b9b 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -24,6 +24,8 @@ * *******************************************************************************/ +#include +#include #include #include #include @@ -73,17 +75,9 @@ std::string tensor_layout_to_string(tensor_layout_t layout) struct gpu_reference_kernel_base { miopenHandle_t handle{}; -#if MIOPEN_BACKEND_OPENCL - cl_command_queue q{}; -#endif - gpu_reference_kernel_base() - { - miopenCreate(&handle); -#if MIOPEN_BACKEND_OPENCL - miopenGetStream(handle, &q); -#endif - } + gpu_reference_kernel_base() { miopenCreate(&handle); } + ~gpu_reference_kernel_base() { miopenDestroy(handle); } static int conv_out_size(int in_size, int pad, int dilation, int ksize, int stride) @@ -308,6 +302,21 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "n/a"; } +/// input: a vector of lengths of dims in a tensor +/// multiply each element with a random constant integer +void pad_tensor_strides(std::vector& strides) +{ + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 5; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } +} + template in_len({n, c, hi, wi}); std::vector wei_len({k, c_per_group, fy, fx}); @@ -360,28 +364,25 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + void* in_dev; void* wei_dev; void* out_dev; EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 2, @@ -417,27 +418,9 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(in); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= 
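Note that pad_tensor_strides receives the stride vector (not the dimension lengths, as its doxygen line suggests) and scales every stride by a single random factor, so each test tensor occupies more memory than it has elements. A self-contained sketch of the arithmetic, with hand-picked numbers rather than the test's random multiplier:

```cpp
#include <cstddef>
#include <vector>

// Element space = 1 + sum((len[i] - 1) * stride[i]). Once the strides are
// padded, this exceeds the element count, which is exactly what makes the
// tensor non-packed (cf. desc.GetElementSpace() in tensor_holder.hpp).
std::size_t element_space(const std::vector<std::size_t>& lens,
                          const std::vector<std::size_t>& strides)
{
    std::size_t last = 0;
    for(std::size_t i = 0; i < lens.size(); ++i)
        last += (lens[i] - 1) * strides[i];
    return last + 1;
}

// lens {2, 3, 4}: packed strides {12, 4, 1} give 24 (== element count);
// a multiplier of 2 gives strides {24, 8, 2} and an element space of 47.
```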
clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(out); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -445,7 +428,19 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + /// \anchor copy_non_packed_output_before_convolution + /// \note Output is a non-packed tensor, which means there are + /// elements that convolution will not update. In order to verify + /// the convolution result, the GPU buffer should have the same + /// data as the CPU in both update and not-updated elements. + /// Therefore, we copy the output to the GPU buffer after + /// initializing it with random values. + /// + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); + cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -470,23 +465,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif + // we expect excact match, since use integer valid_result = verify_tensor(out_host, out); } @@ -495,36 +478,22 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(out); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(in); + /// \ref copy_non_packed_output_before_convolution + + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -549,23 +518,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(in_host, in); @@ -574,35 +531,22 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base { rand_tensor_integer(in); rand_tensor_integer(out); -#if MIOPEN_BACKEND_OPENCL - status |= 
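The \anchor note above is the key to exact verification with non-packed buffers: the gap elements are never written by the convolution, so host and device must start from identical data. A toy, host-only analogy of the scheme, where a stride-2 update stands in for the convolution:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    std::vector<int> host = {7, -3, 5, 9, -1, 2}; // randomly seeded output
    std::vector<int> gpu  = host;                 // hipMemcpy HostToDevice

    for(std::size_t i = 0; i < gpu.size(); i += 2)
        gpu[i] *= 2; // device "kernel" touches only the logical elements
    for(std::size_t i = 0; i < host.size(); i += 2)
        host[i] *= 2; // CPU reference touches the same elements

    assert(host == gpu); // gaps were seeded identically, so exact match holds
    return 0;
}
```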
clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(wei); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -627,23 +571,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(wei_host, wei); @@ -665,15 +597,10 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(inDesc); miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP + hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_2d(run_conv_2d); @@ -717,11 +644,6 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base int wo = conv_out_size(wi, px, dx, fx, sx); int do_ = conv_out_size(di, pz, dz, fz, sz); int c_per_group = c / g; - int k_per_group = k / g; - - int in_sz = g * n * c_per_group * di * hi * wi; - int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; - int out_sz = g * n * k_per_group * do_ * ho * wo; std::vector in_len({n, c, di, hi, wi}); std::vector wei_len({k, c_per_group, fz, fy, fx}); @@ -738,28 +660,26 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + void* in_dev; void* wei_dev; void* out_dev; + EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) 
== hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 3, @@ -795,35 +715,21 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(in); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(out); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, @@ -849,23 +755,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(out_host, out); @@ -875,36 +769,22 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(out); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(in); + + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -929,23 +809,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = 
verify_tensor(in_host, in); @@ -954,35 +822,22 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base { rand_tensor_integer(in, 3, -2); rand_tensor_integer(out, 3, -2); -#if MIOPEN_BACKEND_OPENCL - status |= clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(wei); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -1007,23 +862,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(wei_host, wei, 8.0); // max possible int @@ -1049,15 +892,9 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_3d(run_conv_3d); diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 2a59dcd696..04d56ec908 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,7 +139,9 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; -// This test should be replaced when strided tensors are fully implemented +/// \todo re-enable this test after NCDHW grouped convolution lands (PR 2429) +/// \todo add cpu reference convolution for verification --amberhassaan +#if 0 TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -178,9 +180,8 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) const float alpha = 1.f; const float beta = 0.f; - // miopenConvolutionForward() must return error if the format is not supported ASSERT_TRUE(device.Synchronize()); - ASSERT_NE(miopenConvolutionForward(handle, + ASSERT_EQ(miopenConvolutionForward(handle, &alpha, input_descr, d_input.Data(), @@ -196,3 +197,4 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) miopenStatusSuccess); ASSERT_TRUE(device.Synchronize()); } +#endif From 4855e6eb9fd9a63977bb3cad6e3f74a97a046ce6 Mon Sep 17 00:00:00 2001 From: Daming Feng Date: Thu, 12 Oct 2023 03:08:31 -0500 Subject: [PATCH 34/36] 3D forward convolution solver with non-packed input tensors (#2418) --- .../miopen/solver/ck_utility_common.hpp | 10 + 
src/solver/conv_direct_naive_conv_bwd.cpp | 3 + src/solver/conv_direct_naive_conv_fwd.cpp | 3 + src/solver/conv_direct_naive_conv_wrw.cpp | 3 + ...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 18 +- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 18 +- ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 20 +- test/gpu_reference_kernel.cpp | 172 ++++++++++++++++-- test/gtest/conv_api_strided_tensors.cpp | 4 - test/gtest/nonpack_conv3d_fwd.cpp | 98 ++++++++++ test/gtest/nonpack_conv3d_fwd.hpp | 157 ++++++++++++++++ 11 files changed, 468 insertions(+), 38 deletions(-) create mode 100644 test/gtest/nonpack_conv3d_fwd.cpp create mode 100644 test/gtest/nonpack_conv3d_fwd.hpp diff --git a/src/include/miopen/solver/ck_utility_common.hpp b/src/include/miopen/solver/ck_utility_common.hpp index f4f91fa228..003b067e50 100644 --- a/src/include/miopen/solver/ck_utility_common.hpp +++ b/src/include/miopen/solver/ck_utility_common.hpp @@ -63,6 +63,16 @@ static inline bool is_ck_supported_hardware(const Handle& handle) StartsWith(handle.GetDeviceName(), "gfx1102"); } +static inline bool is_conv_ck_supported_hardware(const std::string& device_name, bool is_wrw) +{ + auto res_wrw = StartsWith(device_name, "gfx908") || StartsWith(device_name, "gfx90a") || + StartsWith(device_name, "gfx940") || StartsWith(device_name, "gfx941") || + StartsWith(device_name, "gfx942"); + return is_wrw ? res_wrw + : (res_wrw || StartsWith(device_name, "gfx900") || + StartsWith(device_name, "gfx906")); +} + static inline bool is_support_amd_buffer_atomic_fadd(const std::string& device_name) { return StartsWith(device_name, "gfx908"); diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index dea91c9ecf..1e8f006ef0 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -162,6 +162,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 5bc25a2367..f1ed2f5b10 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -162,6 +162,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index a8c4d40e0b..b83b334faa 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -150,6 +150,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp index 94983c7f0e..58efe498ff 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp @@ -32,6 +32,7 @@ #include #include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include #include #endif #include @@ -86,10 +87,16 @@ struct CKArgs output = {G, N, K, Do, Ho, Wo}; weight = {G, K, C, Z, Y, X}; - // strides from NHWGC to GNCHW laout - in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G 
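With the stride vectors now passed down to handle.Run(kern), element addressing inside the naive kernels reduces to a dot product of indices and strides. A host-side sketch of the idea (the kernels' real signatures and stride ordering differ, as noted in naive_conv.cpp):

```cpp
#include <cstddef>

// Illustrative host-side analogue: one body serves packed and non-packed
// tensors alike, since only the stride values change.
template <typename T>
const T& at_nchw(const T* p,
                 std::size_t n, std::size_t c, std::size_t h, std::size_t w,
                 std::size_t sn, std::size_t sc, std::size_t sh, std::size_t sw)
{
    return p[n * sn + c * sc + h * sh + w * sw];
}

// Packed NCHW with C=3, H=W=8: at_nchw(p, n, c, h, w, 192, 64, 8, 1);
// the padded-stride tensors from the tests just supply different s* values.
```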
* C, G * C}; - out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; - wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + // miopen strides to CK strides + auto miopen_in_strides = problem.GetIn().GetStrides(); + auto miopen_out_strides = problem.GetOut().GetStrides(); + auto miopen_wei_strides = problem.GetWeights().GetStrides(); + miopen_in_strides.insert(miopen_in_strides.begin(), C); + miopen_out_strides.insert(miopen_out_strides.begin(), K); + miopen_wei_strides.insert(miopen_wei_strides.begin(), K * miopen_wei_strides[0]); + std::copy(miopen_in_strides.begin(), miopen_in_strides.end(), in_strides.begin()); + std::copy(miopen_out_strides.begin(), miopen_out_strides.end(), out_strides.begin()); + std::copy(miopen_wei_strides.begin(), miopen_wei_strides.end(), wei_strides.begin()); strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), @@ -315,8 +322,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; - const std::string& arch = ctx.GetStream().GetDeviceName(); - if(miopen::StartsWith(arch, "gfx11") || miopen::StartsWith(arch, "gfx10")) + if(!ck_utility::is_conv_ck_supported_hardware(ctx.GetStream().GetDeviceName(), false)) return false; switch(problem.GetInDataType()) { diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index a21c9ba300..e7a44456b9 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -32,6 +32,7 @@ #include #include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include #include #endif #include @@ -86,10 +87,16 @@ struct CKArgs output = {G, N, K, Do, Ho, Wo}; weight = {G, K, C, Z, Y, X}; - // strides from NHWGC to GNCHW laout - in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C}; - out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; - wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + // miopen strides to CK strides + auto miopen_in_strides = problem.GetIn().GetStrides(); + auto miopen_out_strides = problem.GetOut().GetStrides(); + auto miopen_wei_strides = problem.GetWeights().GetStrides(); + miopen_in_strides.insert(miopen_in_strides.begin(), C); + miopen_out_strides.insert(miopen_out_strides.begin(), K); + miopen_wei_strides.insert(miopen_wei_strides.begin(), K * miopen_wei_strides[0]); + std::copy(miopen_in_strides.begin(), miopen_in_strides.end(), in_strides.begin()); + std::copy(miopen_out_strides.begin(), miopen_out_strides.end(), out_strides.begin()); + std::copy(miopen_wei_strides.begin(), miopen_wei_strides.end(), wei_strides.begin()); strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), @@ -313,8 +320,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; - const std::string& arch = ctx.GetStream().GetDeviceName(); - if(!(arch == "gfx908" || arch == "gfx90a")) + if(!ck_utility::is_conv_ck_supported_hardware(ctx.GetStream().GetDeviceName(), false)) return false; switch(problem.GetInDataType()) { diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 6225410599..d0236e4f42 100644 --- 
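The conversion above is mechanical: MIOpen descriptors keep strides in NCDHW order, and CK expects a G-led array, so the only new information is the group stride. For NDHWGC memory the G dimension sits immediately left of the per-group channels, hence its stride is simply C. A sketch with assumed packed values:

```cpp
#include <cstddef>
#include <vector>

// Illustrative only: build CK's {G, N, C, D, H, W} stride array from MIOpen's
// NCDHW-ordered strides {sN, sC, sD, sH, sW} of an NDHWC tensor with C
// channels per group. (For weights, the patch inserts K * sK instead, since
// the groups are laid out as G consecutive blocks of K filters.)
std::vector<std::size_t> to_ck_input_strides(std::vector<std::size_t> s, std::size_t C)
{
    s.insert(s.begin(), C); // -> {sG, sN, sC, sD, sH, sW}
    return s;
}

// Packed NDHWC with N=2, G=2, C=8 (per group), D=H=W=4:
// descriptor strides {1024, 1, 256, 64, 16} -> CK strides {8, 1024, 1, 256, 64, 16}.
```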
a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -32,6 +32,7 @@ #include #include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include #include #endif #include @@ -84,10 +85,16 @@ struct CKArgs output = {G, N, K, Do, Ho, Wo}; weight = {G, K, C, Z, Y, X}; - // strides from NHWGC to GNCHW laout - in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C}; - out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; - wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + // miopen strides to CK strides + auto miopen_in_strides = problem.GetIn().GetStrides(); + auto miopen_out_strides = problem.GetOut().GetStrides(); + auto miopen_wei_strides = problem.GetWeights().GetStrides(); + miopen_in_strides.insert(miopen_in_strides.begin(), C); + miopen_out_strides.insert(miopen_out_strides.begin(), K); + miopen_wei_strides.insert(miopen_wei_strides.begin(), K * miopen_wei_strides[0]); + std::copy(miopen_in_strides.begin(), miopen_in_strides.end(), in_strides.begin()); + std::copy(miopen_out_strides.begin(), miopen_out_strides.end(), out_strides.begin()); + std::copy(miopen_wei_strides.begin(), miopen_wei_strides.end(), wei_strides.begin()); strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), @@ -309,10 +316,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; - const std::string& arch = ctx.GetStream().GetDeviceName(); - if(miopen::StartsWith(arch, "gfx11") || miopen::StartsWith(arch, "gfx10")) - return false; - if(arch == "gfx906" || arch == "gfx900") + if(!ck_utility::is_conv_ck_supported_hardware(ctx.GetStream().GetDeviceName(), true)) return false; switch(problem.GetInDataType()) { diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index e166781b9b..a6a0b1e2bc 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -376,13 +376,25 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); +#if MIOPEN_BACKEND_OPENCL + cl_context ctx; + clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); + cl_int status = CL_SUCCESS; + cl_mem in_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); + cl_mem wei_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); + cl_mem out_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); - +#endif EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 2, @@ -420,7 +432,27 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(out); - +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + 
wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -440,7 +472,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -480,8 +512,28 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP /// \ref copy_non_packed_output_before_convolution - EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -493,7 +545,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -533,7 +585,27 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(out); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); - +#if MIOPEN_BACKEND_OPENCL + status |= clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -546,7 +618,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -672,6 +744,18 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); +#if MIOPEN_BACKEND_OPENCL + cl_context ctx; + clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); + cl_int status = CL_SUCCESS; + cl_mem in_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); + cl_mem wei_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); + cl_mem out_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; @@ -679,7 +763,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); - +#endif EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 3, @@ -717,7 +801,27 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref 
copy_non_packed_output_before_convolution rand_tensor_integer(out); - +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -730,7 +834,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -771,7 +875,27 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); - +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == @@ -784,7 +908,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -824,7 +948,27 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(out, 3, -2); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); - +#if MIOPEN_BACKEND_OPENCL + status |= clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -837,7 +981,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 04d56ec908..9a2876b3f0 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,9 +139,6 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; -/// \todo re-enable this test after NCDHW grouped convolution lands (PR 2429) -/// \todo add cpu reference convolution for verification --amberhassaan -#if 0 TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -197,4 +194,3 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) miopenStatusSuccess); ASSERT_TRUE(device.Synchronize()); } -#endif diff --git a/test/gtest/nonpack_conv3d_fwd.cpp b/test/gtest/nonpack_conv3d_fwd.cpp new file mode 100644 index 0000000000..35cc492c74 --- /dev/null +++ b/test/gtest/nonpack_conv3d_fwd.cpp @@ -0,0 +1,98 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include + +#include "tensor_util.hpp" +#include "get_handle.hpp" +#include "nonpack_conv3d_fwd.hpp" + +struct ConvFwdSolverTest3D : ConvFwdSolverTest +{ +}; + +template +void SolverFwd(const miopen::TensorDescriptor& inputDesc, + ConstData_t input, + const miopen::TensorDescriptor& wDesc, + ConstData_t weight, + const miopen::TensorDescriptor& outputDesc, + Data_t output, + const miopen::ConvolutionDescriptor& convDesc, + const NonPackTestCase& conv_config, + bool& test_skipped) +{ + auto&& handle = get_handle(); + + Solver solv{}; + + const auto tensors = + miopen::ConvFwdTensors{inputDesc, input, wDesc, weight, outputDesc, output}; + + const auto problem = miopen::conv::ProblemDescription{ + inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; + auto ctx = miopen::ExecutionContext{}; + + ctx.SetStream(&handle); + + if(!solv.IsApplicable(ctx, problem)) + { + test_skipped = true; + GTEST_SKIP() << solv.SolverDbId() + << "ConvHipImplicitGemm3DGroupFwdXdlops Not Applicable for this problem" + << conv_config; + } + const auto invoke_params = miopen::conv::DataInvokeParams{tensors, nullptr, 0, false}; + + ASSERT_TRUE(solv.IsApplicable(ctx, problem)); + auto sol = solv.GetSolution(ctx, problem, solv.GetDefaultPerformanceConfig(ctx, problem)); + ASSERT_TRUE(sol.Succeeded()); + ASSERT_TRUE(sol.invoker_factory); + const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); + (invoker)(handle, invoke_params); + handle.Finish(); +} + +TEST_P(ConvFwdSolverTest3D, CKNonPackConvFwd3D) +{ + SolverFwd(input.desc, + in_dev.get(), + weights.desc, + wei_dev.get(), + output.desc, + out_dev.get(), + conv_desc, + conv_config, + test_skipped); +} + +INSTANTIATE_TEST_SUITE_P(ConvFwdTest, + ConvFwdSolverTest3D, + testing::Combine(testing::Values(miopenConvolutionFwdAlgoImplicitGEMM), + testing::ValuesIn(ConvTestConfigs()), + testing::Values(miopenTensorNDHWC))); diff --git a/test/gtest/nonpack_conv3d_fwd.hpp b/test/gtest/nonpack_conv3d_fwd.hpp new file mode 100644 index 0000000000..5b3677cbb4 --- /dev/null +++ b/test/gtest/nonpack_conv3d_fwd.hpp @@ -0,0 +1,157 @@ 
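For reference, SolverFwd above is the direct-invocation pattern for exercising a single solver without going through the Find or Immediate APIs. Stripped of the test plumbing, the sequence condenses to the sketch below (error handling and tensor setup omitted):

```cpp
// Condensed from SolverFwd above:
//
//     Solver solv{};                            // e.g. the CK 3D grouped fwd solver
//     auto ctx = miopen::ExecutionContext{};
//     ctx.SetStream(&handle);                   // bind the handle's stream
//     if(!solv.IsApplicable(ctx, problem))      // skip unsupported configs
//         return;
//     auto sol = solv.GetSolution(ctx, problem,
//                                 solv.GetDefaultPerformanceConfig(ctx, problem));
//     const auto invoker =
//         handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
//     invoker(handle, invoke_params);           // enqueue the kernel(s)
//     handle.Finish();                          // block until completion
```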
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "conv3d_test_case.hpp" + +struct NonPackTestCase : Conv3DTestCase +{ + size_t i0; + size_t i1; + size_t i2; + size_t i3; + size_t i4; + size_t w0; + size_t w1; + size_t w2; + size_t w3; + size_t w4; + size_t o0; + size_t o1; + size_t o2; + size_t o3; + size_t o4; + std::vector GetInputStrides() { return {i0, i1, i2, i3, i4}; } + std::vector GetWeightStrides() { return {w0, w1, w2, w3, w4}; } + std::vector GetOutputStrides() { return {o0, o1, o2, o3, o4}; } +}; + +std::vector ConvTestConfigs() +{ // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z + return {{{1, 4, 16, 4, 9, 16, 16, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, + 10240, + 1, + 2560, + 160, + 16, + 432, + 1, + 144, + 48, + 16, + 9216, + 1, + 2304, + 256, + 16}, + {{1, 1, 64, 3, 16, 16, 128, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, + 65536, + 1, + 24000, + 2048, + 64, + 1728, + 1, + 576, + 192, + 64, + 98304, + 1, + 32768, + 2048, + 128}}; +} + +template +struct ConvFwdSolverTest + : public ::testing::TestWithParam< + std::tuple> +{ +protected: + void SetUp() override + { + test_skipped = false; + + std::tie(algo, conv_config, tensor_layout) = GetParam(); + input = tensor{ + miopen_type{}, tensor_layout, conv_config.GetInput(), conv_config.GetInputStrides()}; + weights = tensor{miopen_type{}, tensor_layout, conv_config.GetWeights()}; + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::uniform_real_distribution<> d{-3, 3}; + auto gen_value = [&](auto...) 
{ return d(gen); };
+        input.generate(gen_value);
+        weights.generate(gen_value);
+        conv_desc = conv_config.GetConv();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+        output = tensor{miopen_type{}, tensor_layout, output_desc.GetLengths()};
+        std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN());
+        auto&& handle = get_handle();
+        in_dev        = handle.Write(input.data);
+        wei_dev       = handle.Write(weights.data);
+        out_dev       = handle.Write(output.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+        ref_out = tensor{miopen_type{}, tensor_layout, output_desc.GetLengths()};
+        ref_out = ref_conv_fwd(input, weights, output, conv_desc);
+        output.data = handle.Read(out_dev, output.data.size());
+        EXPECT_FALSE(miopen::range_zero(ref_out)) << "Cpu data is all zeros";
+        EXPECT_FALSE(miopen::range_zero(output)) << "Gpu data is all zeros";
+        EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
+
+        const double tolerance = 80;
+        double threshold       = std::numeric_limits::epsilon() * tolerance;
+        auto error             = miopen::rms_range(ref_out, output);
+
+        EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
+            << "Non finite number found in the CPU data";
+
+        EXPECT_TRUE(error < threshold)
+            << "Error beyond tolerance. Error: " << error << ", Threshold: " << threshold;
+    }
+    NonPackTestCase conv_config;
+    miopen::ConvolutionDescriptor conv_desc;
+    tensor input;
+    tensor weights;
+    tensor output;
+    tensor ref_out;
+    miopen::Allocator::ManageDataPtr in_dev;
+    miopen::Allocator::ManageDataPtr wei_dev;
+    miopen::Allocator::ManageDataPtr out_dev;
+    miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoImplicitGEMM;
+    bool test_skipped             = false;
+    miopenTensorLayout_t tensor_layout;
+};

From 251c1d18718c83db9ad42adcf5a497e63c2a3eba Mon Sep 17 00:00:00 2001
From: Jun Liu
Date: Thu, 12 Oct 2023 01:09:25 -0700
Subject: [PATCH 35/36] Bump CK commit for ROCm 6.0 (#2439)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bd4a9cb992..9d0d711550 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,4 @@ nlohmann/json@v3.9.1 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off
 ROCmSoftwarePlatform/FunctionalPlus@v0.2.18-p0
 ROCmSoftwarePlatform/eigen@3.4.0
 ROCmSoftwarePlatform/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50
-ROCmSoftwarePlatform/composable_kernel@114c2646df9c45c531befc9eff9315b405098f56 -DDTYPES="fp16;fp32;bf16" -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON
\ No newline at end of file
+ROCmSoftwarePlatform/composable_kernel@503204136541d6d58194025f4220b603693e391c -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON
\ No newline at end of file

From 300045c73b2ef71887abbc60bc8f96488a5c999a Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Mon, 16 Oct 2023 21:55:35 -0500
Subject: [PATCH 36/36] [MI100][MI200] Kernel db updates (#2454)

---
 src/kernels/gfx908.kdb.bz2 | 4 ++--
 src/kernels/gfx90a.kdb.bz2 | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/kernels/gfx908.kdb.bz2 b/src/kernels/gfx908.kdb.bz2
index d19d4e35c2..014466d9d3 100644
--- a/src/kernels/gfx908.kdb.bz2
+++ b/src/kernels/gfx908.kdb.bz2
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:888797fb8848d87096fa447f82e96bbe61ecfca0f754667638fd6efa8da3004e
-size 336743541
+oid sha256:7f5b1925fbd2f58ab7011913867fcd59222e3cc09dee040f08acb6e7e22e0fda
+size 250110225
diff --git a/src/kernels/gfx90a.kdb.bz2 b/src/kernels/gfx90a.kdb.bz2
index 613df18f7a..b811a6b688 100644
--- a/src/kernels/gfx90a.kdb.bz2
+++ b/src/kernels/gfx90a.kdb.bz2
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d76d7c53648f4864a5cfe9267e8cb9171abab81de9d1732a9f94bafb0816b61
-size 250548882
+oid sha256:d22333c785c9081f293e56bc3f5baccc5410d578d2be1dc5e5d523d02de8b5ed
+size 250698825