From 7bef289e28b06d169367b9e2e907b97733f28558 Mon Sep 17 00:00:00 2001
From: bibek <108366729+bghimireamd@users.noreply.github.com>
Date: Sun, 27 Oct 2024 01:48:48 -0500
Subject: [PATCH] Merge all bn ck nchw branches (#3332)

---
 driver/bn_driver.hpp                          |   5 +-
 .../miopen/batchnorm/problem_description.hpp  |  20 ++-
 src/ocl/batchnormocl.cpp                      |   4 +-
 src/solver/batchnorm/backward_ck.cpp          |  49 ++++---
 src/solver/batchnorm/forward_inference_ck.cpp |  64 +++++----
 src/solver/batchnorm/forward_training_ck.cpp  |  47 ++++---
 test/bn_3d_spatial_test.cpp                   |  36 ++---
 test/bn_spatial_test.cpp                      |  29 ++--
 test/gtest/bn.hpp                             |  38 +++++
 test/gtest/bn_bwd.cpp                         | 132 +++++++++---------
 test/gtest/bn_fwd_train.cpp                   | 116 +++++++--------
 test/gtest/bn_infer.cpp                       | 120 ++++++++--------
 12 files changed, 359 insertions(+), 301 deletions(-)
diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp
index 238b4ea1e6..c89c3f166d 100644
--- a/driver/bn_driver.hpp
+++ b/driver/bn_driver.hpp
@@ -60,7 +60,7 @@
 #define ERRTOL_FP32 1e-4
 #define ERRTOL_FP16 0.5e-3
 #define RMSTOL_FP32 1e-4
-#define RMSTOL_FP16 0.5e-3
+#define RMSTOL_FP16 2e-3
 
 #define MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D 1 // Resolves issue #1974
 
@@ -1298,7 +1298,8 @@ int BatchNormDriver<Tgpu, Tref, Tmix>::VerifyForward()
 
     out.CopyFromDeviceToHost(GetStream());
 
-    maxval        = static_cast<Tref>(0.0);
+    maxval = static_cast<Tref>(0.0);
+
     auto errorOut = miopen::rms_range(out_ref.data, out.GetVector());
     if(!std::isfinite(errorOut) || errorOut > maxrms)
     {
diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp
index 8054111128..b0ecb64b7e 100644
--- a/src/include/miopen/batchnorm/problem_description.hpp
+++ b/src/include/miopen/batchnorm/problem_description.hpp
@@ -179,11 +179,7 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase,
         return scaleDesc;
     }
 
-    const TensorDescriptor& GetScaleBiasDiffDesc() const
-    {
-        assert(direction == Direction::Backward);
-        return scaleDesc;
-    }
+    const TensorDescriptor& GetScaleBiasDiffDesc() const { return scaleDesc; }
 
     bool GetResultSave() const
     {
@@ -217,6 +213,20 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase,
                                               : ((in_layout == "NDHWC") && (out_layout == "NDHWC"));
     }
 
+    bool IsLayoutNCHW() const
+    {
+        if(direction == Direction::Backward)
+        {
+            return xDesc.GetLengths().size() == 4
+                       ? ((in_layout == "NCHW") && (out_layout == "NCHW") && (din_layout == "NCHW"))
+                       : ((in_layout == "NCDHW") && (out_layout == "NCDHW") &&
+                          (din_layout == "NCDHW"));
+        }
+
+        return xDesc.GetLengths().size() == 4 ? ((in_layout == "NCHW") && (out_layout == "NCHW"))
+                                              : ((in_layout == "NCDHW") && (out_layout == "NCDHW"));
+    }
+
     bool Is2D() const { return xDesc.GetLengths().size() == 4; }
     bool Is3D() const { return xDesc.GetLengths().size() == 5; }
 
diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp
index dca94078be..f5555047c1 100644
--- a/src/ocl/batchnormocl.cpp
+++ b/src/ocl/batchnormocl.cpp
@@ -250,8 +250,8 @@ void BatchNormForwardInference(Handle& handle,
         }();
 
         const auto algo    = AlgorithmName{"miopenBatchNormalizationForwardInference"};
-        const auto solvers = solver::SolverContainer<solver::batchnorm::BnFwdInference,
-                                                     solver::batchnorm::BnCKFwdInference>{};
+        const auto solvers = solver::SolverContainer<solver::batchnorm::BnCKFwdInference,
+                                                     solver::batchnorm::BnFwdInference>{};
 
         solvers.ExecutePrimitive(handle, problem, algo, invoke_params);
     }
diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index c99a67250b..198b046eff 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -87,7 +87,21 @@ struct CKArgsBNormBwd
 
         // prep for CK
         std::sort(in_strides.begin(), in_strides.end(), std::greater<>());
-        std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end());
+
+        if(problem.IsLayoutNHWC())
+        {
+            std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end());
+            reduceDims = {0, 1, 2};
+        }
+        else if(problem.IsLayoutNCHW())
+        {
+            reduceDims = {0, 2, 3};
+        }
+        else
+        {
+            MIOPEN_THROW(miopenStatusInternalError,
+                         "BnCKBwd operation does not support this data layout");
+        }
     }
 
     CKArgsBNormBwd(const CKArgsBNormBwd&) = default;
@@ -133,7 +147,7 @@ struct CKArgsBNormBwd
     std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
 
     double epsilon = 1e-5;
-    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+    std::array<int, NumBatchNormReduceDim> reduceDims;
 };
 
 template <typename XDataType,
@@ -345,7 +359,7 @@ bool BnCKBwdBackward::IsApplicable(
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
     if(env::disabled(MIOPEN_DEBUG_CK_BN_BACK))
         return false;
-    if(!bn_problem.IsLayoutNHWC())
+    if(!bn_problem.IsLayoutNHWC() && !bn_problem.IsLayoutNCHW())
         return false;
     if(!ck_utility::is_ck_supported_hardware(context.GetStream()))
         return false;
@@ -353,6 +367,12 @@ bool BnCKBwdBackward::IsApplicable(
         return false;
     if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward)
         return false;
+    if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType())
+        return false;
+    if(bn_problem.GetMode() != miopenBNSpatial)
+        return false;
+    if(!bn_problem.Is2D())
+        return false;
 
     switch(bn_problem.GetXDesc().GetType())
     {
@@ -376,24 +396,15 @@ ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription&
                                    InvokerFactoryMakerNHWC&& invoker_factory_maker_nhwc)
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
-    if(problem.IsLayoutNHWC())
-    {
-        switch(problem.GetXDesc().GetType())
-        {
-        case miopenFloat: return invoker_factory_maker_nhwc(F32{});
-        case miopenDouble: return invoker_factory_maker_nhwc(F64{});
-        case miopenHalf: return invoker_factory_maker_nhwc(F16{});
-        case miopenBFloat16: return invoker_factory_maker_nhwc(BF16{});
-        default:
-            MIOPEN_THROW(miopenStatusInternalError,
-                         "BnCKBwdBackward operation does not support this data type");
-        }
-    }
-    // Todo: problem.IsLayoutDefault()
-    else
+    switch(problem.GetXDesc().GetType())
     {
+    case miopenFloat: return invoker_factory_maker_nhwc(F32{});
+    case miopenDouble: return invoker_factory_maker_nhwc(F64{});
+    case miopenHalf: return invoker_factory_maker_nhwc(F16{});
+    case miopenBFloat16: return invoker_factory_maker_nhwc(BF16{});
+    default:
         MIOPEN_THROW(miopenStatusInternalError,
-                     "BnCKBwdBackward operation does not support this data layout");
+                     "BnCKBwdBackward operation does not support this data type");
     }
 #else
     return {};
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index bec1805de6..529065d2ae 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -69,6 +69,7 @@ struct CKArgsBNormFwd
 {
     CKArgsBNormFwd(const miopen::batchnorm::ProblemDescription& problem)
     {
+
         std::copy(problem.GetXDesc().GetLengths().begin(),
                   problem.GetXDesc().GetLengths().end(),
                   xyLengths.begin());
@@ -78,12 +79,29 @@ struct CKArgsBNormFwd
                   xyStrides.begin());
         // prep for CK
         std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
-        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
 
-        aligned_scaleBiasMeanVarStrides[0] = 0;
-        aligned_scaleBiasMeanVarStrides[1] = 0;
-        aligned_scaleBiasMeanVarStrides[2] = 0;
-        aligned_scaleBiasMeanVarStrides[3] = 1;
+        if(problem.IsLayoutNHWC())
+        {
+            std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+            reduceDims                         = {0, 1, 2};
+            aligned_scaleBiasMeanVarStrides[0] = 0;
+            aligned_scaleBiasMeanVarStrides[1] = 0;
+            aligned_scaleBiasMeanVarStrides[2] = 0;
+            aligned_scaleBiasMeanVarStrides[3] = 1;
+        }
+        else if(problem.IsLayoutNCHW())
+        {
+            reduceDims                         = {0, 2, 3};
+            aligned_scaleBiasMeanVarStrides[0] = 0;
+            aligned_scaleBiasMeanVarStrides[1] = 1;
+            aligned_scaleBiasMeanVarStrides[2] = 0;
+            aligned_scaleBiasMeanVarStrides[3] = 0;
+        }
+        else
+        {
+            MIOPEN_THROW(miopenStatusInternalError,
+                         "BnCKFwdInference operation does not support this data layout");
+        }
     }
 
     std::array<ck::index_t, Rank> xyLengths;
@@ -91,8 +109,9 @@ struct CKArgsBNormFwd
     std::vector<int> invariantDims;
 
     std::array<index_t, Rank> aligned_scaleBiasMeanVarStrides{3};
+    std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
 
-    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+    std::array<int, NumBatchNormReduceDim> reduceDims;
 
     template <typename InvokerPtr, typename InvokerParams>
     auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const
@@ -305,7 +324,7 @@ bool BnCKFwdInference::IsApplicable(
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
     if(env::disabled(MIOPEN_DEBUG_CK_BN_INFER))
         return false;
-    if(!bn_problem.IsLayoutNHWC())
+    if(!bn_problem.IsLayoutNHWC() && !bn_problem.IsLayoutNCHW())
         return false;
     if(!ck_utility::is_ck_supported_hardware(context.GetStream()))
         return false;
@@ -313,6 +332,10 @@ bool BnCKFwdInference::IsApplicable(
         return false;
     if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference)
         return false;
+    if(bn_problem.GetMode() != miopenBNSpatial)
+        return false;
+    if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType())
+        return false;
 
     switch(bn_problem.GetXDesc().GetType())
     {
@@ -330,29 +353,20 @@ bool BnCKFwdInference::IsApplicable(
     return false;
 }
 
-template <typename InvokerFactoryMakerNHWC>
+template <typename InvokerFactoryMaker>
 ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& problem,
-                                   InvokerFactoryMakerNHWC&& invoker_factory_maker_nhwc)
+                                   InvokerFactoryMaker&& invoker_factory_maker)
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
-    if(problem.IsLayoutNHWC())
-    {
-        switch(problem.GetXDesc().GetType())
-        {
-        case miopenFloat: return invoker_factory_maker_nhwc(F32{});
-        case miopenDouble: return invoker_factory_maker_nhwc(F64{});
-        case miopenHalf: return invoker_factory_maker_nhwc(F16{});
-        case miopenBFloat16: return invoker_factory_maker_nhwc(BF16{});
-        default:
-            MIOPEN_THROW(miopenStatusInternalError,
-                         "BnCKFwdInference operation does not support this data type");
-        }
-    }
-    // Todo: problem.IsLayoutDefault()
-    else
+    switch(problem.GetXDesc().GetType())
     {
+    case miopenFloat: return invoker_factory_maker(F32{});
+    case miopenDouble: return invoker_factory_maker(F64{});
+    case miopenHalf: return invoker_factory_maker(F16{});
+    case miopenBFloat16: return invoker_factory_maker(BF16{});
+    default:
         MIOPEN_THROW(miopenStatusInternalError,
-                     "BnCKFwdInference operation does not support this data layout");
+                     "BnCKFwdInference operation does not support this data type");
     }
 #else
     return {};
diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp
index cdaafe7b58..fec919cf79 100644
--- a/src/solver/batchnorm/forward_training_ck.cpp
+++ b/src/solver/batchnorm/forward_training_ck.cpp
@@ -86,7 +86,21 @@ struct CKArgsBNormFwdTraining
 
         // prep for CK
         std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>());
-        std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+
+        if(problem.IsLayoutNHWC())
+        {
+            std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end());
+            reduceDims = {0, 1, 2};
+        }
+        else if(problem.IsLayoutNCHW())
+        {
+            reduceDims = {0, 2, 3};
+        }
+        else
+        {
+            MIOPEN_THROW(miopenStatusInternalError,
+                         "BnCKFwdTraining operation does not support this data layout");
+        }
     }
 
     CKArgsBNormFwdTraining(const CKArgsBNormFwdTraining&) = default;
@@ -131,7 +145,7 @@ struct CKArgsBNormFwdTraining
     std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarLengths;
     std::array<index_t, Rank - NumBatchNormReduceDim> arrScaleBiasMeanVarStrides;
 
-    std::array<int, NumBatchNormReduceDim> reduceDims{0, 1, 2};
+    std::array<int, NumBatchNormReduceDim> reduceDims;
 };
 
 template <typename XDataType,
@@ -337,7 +351,7 @@ bool BnCKFwdTraining::IsApplicable(
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
     if(env::disabled(MIOPEN_DEBUG_CK_BN_FWD_TRAINING))
         return false;
-    if(!bn_problem.IsLayoutNHWC())
+    if(!bn_problem.IsLayoutNHWC() && !bn_problem.IsLayoutNCHW())
         return false;
     if(!ck_utility::is_ck_supported_hardware(context.GetStream()))
         return false;
@@ -345,6 +359,10 @@ bool BnCKFwdTraining::IsApplicable(
         return false;
     if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining)
         return false;
+    if(bn_problem.GetMode() != miopenBNSpatial)
+        return false;
+    if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType())
+        return false;
 
     switch(bn_problem.GetXDesc().GetType())
     {
@@ -367,24 +385,15 @@ ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription&
                                    InvokerFactoryMakerNHWC&& invoker_factory_maker_nhwc)
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
-    if(problem.IsLayoutNHWC())
-    {
-        switch(problem.GetXDesc().GetType())
-        {
-        case miopenFloat: return invoker_factory_maker_nhwc(F32{});
-        case miopenDouble: return invoker_factory_maker_nhwc(F64{});
-        case miopenHalf: return invoker_factory_maker_nhwc(F16{});
-        case miopenBFloat16: return invoker_factory_maker_nhwc(BF16{});
-        default:
-            MIOPEN_THROW(miopenStatusInternalError,
-                         "BnCKFwdTraining operation does not support this data type");
-        }
-    }
-    // Todo: problem.IsLayoutDefault()
-    else
+    switch(problem.GetXDesc().GetType())
     {
+    case miopenFloat: return invoker_factory_maker_nhwc(F32{});
+    case miopenDouble: return invoker_factory_maker_nhwc(F64{});
+    case miopenHalf: return invoker_factory_maker_nhwc(F16{});
+    case miopenBFloat16: return invoker_factory_maker_nhwc(BF16{});
+    default:
         MIOPEN_THROW(miopenStatusInternalError,
-                     "BnCKFwdTraining operation does not support this data layout");
+                     "BnCKFwdTraining operation does not support this data type");
     }
 #else
     return {};
diff --git a/test/bn_3d_spatial_test.cpp b/test/bn_3d_spatial_test.cpp
index 08bfdb5a57..7b97d17e88 100644
--- a/test/bn_3d_spatial_test.cpp
+++ b/test/bn_3d_spatial_test.cpp
@@ -1203,6 +1203,8 @@ struct batch_norm_3d_spatial_driver : test_driver
     batch_norm_3d_spatial_driver()
     {
         this->batch_factor = 4;
+        this->tolerance =
+            4e-3 / std::numeric_limits<T>::epsilon(); // ck solver has tolerance of 4e-3
         add(input,
             "input",
             get_3d_bn_spatial_input_tensor(
@@ -1233,34 +1235,18 @@ struct batch_norm_3d_spatial_driver : test_driver
         miopen::DeriveBNTensorDescriptor(derivedBnDesc, input.desc, miopenBNSpatial);
         std::tie(ssn, ssc, ssd, ssh, ssw) = miopen::tien<5>(derivedBnDesc.GetLengths());
 
-        if(input.desc.GetType() == miopenFloat)
-        {
-            scale =
-                tensor<PREC_TYPE>{ssn, ssc, ssd, ssh, ssw}.generate(tensor_elem_gen_integer{17});
-            shift =
-                tensor<PREC_TYPE>{ssn, ssc, ssd, ssh, ssw}.generate(tensor_elem_gen_integer{17});
+        scale                   = tensor<PREC_TYPE>{ssn, ssc, ssd, ssh, ssw};
+        shift                   = tensor<PREC_TYPE>{ssn, ssc, ssd, ssh, ssw};
+        const double Data_scale = 1e-4;
 
-            if(d * h * w < 3072)
-            {
-                std::cout << "Choosing smaller input values for low dims" << std::endl;
-                input = tensor<T>{n, c, d, h, w}.generate(tensor_elem_gen_integer{7});
-            }
+        for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++)
+        {
+            scale[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
+            shift[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
         }
-        else
+        for(std::size_t i = 0; i < input.desc.GetElementSize(); i++)
         {
-            scale = tensor<PREC_TYPE>{ssn, ssc, ssd, ssh, ssw};
-            shift = tensor<PREC_TYPE>{ssn, ssc, ssd, ssh, ssw};
-
-            const double Data_scale = 1e-4;
-            for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++)
-            {
-                scale[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
-                shift[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
-            }
-            for(std::size_t i = 0; i < input.desc.GetElementSize(); i++)
-            {
-                input[i] = prng::gen_descreet_uniform_sign<T>(1e-5, 100);
-            }
+            input[i] = prng::gen_descreet_uniform_sign<T>(1e-5, 100);
         }
 
 // train
diff --git a/test/bn_spatial_test.cpp b/test/bn_spatial_test.cpp
index 95a8ee099a..3db862c45b 100644
--- a/test/bn_spatial_test.cpp
+++ b/test/bn_spatial_test.cpp
@@ -1130,6 +1130,8 @@ struct batch_norm_spatial_driver : test_driver
     batch_norm_spatial_driver()
     {
         this->batch_factor = 4;
+        this->tolerance =
+            4e-3 / std::numeric_limits<T>::epsilon(); // ck solver has tolerance of 4e-3
         add(input,
             "input",
             get_bn_spatial_input_tensor(
@@ -1155,26 +1157,18 @@ struct batch_norm_spatial_driver : test_driver
         miopen::DeriveBNTensorDescriptor(derivedBnDesc, input.desc, miopenBNSpatial);
         std::tie(ssn, ssc, ssh, ssw) = miopen::tien<4>(derivedBnDesc.GetLengths());
 
-        if(input.desc.GetType() == miopenFloat)
+        scale                   = tensor<PREC_TYPE>{ssn, ssc, ssh, ssw};
+        shift                   = tensor<PREC_TYPE>{ssn, ssc, ssh, ssw};
+        const double Data_scale = 1e-2;
+
+        for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++)
         {
-            scale = tensor<PREC_TYPE>{ssn, ssc, ssh, ssw}.generate(tensor_elem_gen_integer{17});
-            shift = tensor<PREC_TYPE>{ssn, ssc, ssh, ssw}.generate(tensor_elem_gen_integer{17});
+            scale[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
+            shift[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
         }
-        else
+        for(std::size_t i = 0; i < input.desc.GetElementSize(); i++)
         {
-            scale = tensor<PREC_TYPE>{ssn, ssc, ssh, ssw};
-            shift = tensor<PREC_TYPE>{ssn, ssc, ssh, ssw};
-
-            const double Data_scale = 1e-4;
-            for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++)
-            {
-                scale[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
-                shift[i] = prng::gen_descreet_uniform_sign<PREC_TYPE>(Data_scale, 100);
-            }
-            for(std::size_t i = 0; i < input.desc.GetElementSize(); i++)
-            {
-                input[i] = prng::gen_descreet_uniform_sign<T>(1e-5, 100);
-            }
+            input[i] = prng::gen_descreet_uniform_sign<T>(Data_scale, 100);
         }
 
 // train
@@ -1193,6 +1187,7 @@ struct batch_norm_spatial_driver : test_driver
         // std::fill(input.begin(), input.end(), 1);
         // std::fill(scale.begin(), scale.end(), 1);
         // std::fill(shift.begin(), shift.end(), 1);
+        this->tolerance = 80 * input.desc.GetElementSize();
         verify(verify_forward_infer_bn_spatial_recalc<T, PREC_TYPE>{input, scale, shift});
 
         // inference use estimated running values
diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp
index e1f192c37d..a195b6b81e 100644
--- a/test/gtest/bn.hpp
+++ b/test/gtest/bn.hpp
@@ -39,6 +39,44 @@ enum BNApiType
     testBNAPIV2,
 };
 
+// Assuming miopenTensorLayout_t and testAPI_t are the types of your enums
+static std::string LayoutToString(int tensor_format)
+{
+    switch(tensor_format)
+    {
+    case miopenTensorNCHW: return "NCHW";
+    case miopenTensorNHWC: return "NHWC";
+    default: return "UnknownTensorFormat";
+    }
+}
+
+static std::string ApiVerisonToString(int api_version)
+{
+    switch(api_version)
+    {
+    case testBNAPIV1: return "testBNAPIV1";
+    case testBNAPIV2: return "testBNAPIV2";
+    default: return "UnknownAPIVersion";
+    }
+}
+
+// Custom test name generator to handle enums
+struct TestNameGenerator
+{
+    std::string operator()(
+        const testing::TestParamInfo<std::tuple<BNTestCase, miopenTensorLayout_t, BNApiType>>& info)
+        const
+    {
+        const auto& layout_type = std::get<1>(info.param);
+        const auto& api_type    = std::get<2>(info.param);
+
+        std::string tensor_name = LayoutToString(layout_type);
+        std::string api_name    = ApiVerisonToString(api_type);
+
+        return tensor_name + "_" + api_name + "_" + std::to_string(info.index);
+    }
+};
+
 template <typename XDataType,
           typename YDataType,
           typename ScaleDataType,
diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp
index df093a4710..164c36bb18 100644
--- a/test/gtest/bn_bwd.cpp
+++ b/test/gtest/bn_bwd.cpp
@@ -26,25 +26,12 @@
 
 #include "bn.hpp"
 
-// https://github.com/ROCm/MIOpen/issues/1549
-// NCHW solver accepts
-// XDataType       : half_float::half
-// YDataYype       : half_float::half
-// ScaleDataType   : half_float::half
-// BiasDataType    : half_float::half
-// MeanVarDataType : half_float::half
-// struct GPU_BN_V1_BwdNCHW_FP16 : BNBwdTest<half_float::half, half_float::half, half_float::half,
-// half_float::half, half_float::half, half_float::half, half_float::half>
-// {
-// };
-
-// NHWC solver accepts
-// XDataType       : half_float::half
-// YDataYype       : half_float::half
-// ScaleDataType   : half_float::half
-// BiasDataType    : half_float::half
-// MeanVarDataType : float
-struct GPU_BN_V2_BwdNHWC_FP16
+struct GPU_BN_BWD_Small_FP16
+    : BNBwdTest<half_float::half, float, float, float, half_float::half, float, float>
+{
+};
+
+struct GPU_BN_BWD_Large_FP16
     : BNBwdTest<half_float::half, float, float, float, half_float::half, float, float>
 {
 };
@@ -55,94 +42,101 @@ struct GPU_BN_V2_BwdNHWC_FP16
 // ScaleDataType   : bfloat16
 // BiasDataType    : bfloat16
 // MeanVarDataType : float
-struct GPU_BN_V1_BwdNHWC_BFP16 : BNBwdTest<bfloat16, float, float, float, bfloat16, float, float>
+struct GPU_BN_BWD_Small_BFP16 : BNBwdTest<bfloat16, float, float, float, bfloat16, float, float>
 {
 };
 
-struct GPU_BN_V2_BwdNHWC_BFP16 : BNBwdTest<bfloat16, float, float, float, bfloat16, float, float>
+struct GPU_BN_BWD_Large_BFP16 : BNBwdTest<bfloat16, float, float, float, bfloat16, float, float>
 {
 };
 
-struct GPU_BN_V1_Bwd_FP32 : BNBwdTest<float, float, float, float, float, float, float>
+struct GPU_BN_BWD_Small_FP32 : BNBwdTest<float, float, float, float, float, float, float>
 {
 };
 
-struct GPU_BN_V2_Bwd_FP32 : BNBwdTest<float, float, float, float, float, float, float>
+struct GPU_BN_BWD_Large_FP32 : BNBwdTest<float, float, float, float, float, float, float>
 {
 };
 
-struct GPU_BN_V1_BwdNHWC_FP64 : BNBwdTest<double, double, double, double, double, double, double>
+struct GPU_BN_BWD_Small_FP64 : BNBwdTest<double, double, double, double, double, double, double>
 {
 };
 
-struct GPU_BN_V2_BwdNHWC_FP64 : BNBwdTest<double, double, double, double, double, double, double>
+struct GPU_BN_BWD_Large_FP64 : BNBwdTest<double, double, double, double, double, double, double>
 {
 };
 
 // fp16
-// TEST_P(GPU_BN_V1_BwdNCHW_FP16, BnV1BwdHalf) {}
-TEST_P(GPU_BN_V2_BwdNHWC_FP16, BnV2BwdCKHalf) {}
+TEST_P(GPU_BN_BWD_Small_FP16, BnV1SmallBWDCKfp16) {}
+TEST_P(GPU_BN_BWD_Large_FP16, BnV2LargeBWDCKfp16) {}
 
-// float
-TEST_P(GPU_BN_V1_Bwd_FP32, BnV1BwdFloat) {}
-TEST_P(GPU_BN_V2_Bwd_FP32, BnV2BwdFloat) {}
+// bfp16
+TEST_P(GPU_BN_BWD_Small_BFP16, BnV1SmallBWDCKbfp16) {}
+TEST_P(GPU_BN_BWD_Large_BFP16, BnV2LargeBWDCKbfp16) {}
 
-// bfp16 is only on CK solver
-TEST_P(GPU_BN_V1_BwdNHWC_BFP16, BnV1BwdCKBfloat) {}
-TEST_P(GPU_BN_V2_BwdNHWC_BFP16, BnV2BwdCKBfloat) {}
+// fp32 (float)
+TEST_P(GPU_BN_BWD_Small_FP32, BnV1SmallBWDCKfp32) {}
+TEST_P(GPU_BN_BWD_Large_FP32, BnV2LargeBWDCKfp32) {}
 
-// double is only on CK solver
-TEST_P(GPU_BN_V1_BwdNHWC_FP64, BnV1BwdCKDouble) {}
-TEST_P(GPU_BN_V2_BwdNHWC_FP64, BnV2BwdCKDouble) {}
+// fp64
+TEST_P(GPU_BN_BWD_Small_FP64, BnV1SmallBWDCKfp64) {}
+TEST_P(GPU_BN_BWD_Large_FP64, BnV2LargeBWDCKfp64) {}
 
-// // fp16
-// INSTANTIATE_TEST_SUITE_P(Smoke,
-//                          GPU_BN_V1_BwdNCHW_FP16,
-//                          testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-//                                           testing::Values(miopenTensorNCHW),
-//                                           testing::ValuesIn({testBNAPIV1})));
+// fp16
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_BN_BWD_Small_FP16,
+                         testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV1})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_BwdNHWC_FP16,
+                         GPU_BN_BWD_Large_FP16,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
-// fp32
+// bfp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_Bwd_FP32,
+                         GPU_BN_BWD_Small_BFP16,
                          testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNCHW),
-                                          testing::ValuesIn({testBNAPIV1})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV1})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_Bwd_FP32,
+                         GPU_BN_BWD_Large_BFP16,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
-// bfp16 is only on CK solver
+// fp32
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_BwdNHWC_BFP16,
-                         testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                         GPU_BN_BWD_Small_FP32,
+                         testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_BwdNHWC_BFP16,
+                         GPU_BN_BWD_Large_FP32,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
-
-// fp64 is only on CK solver
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
+// fp64
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_BwdNHWC_FP64,
-                         testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                         GPU_BN_BWD_Small_FP64,
+                         testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_BwdNHWC_FP64,
+                         GPU_BN_BWD_Large_FP64,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp
index ed25631175..8cc60e2d55 100644
--- a/test/gtest/bn_fwd_train.cpp
+++ b/test/gtest/bn_fwd_train.cpp
@@ -26,119 +26,121 @@
 
 #include "bn.hpp"
 
-// ** OCL kernel for fwd training are failing gtest **
-// ** Hence, this gtest only tests CK solvers **
-
-// NHWC solver accepts
-// XDataType       : half_float::half
-// YDataYype       : half_float::half
-// ScaleDataType   : half_float::half
-// BiasDataType    : half_float::half
+// XDataType       : half
+// YDataYype       : half
+// ScaleDataType   : half
+// BiasDataType    : half
 // MeanVarDataType : float
-struct GPU_BN_V1_FwdTrainNHWC_FP16
+struct GPU_BN_FWD_Train_Small_FP16
     : BNFwdTrainTest<half_float::half, half_float::half, half_float::half, half_float::half, float>
 {
 };
 
-struct GPU_BN_V2_FwdTrainNHWC_FP16
+struct GPU_BN_FWD_Train_Large_FP16
     : BNFwdTrainTest<half_float::half, half_float::half, half_float::half, half_float::half, float>
 {
 };
 
-// bf16 NHWC solver accepts is only on CK solver
 // XDataType       : bfloat16
 // YDataYype       : bfloat16
 // ScaleDataType   : bfloat16
 // BiasDataType    : bfloat16
 // MeanVarDataType : float
-struct GPU_BN_V1_FwdTrainNHWC_BFP16 : BNFwdTrainTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
+struct GPU_BN_FWD_Train_Small_BFP16 : BNFwdTrainTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
 {
 };
 
-struct GPU_BN_V2_FwdTrainNHWC_BFP16 : BNFwdTrainTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
+struct GPU_BN_FWD_Train_Large_BFP16 : BNFwdTrainTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
 {
 };
 
-struct GPU_BN_V1_FwdTrainNHWC_FP32 : BNFwdTrainTest<float, float, float, float, float>
+struct GPU_BN_FWD_Train_Small_FP32 : BNFwdTrainTest<float, float, float, float, float>
 {
 };
 
-struct GPU_BN_V2_FwdTrainNHWC_FP32 : BNFwdTrainTest<float, float, float, float, float>
+struct GPU_BN_FWD_Train_Large_FP32 : BNFwdTrainTest<float, float, float, float, float>
 {
 };
 
-struct GPU_BN_V1_FwdTrainNHWC_FP64 : BNFwdTrainTest<double, double, double, double, double>
+struct GPU_BN_FWD_Train_Small_FP64 : BNFwdTrainTest<double, double, double, double, double>
 {
 };
 
-struct GPU_BN_V2_FwdTrainNHWC_FP64 : BNFwdTrainTest<double, double, double, double, double>
+struct GPU_BN_FWD_Train_Large_FP64 : BNFwdTrainTest<double, double, double, double, double>
 {
 };
 
 // fp16
-TEST_P(GPU_BN_V1_FwdTrainNHWC_FP16, BnV1FwdTrainHalf) {}
-TEST_P(GPU_BN_V2_FwdTrainNHWC_FP16, BnV2FwdTrainCKHalf) {}
-
-// float
-TEST_P(GPU_BN_V1_FwdTrainNHWC_FP32, BnV1FwdTrainFloat) {}
-TEST_P(GPU_BN_V2_FwdTrainNHWC_FP32, BnV2FwdTrainFloat) {}
+TEST_P(GPU_BN_FWD_Train_Small_FP16, BnV1SmallFWD_TrainCKfp16) {}
+TEST_P(GPU_BN_FWD_Train_Large_FP16, BnV2LargeFWD_TrainCKfp16) {}
 
 // bfp16
-TEST_P(GPU_BN_V1_FwdTrainNHWC_BFP16, BnV1FwdTrainCKBfloat) {}
-TEST_P(GPU_BN_V2_FwdTrainNHWC_BFP16, BnV2FwdTrainCKBfloat) {}
+TEST_P(GPU_BN_FWD_Train_Small_BFP16, BnV1SmallFWD_TrainCKbfp16) {}
+TEST_P(GPU_BN_FWD_Train_Large_BFP16, BnV2LargeFWD_TrainCKbfp16) {}
+
+// fp32 (float)
+TEST_P(GPU_BN_FWD_Train_Small_FP32, BnV1SmallFWD_TrainCKfp32) {}
+TEST_P(GPU_BN_FWD_Train_Large_FP32, BnV2LargeFWD_TrainCKfp32) {}
 
-// double
-TEST_P(GPU_BN_V1_FwdTrainNHWC_FP64, BnV1FwdTrainCKDouble) {}
-TEST_P(GPU_BN_V2_FwdTrainNHWC_FP64, BnV2FwdTrainCKDouble) {}
+// fp64
+TEST_P(GPU_BN_FWD_Train_Small_FP64, BnV1SmallFWD_TrainCKfp64) {}
+TEST_P(GPU_BN_FWD_Train_Large_FP64, BnV2LargeFWD_TrainCKfp64) {}
 
 // fp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_FwdTrainNHWC_FP16,
+                         GPU_BN_FWD_Train_Small_FP16,
                          testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV1})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_FwdTrainNHWC_FP16,
+                         GPU_BN_FWD_Train_Large_FP16,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
-// fp32
+// bfp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_FwdTrainNHWC_FP32,
+                         GPU_BN_FWD_Train_Small_BFP16,
                          testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV1})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_FwdTrainNHWC_FP32,
+                         GPU_BN_FWD_Train_Large_BFP16,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
-// bfp16
+// fp32
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_FwdTrainNHWC_BFP16,
-                         testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                         GPU_BN_FWD_Train_Small_FP32,
+                         testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_FwdTrainNHWC_BFP16,
+                         GPU_BN_FWD_Train_Large_FP32,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
-
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 // fp64
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_FwdTrainNHWC_FP64,
-                         testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                         GPU_BN_FWD_Train_Small_FP64,
+                         testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_FwdTrainNHWC_FP64,
+                         GPU_BN_FWD_Train_Large_FP64,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp
index aea15c097e..4f32a2b3bd 100644
--- a/test/gtest/bn_infer.cpp
+++ b/test/gtest/bn_infer.cpp
@@ -26,27 +26,18 @@
 
 #include "bn.hpp"
 
-// NCHW solver accepts
+// NHWC solver accepts
 // XDataType       : half_float::half
 // YDataYype       : half_float::half
 // ScaleDataType   : half_float::half
 // BiasDataType    : half_float::half
-// MeanVarDataType : half_float::half
-struct GPU_BN_V1_InferNCHW_FP16 : BNInferTest<half_float::half,
-                                              half_float::half,
-                                              half_float::half,
-                                              half_float::half,
-                                              half_float::half>
+// MeanVarDataType : float
+struct GPU_BN_Infer_Small_FP16
+    : BNInferTest<half_float::half, half_float::half, half_float::half, half_float::half, float>
 {
 };
 
-// NHWC solver accepts
-// XDataType       : half_float::half
-// YDataYype       : half_float::half
-// ScaleDataType   : half_float::half
-// BiasDataType    : half_float::half
-// MeanVarDataType : float
-struct GPU_BN_V2_InferNHWC_FP16
+struct GPU_BN_Infer_Large_FP16
     : BNInferTest<half_float::half, half_float::half, half_float::half, half_float::half, float>
 {
 };
@@ -57,94 +48,101 @@ struct GPU_BN_V2_InferNHWC_FP16
 // ScaleDataType   : bfloat16
 // BiasDataType    : bfloat16
 // MeanVarDataType : float
-struct GPU_BN_V1_InferNHWC_BFP16 : BNInferTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
+struct GPU_BN_Infer_Small_BFP16 : BNInferTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
 {
 };
 
-struct GPU_BN_V2_InferNHWC_BFP16 : BNInferTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
+struct GPU_BN_Infer_Large_BFP16 : BNInferTest<bfloat16, bfloat16, bfloat16, bfloat16, float>
 {
 };
 
-struct GPU_BN_V1_Infer_FP32 : BNInferTest<float, float, float, float, float>
+struct GPU_BN_Infer_Small_FP32 : BNInferTest<float, float, float, float, float>
 {
 };
 
-struct GPU_BN_V2_Infer_FP32 : BNInferTest<float, float, float, float, float>
+struct GPU_BN_Infer_Large_FP32 : BNInferTest<float, float, float, float, float>
 {
 };
 
-struct GPU_BN_V1_InferNHWC_FP64 : BNInferTest<double, double, double, double, double>
+struct GPU_BN_Infer_Small_FP64 : BNInferTest<double, double, double, double, double>
 {
 };
 
-struct GPU_BN_V2_InferNHWC_FP64 : BNInferTest<double, double, double, double, double>
+struct GPU_BN_Infer_Large_FP64 : BNInferTest<double, double, double, double, double>
 {
 };
 
 // fp16
-TEST_P(GPU_BN_V1_InferNCHW_FP16, BnV1InferHalf) {}
-TEST_P(GPU_BN_V2_InferNHWC_FP16, BnV2InferCKHalf) {}
+TEST_P(GPU_BN_Infer_Small_FP16, BnV1SmallInferCKfp16) {}
+TEST_P(GPU_BN_Infer_Large_FP16, BnV2LargeInferCKfp16) {}
 
-// float
-TEST_P(GPU_BN_V1_Infer_FP32, BnV1InferFloat) {}
-TEST_P(GPU_BN_V2_Infer_FP32, BnV2InferFloat) {}
+// bfp16
+TEST_P(GPU_BN_Infer_Small_BFP16, BnV1SmallInferCKbfp16) {}
+TEST_P(GPU_BN_Infer_Large_BFP16, BnV2LargeInferCKbfp16) {}
 
-// bfp16 is only on CK solver
-TEST_P(GPU_BN_V1_InferNHWC_BFP16, BnV1InferCKBfloat) {}
-TEST_P(GPU_BN_V2_InferNHWC_BFP16, BnV2InferCKBfloat) {}
+// fp32 (float)
+TEST_P(GPU_BN_Infer_Small_FP32, BnV1SmallInferCKfp32) {}
+TEST_P(GPU_BN_Infer_Large_FP32, BnV2LargeInferCKfp32) {}
 
-// double is only on CK solver
-TEST_P(GPU_BN_V1_InferNHWC_FP64, BnV1InferCKDouble) {}
-TEST_P(GPU_BN_V2_InferNHWC_FP64, BnV2InferCKDouble) {}
+// fp64
+TEST_P(GPU_BN_Infer_Small_FP64, BnV1SmallInferCKfp64) {}
+TEST_P(GPU_BN_Infer_Large_FP64, BnV2LargeInferCKfp64) {}
 
 // fp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_InferNCHW_FP16,
+                         GPU_BN_Infer_Small_FP16,
                          testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNCHW),
-                                          testing::ValuesIn({testBNAPIV1})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV1})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_InferNHWC_FP16,
+                         GPU_BN_Infer_Large_FP16,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
-// fp32
+// bfp16
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_Infer_FP32,
+                         GPU_BN_Infer_Small_BFP16,
                          testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNCHW),
-                                          testing::ValuesIn({testBNAPIV1})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV1})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_Infer_FP32,
+                         GPU_BN_Infer_Large_BFP16,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
-// bfp16 is only on CK solver
+// fp32
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_InferNHWC_BFP16,
-                         testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                         GPU_BN_Infer_Small_FP32,
+                         testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_InferNHWC_BFP16,
+                         GPU_BN_Infer_Large_FP32,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
-
-// fp64 is only on CK solver
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
+// fp64
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V1_InferNHWC_FP64,
-                         testing::Combine(testing::ValuesIn(NetworkSmall<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV1})));
+                         GPU_BN_Infer_Small_FP64,
+                         testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BN_V2_InferNHWC_FP64,
+                         GPU_BN_Infer_Large_FP64,
                          testing::Combine(testing::ValuesIn(NetworkLarge<BNTestCase>()),
-                                          testing::Values(miopenTensorNHWC),
-                                          testing::ValuesIn({testBNAPIV2})));
+                                          testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}),
+                                          testing::ValuesIn({testBNAPIV2})),
+                         TestNameGenerator());