From f02b6c31b98a7a5ddf7c2e6c07fc122a719f71cb Mon Sep 17 00:00:00 2001 From: Jing Zhou Date: Wed, 27 Oct 2021 17:48:30 -0700 Subject: [PATCH 01/10] refactor ocl kernels for batchnorm nhwc --- .../miopen/batchnorm/problem_description.hpp | 28 +++- src/kernels/MIOpenBatchNormBwdSpatial.cl | 129 +++++++++++++---- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 49 +++++-- .../batchnorm/backward_spatial_multiple.cpp | 126 +++++++++-------- .../batchnorm/backward_spatial_single.cpp | 130 ++++++++++-------- .../batchnorm/forward_spatial_multiple.cpp | 54 ++++---- .../batchnorm/forward_spatial_single.cpp | 92 +++++++------ 7 files changed, 388 insertions(+), 220 deletions(-) diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index f64ac2d901..63582e3026 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -67,6 +67,8 @@ struct ProblemDescription resultsave(resultsave_), resultrunning(resultrunning_) { + in_layout = xDesc.GetLayout(xDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); + out_layout = yOrDyDesc.GetLayout(yOrDyDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); } // Backward @@ -86,6 +88,9 @@ struct ProblemDescription epsilon(epsilon_), useSaved(useSaved_) { + in_layout = xDesc.GetLayout(xDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); + out_layout = yOrDyDesc.GetLayout(yOrDyDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); + din_layout = dxDesc.GetLayout(dxDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); } Direction GetDirection() const { return direction; } @@ -140,6 +145,20 @@ struct ProblemDescription return useSaved; } + bool IsLayoutNHWC() const + { + if(direction == Direction::Backward) + { + return xDesc.GetLengths().size() == 4 + ? 
((in_layout == "NHWC") && (out_layout == "NHWC") && (din_layout == "NHWC")) + : ((in_layout == "NDHWC") && (out_layout == "NDHWC") && + (din_layout == "NDHWC")); + } + + return xDesc.GetLengths().size() == 4 ? ((in_layout == "NHWC") && (out_layout == "NHWC")) + : ((in_layout == "NDHWC") && (out_layout == "NDHWC")); + } + NetworkConfig MakeNetworkConfig() const; void Serialize(std::ostream& stream) const; @@ -159,9 +178,12 @@ struct ProblemDescription TensorDescriptor scaleBiasDesc; double expAvgFactor = 0; double epsilon; - bool resultsave = false; - bool resultrunning = false; - bool useSaved = false; + bool resultsave = false; + bool resultrunning = false; + bool useSaved = false; + std::string in_layout = "NCHW"; + std::string out_layout = "NCHW"; + std::string din_layout = "NCHW"; NetworkConfig MakeForwardTrainingNetworkConfig() const; NetworkConfig MakeForwardInferenceNetworkConfig() const; diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index a3a144ee33..961c543822 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -144,7 +144,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { variance = 0; } - invVariance = rsqrt(variance + epsilon); + invVariance = rsqrt(variance + epsilon); #endif // end -- Recalc mean and variance //------------------------------------------- @@ -229,9 +229,19 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #elif(MIO_BN_VARIANT == 1) +#ifndef MIO_LAYOUT_NHWC +#define MIO_LAYOUT_NHWC 0 +#endif + +#if MIO_LAYOUT_NHWC == 1 +#define MIO_MAX_READ 1 +#define RD_BLK 1 +#define GRPRD (MIO_BN_GRP0 * RD_BLK) +#else #define MIO_MAX_READ 2 #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK * 4) +#endif #define MIO_BN_REM4 (MIO_BN_NHW - ((MIO_BN_NHW / GRPRD) * GRPRD)) #define MIO_BN_LESS4 (MIO_BN_NHW - MIO_BN_REM4) #define MIO_BN_CHUNK4 (MIO_MAX_READ * GRPRD) @@ -278,7 +288,9 @@ 
MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int index = 0; unsigned int lid = get_local_id(0); unsigned int grpid = get_group_id(0); +#if MIO_LAYOUT_NHWC == 0 unsigned int chwid = grpid * MIO_BN_HW; +#endif unsigned int nidx = 0; unsigned int hwidx = 0; @@ -295,7 +307,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if(MIO_BN_USESAVED == 0) //==== CALC MEAN and VARIANCE ONCE AGAIN ======================= _FLOAT_PREC variance = (_FLOAT_PREC)0.; -#if(MIO_BN_HW >= 4096) +#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) _FLOAT4 read4; #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; @@ -350,7 +362,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC in = (_FLOAT_PREC)(*(x_in + index)); mean += in; variance = mad(in, in, variance); @@ -361,7 +377,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC in = (index < MIO_BN_NCHW) ? 
(_FLOAT_PREC)(*(x_in + index)) : (_FLOAT_PREC)0.; mean += in; variance = mad(in, in, variance); @@ -396,19 +416,47 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #endif +#if MIO_LAYOUT_NHWC == 1 + _FLOAT dyRead; + _FLOAT xread; + _FLOAT_PREC xhat; +#else _FLOAT4 dyRead4; _FLOAT4 xread4; _FLOAT_PREC4 xhat4; +#endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = +#if MIO_LAYOUT_NHWC == 1 + lid +#else + lid << 2 +#endif + ; + k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = +#if MIO_LAYOUT_NHWC == 1 + lid +#else + lid << 2 +#endif + ; + k < MIO_BN_LESS4; k += GRPRD) #endif { - nidx = k / MIO_BN_HW; - hwidx = k - (nidx * MIO_BN_HW); + nidx = k / MIO_BN_HW; + hwidx = k - (nidx * MIO_BN_HW); +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; + xread = *((const global _FLOAT*)(x_in + index)); + dyRead = *((const global _FLOAT*)(dy_in + index)); + xhat = ((_FLOAT_PREC)xread - mean) * invVariance; + db += (_FLOAT_PREC)dyRead; + ds = mad(xhat, (_FLOAT_PREC)dyRead, ds); +#else index = nidx * MIO_BN_CHW + chwid + hwidx; xread4 = *((const global _FLOAT4*)(x_in + index)); dyRead4 = *((const global _FLOAT4*)(dy_in + index)); @@ -424,13 +472,31 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, ds = mad(xhat4.y, (_FLOAT_PREC)dyRead4.y, ds); ds = mad(xhat4.z, (_FLOAT_PREC)dyRead4.z, ds); ds = mad(xhat4.w, (_FLOAT_PREC)dyRead4.w, ds); +#endif } #if(MIO_BN_REM4) - unsigned int remkey = (lid << 2) + MIO_BN_LESS4; - nidx = remkey / MIO_BN_HW; - hwidx = remkey - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; + unsigned int remkey = +#if MIO_LAYOUT_NHWC == 1 + lid +#else + (lid << 2) +#endif + + MIO_BN_LESS4; + nidx = 
remkey / MIO_BN_HW; + hwidx = remkey - (nidx * MIO_BN_HW); + index = nidx * MIO_BN_CHW + +#if MIO_LAYOUT_NHWC == 1 + hwidx * MIO_BN_C + grpid; + if(index < MIO_BN_NCHW) + { + xread = *((const global _FLOAT*)(x_in + index)); + dyRead = *((const global _FLOAT*)(dy_in + index)); + xhat = ((_FLOAT_PREC)xread - mean) * invVariance; + db += (_FLOAT_PREC)dyRead; + ds = mad(xhat.x, (_FLOAT_PREC)dyRead.x, ds); +#else + chwid + hwidx; if(index < (MIO_BN_NCHW - 3)) { xread4 = *((const global _FLOAT4*)(x_in + index)); @@ -447,6 +513,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, ds = mad(xhat4.y, (_FLOAT_PREC)dyRead4.y, ds); ds = mad(xhat4.z, (_FLOAT_PREC)dyRead4.z, ds); ds = mad(xhat4.w, (_FLOAT_PREC)dyRead4.w, ds); +#endif } #endif @@ -491,12 +558,16 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, for(unsigned int j = 0; j < MIO_MAX_READ; j++) #endif { - unsigned int l = k + j; - nidx = l / MIO_BN_HW; - hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; - dyvalue = (_FLOAT_PREC)(*(dy_in + index)); - xhat = ((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance; + unsigned int l = k + j; + nidx = l / MIO_BN_HW; + hwidx = l - (nidx * MIO_BN_HW); +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif + dyvalue = (_FLOAT_PREC)(*(dy_in + index)); + xhat = ((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance; #if MIOPEN_USE_FP16 == 1 float temp_tmp1 = mad((float)NHW, (float)dyvalue, -temp_db); float temp_tmp2 = -((float)xhat) * temp_ds; @@ -518,7 +589,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = k + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif *(dx_out + index) = (_FLOAT)vals[j]; } } @@ 
-534,7 +609,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif if(index < MIO_BN_NCHW) { dyvalue = (_FLOAT_PREC)(*(dy_in + index)); @@ -554,7 +633,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif if(index < MIO_BN_NCHW) { *(dx_out + index) = (_FLOAT_PREC)vals[j]; @@ -681,7 +764,7 @@ MIOpenBatchNormBwdSpatialDScaleDBias(const __global _FLOAT* x_in, const __global _FLOAT* savedMean, const __global _FLOAT* savedInvVariance #endif -) + ) { unsigned int xgid = get_global_id(0); @@ -995,7 +1078,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #else // maxn db += (_FLOAT_PREC)(*(dy_in + index)); _FLOAT_PREC xhat = (((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance); - ds = mad(xhat, (_FLOAT_PREC)(*(dy_in + index)), ds); + ds = mad(xhat, (_FLOAT_PREC)(*(dy_in + index)), ds); #endif } } diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index fafc08aa1a..b0e6ab8322 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -69,7 +69,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, __global _FLOAT_PREC* __restrict resultSaveMean, __global _FLOAT_PREC* __restrict resultSaveInvVariance #endif -) + ) { // SPATIAL @@ -181,6 +181,15 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, //=========== +#ifndef MIO_LAYOUT_NHWC 
+#define MIO_LAYOUT_NHWC 0 +#endif + +#if MIO_LAYOUT_NHWC == 1 +#define MIO_MAX_READ 1 +#define RD_BLK 1 +#define GRPRD (MIO_BN_GRP0 * RD_BLK) +#else #if(MIO_BN_HW >= 4096) #define MIO_MAX_READ 3 #else @@ -188,6 +197,8 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #endif #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK * 4) +#endif + #define MIO_BN_REM4 (MIO_BN_NHW - ((MIO_BN_NHW / GRPRD) * GRPRD)) #define MIO_BN_LESS4 (MIO_BN_NHW - MIO_BN_REM4) #define MIO_BN_CHUNK4 (MIO_MAX_READ * GRPRD) @@ -216,7 +227,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, __global _FLOAT_PREC* __restrict resultSaveMean, __global _FLOAT_PREC* __restrict resultSaveInvVariance #endif -) + ) { // SPATIAL @@ -232,7 +243,9 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, uint index = 0; uint lid = get_local_id(0); uint grpid = get_group_id(0); +#if MIO_LAYOUT_NHWC == 0 uint chwid = grpid * MIO_BN_HW; +#endif uint nidx = 0; uint hwidx = 0; @@ -243,7 +256,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, } barrier(CLK_LOCAL_MEM_FENCE); -#if(MIO_BN_HW >= 4096) +#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) _FLOAT4 read4; __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; k += GRPRD) @@ -288,7 +301,11 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC xin = (_FLOAT_PREC)(*(in + index)); mean += xin; variance = mad(xin, xin, variance); @@ -299,7 +316,11 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if 
MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC xin = (index < MIO_BN_NCHW) ? (_FLOAT_PREC)(*(in + index)) : (_FLOAT_PREC)0.; mean += xin; variance = mad(xin, xin, variance); @@ -329,13 +350,23 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, pvscale = lcl_scale; pvbias = lcl_bias; -#if(MIO_BN_REM == 0) - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < MIO_BN_LESS; +#if(MIO_LAYOUT_NHWC == 1 || MIO_BN_REM == 0) + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < +#if MIO_LAYOUT_NHWC == 1 + MIO_BN_NHW +#else + MIO_BN_LESS +#endif + ; k += MIO_BN_GRP0) { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif out[index] = (_FLOAT)mad(pvscale, ((_FLOAT_PREC)(*(in + index)) - mean) * invVariance, pvbias); } // end for @@ -478,7 +509,7 @@ MIOpenBatchNormFwdTrainSpatialFinalMeanVariance( , __global _FLOAT* __restrict resultSaveInvVariance #endif -) + ) { _FLOAT variance = (_FLOAT)0.; _FLOAT invVariance = (_FLOAT)0.; @@ -613,7 +644,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, __global _FLOAT_PREC* __restrict resultSaveMean, __global _FLOAT_PREC* __restrict resultSaveInvVariance #endif -) + ) { // SPATIAL diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 70b4aef407..ff632ec057 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -90,83 +90,94 @@ ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( unsigned int ldsnogcn = 0; int variant = 1; - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 
implementation which parallelize - // work groups over channels and loop through NHW. - //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + if(problem.IsLayoutNHWC()) { - variant = 1; xlocalsize = 1024; xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) + else { - if((n > 64) && (in_cstride > 160)) + //************************************************************************************************* + // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize + // work groups over channels and loop through NHW. 
+ //************************************************************************************************* + if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) { - variant = 3; + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + //************************************************************************************************* + // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which + // parallelize + // work groups over channels and loop through N. + //************************************************************************************************* + else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) + { + variant = (n >= 32) ? 1 : 3; xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - else + //************************************************************************************************* + // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and + // H*W + //************************************************************************************************* + else if(in_cstride <= 512) { - variant = 0; - if(bfp32parm) + if((n > 64) && (in_cstride > 160)) { - xlocalsize = 1024; - xgridsize = 1024 * c; + variant = 3; + xlocalsize = + std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } else { - xlocalsize = 256; - xgridsize = 256 * c; + variant = 0; + if(bfp32parm) + { + xlocalsize = 1024; + xgridsize = 1024 * c; + } + else + { + xlocalsize = 256; + xgridsize = 256 * c; + } + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; } - } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 
implementation which parallelize - // work groups over channels and data segments. - //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; + //************************************************************************************************* + // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize + // work groups over channels and data segments. + //************************************************************************************************* + else + { + variant = 2; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) + { + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } } auto result = ConvSolution{miopenStatusSuccess}; @@ -199,6 +210,7 @@ ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, {"MIO_BN_GFX1030", ((handle.GetDeviceName() == "gfx1030") ? 
"1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, }; kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 2f3d82192b..cd94533335 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -99,85 +99,95 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, unsigned int ldsnogcn = 0; int variant = 1; - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize - // work groups over channels and loop through NHW. - //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + if(problem.IsLayoutNHWC()) { - variant = 1; xlocalsize = 1024; xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 
1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) + else { - if((n > 64) && (in_cstride > 160)) + //************************************************************************************************* + // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize + // work groups over channels and loop through NHW. + //************************************************************************************************* + if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + { + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + //************************************************************************************************* + // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which + // parallelize + // work groups over channels and loop through N. + //************************************************************************************************* + else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) { - variant = 3; + variant = (n >= 32) ? 
1 : 3; xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - else + //************************************************************************************************* + // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and + // H*W + //************************************************************************************************* + else if(in_cstride <= 512) { - variant = 0; - if(bfp32parm) + if((n > 64) && (in_cstride > 160)) { - xlocalsize = 1024; - xgridsize = 1024 * c; + variant = 3; + xlocalsize = + std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } else { - xlocalsize = 256; - xgridsize = 256 * c; + variant = 0; + if(bfp32parm) + { + xlocalsize = 1024; + xgridsize = 1024 * c; + } + else + { + xlocalsize = 256; + xgridsize = 256 * c; + } + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; + } + //************************************************************************************************* + // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize + // work groups over channels and data segments. 
+ //************************************************************************************************* + else + { + variant = 2; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) + { + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize - // work groups over channels and data segments. - //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - auto result = ConvSolution{miopenStatusSuccess}; { @@ -203,6 +213,7 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, {"MIO_BN_GRP0", xlocalsize}, {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, }; if((n > 64) && (n % 2 == 0) && (variant == 3) && (bfpmixparm) && (problem.UseSaved()) && @@ -210,7 +221,8 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, (StartsWith(handle.GetDeviceName(), "gfx8") || (StartsWith(handle.GetDeviceName(), "gfx9") #if WORKAROUND_ISSUE_1146 - && (handle.GetDeviceName() != "gfx90a") + && + (handle.GetDeviceName() != "gfx90a") #endif )) && (!handle.GetTargetProperties().Xnack() 
|| !*handle.GetTargetProperties().Xnack())) diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index b7566ec105..7cf04cc4f4 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -93,24 +93,26 @@ ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( unsigned int ldsgcn = xlocalsize / 64; unsigned int ldsnogcn = xlocalsize; -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) + if(!problem.IsLayoutNHWC()) { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else +#if(WORKAROUND_SWDEV_253606 == 0) + if(n < 3) + { + variant = 4; + xlocalsize = 256; + xgridsize = c * xlocalsize; + ylocalsize = 1; + ygridsize = 1; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + else #endif - // clang-format off + // clang-format off if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) + ((n >= 256) && (in_cstride > 60) && bfpmixparm) || + ((in_cstride > 512) && bfpmixparm)) { variant = 1; } @@ -129,18 +131,19 @@ ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( ldsgcn = ylocalsize / 64; ldsnogcn = ylocalsize; } - // clang-format on + // clang-format on - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; + if((n > 768) && (in_cstride > 150) && bfp32parm) + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } } auto result = 
ConvSolution{miopenStatusSuccess}; @@ -174,6 +177,7 @@ ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, {"MIO_BN_GFX1030", ((handle.GetDeviceName() == "gfx1030") ? "1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, }; kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index c357a3b0f2..d36250eb7a 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -129,54 +129,57 @@ BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, unsigned int ldsgcn = xlocalsize / 64; unsigned int ldsnogcn = xlocalsize; -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) + if(!problem.IsLayoutNHWC()) { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else +#if(WORKAROUND_SWDEV_253606 == 0) + if(n < 3) + { + variant = 4; + xlocalsize = 256; + xgridsize = c * xlocalsize; + ylocalsize = 1; + ygridsize = 1; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + else #endif - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on + // clang-format off + if((in_nhw < 33554432 && in_cstride > 1024) || + ((n >= 256) && (in_cstride > 60) && bfpmixparm) || + ((in_cstride > 512) && bfpmixparm)) + { + variant = 1; + } + else if(in_cstride <= 512) + { + variant = 0; + 
} + else + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + // clang-format on - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; + if((n > 768) && (in_cstride > 150) && bfp32parm) + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } } auto result = ConvSolution{miopenStatusSuccess}; @@ -204,6 +207,7 @@ BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, {"MIO_BN_GFX1030", ((handle.GetDeviceName() == "gfx1030") ? 
{"MIO_LAYOUT_NHWC", static_cast<int>(problem.IsLayoutNHWC())},
__restrict x_in, hwidx * MIO_BN_C + grpid; if(index < MIO_BN_NCHW) { - xread = *((const global _FLOAT*)(x_in + index)); - dyRead = *((const global _FLOAT*)(dy_in + index)); - xhat = ((_FLOAT_PREC)xread - mean) * invVariance; + xread = *((const global _FLOAT*)(x_in + index)); + dyRead = *((const global _FLOAT*)(dy_in + index)); + xhat_tmp = ((_FLOAT_PREC)xread - mean) * invVariance; db += (_FLOAT_PREC)dyRead; - ds = mad(xhat.x, (_FLOAT_PREC)dyRead.x, ds); + ds = mad(xhat_tmp, (_FLOAT_PREC)dyRead, ds); #else chwid + hwidx; if(index < (MIO_BN_NCHW - 3)) diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index cd94533335..623fdbf847 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -46,6 +46,9 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( problem.GetMode() != miopenBNSpatial) return false; + if(problem.IsLayoutNHWC()) + return true; + int n, c, h, w; std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index d36250eb7a..48ad737fe3 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -46,6 +46,9 @@ bool BnFwdTrainingSpatialSingle::IsApplicable( problem.GetMode() != miopenBNSpatial) return false; + if(problem.IsLayoutNHWC()) + return true; + int n, c, h, w; std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp new file mode 100644 index 0000000000..a1f0407402 --- /dev/null +++ b/test/bn_spatial_nhwc_test.cpp @@ -0,0 +1,743 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "driver.hpp" +#include "get_handle.hpp" +#include "tensor_holder.hpp" +#include "test.hpp" +#include "verify.hpp" +#include "random.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIO_BN_TEST_EXPAVGFACTOR 0.1 +#define MIO_BN_TEST_EPSILON 1e-5 +#define MIO_BN_USE_MIX_PREC 1 +#if MIO_BN_USE_MIX_PREC == 1 +#define PREC_TYPE float +#else +#define PREC_TYPE T +#endif + +template +struct verify_forward_train_bn_spatial +{ + const tensor input; + const tensor scale; + const tensor shift; + std::tuple, tensor, tensor, tensor, tensor> cpu() const + { + double epsilon = MIO_BN_TEST_EPSILON; + double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + + std::size_t rs_n_batch, rs_channels, rs_height, rs_width; + auto derivedBnDesc = + miopen::TensorDescriptor(input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + tensor runMean; + tensor runVar; + if(input.desc.GetType() == miopenFloat) + { + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + } + else + { + srand(0); + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) + { + runMean[i] = (((GET_RAND() % 2) == 1) ? 
-1 : 1) * 1e-3 * U(GET_RAND() % 100); + runVar[i] = 1e-3 * U(GET_RAND() % 100); + } + } + auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + auto out = input; + std::fill(out.begin(), out.end(), 0); + + const auto nhw = double(height * width * n_batch); + par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + double variance_accum = 0.; + double mean_accum = 0.; + double invVar = 0.; + double newRunMean = 0.; + double adjust = 0.; + + std::vector variance_accum_arr(height, 0.0); + std::vector mean_accum_arr(height, 0.0); + std::vector dshift_accum_arr(height, 0.0); + std::vector dscale_accum_arr(height, 0.0); + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + mean_accum_arr[row] += input(bidx, cidx, row, column); + } + } + } + for(std::size_t i = 0; i < height; i++) + mean_accum += mean_accum_arr[i]; + + mean_accum /= nhw; + + elemStd = 0.; + variance_accum = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + out(bidx, cidx, row, column) = elemStd = + input(bidx, cidx, row, column) - mean_accum; + variance_accum_arr[row] += elemStd * elemStd; + } + } + } + for(std::size_t i = 0; i < height; i++) + variance_accum += variance_accum_arr[i]; + + variance_accum /= nhw; + invVar = 1.0 / sqrt(variance_accum + epsilon); + + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + out(bidx, cidx, row, column) = + scale(0, 0, 0, cidx) * (invVar * out(bidx, cidx, row, column)) + + shift(0, 0, 0, cidx); + } + } + } + + saveMean(0, 0, 0, cidx) = mean_accum; + saveInvVar(0, 0, 0, cidx) = invVar; + + newRunMean = runMean(0, 0, 0, 
cidx) * (1 - expAvgFactor); + runMean(0, 0, 0, cidx) = mean_accum * expAvgFactor + newRunMean; + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, 0, 0, cidx) = + (1 - expAvgFactor) * runVar(0, 0, 0, cidx) + expAvgFactor * adjust; + }); + + return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); + } + + std::tuple, tensor, tensor, tensor, tensor> gpu() const + { + auto&& handle = get_handle(); + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + + auto out = input; + std::fill(out.begin(), out.end(), 0); + + std::size_t rs_n_batch, rs_channels, rs_height, rs_width; + auto derivedBnDesc = + miopen::TensorDescriptor(input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + tensor runMean; + tensor runVar; + if(input.desc.GetType() == miopenFloat) + { + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + } + else + { + srand(0); + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) + { + runMean[i] = (((GET_RAND() % 2) == 1) ? 
-1 : 1) * 1e-3 * U(GET_RAND() % 100); + runVar[i] = 1e-3 * U(GET_RAND() % 100); + } + } + + auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + + auto in_dev = handle.Write(input.data); + auto scale_dev = handle.Write(scale.data); + auto shift_dev = handle.Write(shift.data); + + auto runMean_dev = handle.Write(runMean.data); + auto runVar_dev = handle.Write(runVar.data); + auto saveMean_dev = handle.Create(channels); + auto saveInvVar_dev = handle.Create(channels); + auto out_dev = handle.Create(n_batch * channels * height * width); + + double epsilon = MIO_BN_TEST_EPSILON; + double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; + + float alpha = 1.0; + float beta = 0.0; + + miopen::BatchNormForwardTraining(handle, + miopenBNSpatial, + &alpha, + &beta, + input.desc, + in_dev.get(), + out.desc, + out_dev.get(), + scale.desc, + scale_dev.get(), + shift_dev.get(), + expAvgFactor, + runMean_dev.get(), + runVar_dev.get(), + epsilon, + saveMean_dev.get(), + saveInvVar_dev.get()); + + saveMean.data = handle.Read(saveMean_dev, saveMean.data.size()); + saveInvVar.data = handle.Read(saveInvVar_dev, saveInvVar.data.size()); + runMean.data = handle.Read(runMean_dev, runMean.data.size()); + runVar.data = handle.Read(runVar_dev, runVar.data.size()); + out.data = handle.Read(out_dev, out.data.size()); + + return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); + } + + void fail(int badtensor) const + { + std::cout << "Forward Train Spatial Batch Normalization: " << std::endl; + std::cout << "Input tensor: " << input.desc.ToString() << std::endl; + + switch(badtensor) + { + case(0): std::cout << "Output tensor output failed verification." << std::endl; break; + case(1): std::cout << "Running Mean output tensor failed verification." << std::endl; break; + case(2): + std::cout << "Running Variance output tensor failed verification." 
<< std::endl; + break; + case(3): std::cout << "Saved Mean tensor failed verification." << std::endl; break; + case(4): std::cout << "Saved Variance tensor failed verification." << std::endl; break; + default: break; + } + } +}; + +template +struct verify_backward_bn_spatial_recalc +{ + const tensor x_input; + const tensor dy_input; + const tensor scale; + + std::tuple, tensor, tensor> cpu() const + { + double epsilon = MIO_BN_TEST_EPSILON; + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + const auto nhw = double(height * width * n_batch); + + par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double invVar = 0.; + double dyelem = 0.; + double variance = 0.; + + std::vector xhat(height * width * n_batch, 0.0); + std::vector variance_accum_arr(height, 0.0); + std::vector mean_accum_arr(height, 0.0); + std::vector dshift_accum_arr(height, 0.0); + std::vector dscale_accum_arr(height, 0.0); + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + mean_accum_arr[row] += x_input(bidx, cidx, row, column); + } + } + } + for(std::size_t i = 0; i < height; i++) + mean += mean_accum_arr[i]; 
+ + mean /= nhw; + + elemStd = 0.; + variance = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + elemStd = x_input(bidx, cidx, row, column) - mean; + variance_accum_arr[row] += elemStd * elemStd; + } + } + } + for(std::size_t i = 0; i < height; i++) + variance += variance_accum_arr[i]; + + variance /= nhw; + invVar = 1. / double(sqrt(variance + epsilon)); + + dscale(0, cidx, 0, 0) = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + elemStd = x_input(bidx, cidx, row, column) - mean; + xhat[xhat_index] = elemStd * invVar; + dyelem = dy_input(bidx, cidx, row, column); + dshift_accum_arr[row] += dyelem; + dscale_accum_arr[row] += xhat[xhat_index] * dyelem; + } + } + } + for(std::size_t i = 0; i < height; i++) + { + dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; + dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; + } + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + + double tmp1 = + nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); + } + } + } + }); + + return std::make_tuple(dx_out, dscale, dshift); + } + + std::tuple, tensor, tensor> gpu() const + { + auto&& handle = get_handle(); + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + 
std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + float alpha = 1.0; + float beta = 0.0; + + auto xin_dev = handle.Write(x_input.data); + auto dyin_dev = handle.Write(dy_input.data); + auto scale_dev = handle.Write(scale.data); + auto dscale_dev = handle.Write(dscale.data); + auto dshift_dev = handle.Write(dshift.data); + auto dx_out_dev = handle.Write(dx_out.data); + + double epsilon = MIO_BN_TEST_EPSILON; + + miopen::BatchNormBackward(handle, + miopenBNSpatial, + &alpha, + &beta, + &alpha, + &beta, + x_input.desc, + xin_dev.get(), + dy_input.desc, + dyin_dev.get(), + dx_out.desc, + dx_out_dev.get(), + scale.desc, + scale_dev.get(), + dscale_dev.get(), + dshift_dev.get(), + epsilon, + nullptr, + nullptr); + + dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); + dscale.data = handle.Read(dscale_dev, dscale.data.size()); + dshift.data = handle.Read(dshift_dev, dshift.data.size()); + + return std::make_tuple(dx_out, dscale, dshift); + } + + void fail(int badtensor) const + { + std::cout << "Backward Batch Spatial Normalization Recalc Mean and Variance: " << std::endl; + std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; + std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; + switch(badtensor) + { + case(0): + std::cout << "Delta X output tensor output failed verification." << std::endl; + break; + case(1): std::cout << "Delta scale output tensor failed verification." 
<< std::endl; break; + case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; + default: break; + } + } +}; + +template +struct verify_backward_bn_spatial_use_saved +{ + const tensor x_input; + const tensor dy_input; + const tensor scale; + const tensor savedMean; + const tensor savedInvVar; + std::tuple, tensor, tensor> cpu() const + { + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + const auto nhw = double(height * width * n_batch); + + par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = savedMean(0, 0, 0, cidx); + double invVar = savedInvVar(0, 0, 0, cidx); + double dyelem = 0.; + + std::vector xhat(n_batch * height * width, 0.0); + std::vector dshift_accum_arr(height, 0.0); + std::vector dscale_accum_arr(height, 0.0); + dscale(0, cidx, 0, 0) = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + elemStd = x_input(bidx, cidx, row, column) - mean; + xhat[xhat_index] = elemStd * invVar; + dyelem = dy_input(bidx, cidx, row, column); + dshift_accum_arr[row] += dyelem; + 
dscale_accum_arr[row] += xhat[xhat_index] * dyelem; + } + } + } + for(std::size_t i = 0; i < height; i++) + { + dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; + dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; + } + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + + double tmp1 = + nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); + } + } + } + }); + + return std::make_tuple(dx_out, dscale, dshift); + } + + std::tuple, tensor, tensor> gpu() const + { + auto&& handle = get_handle(); + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + float alpha = 1.0; + float beta = 0.0; + + auto xin_dev = handle.Write(x_input.data); + auto dyin_dev = handle.Write(dy_input.data); + auto scale_dev = handle.Write(scale.data); + auto dscale_dev = handle.Write(dscale.data); + auto dshift_dev = handle.Write(dshift.data); + auto dx_out_dev = handle.Write(dx_out.data); + auto savedMean_dev = 
handle.Write(savedMean.data); + auto savedInvVar_dev = handle.Write(savedInvVar.data); + + double epsilon = MIO_BN_TEST_EPSILON; + + miopen::BatchNormBackward(handle, + miopenBNSpatial, + &alpha, + &beta, + &alpha, + &beta, + x_input.desc, + xin_dev.get(), + dy_input.desc, + dyin_dev.get(), + dx_out.desc, + dx_out_dev.get(), + scale.desc, + scale_dev.get(), + dscale_dev.get(), + dshift_dev.get(), + epsilon, + savedMean_dev.get(), + savedInvVar_dev.get()); + + dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); + dscale.data = handle.Read(dscale_dev, dscale.data.size()); + dshift.data = handle.Read(dshift_dev, dshift.data.size()); + + return std::make_tuple(dx_out, dscale, dshift); + } + + void fail(int badtensor) const + { + std::cout << "Backward Batch Spatial Normalization Use Saved Mean and Variance: " + << std::endl; + std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; + std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; + switch(badtensor) + { + case(0): + std::cout << "Delta X output tensor output failed verification." << std::endl; + break; + case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break; + case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; + default: break; + } + } +}; + +template +struct batch_norm_spatial_nhwc_driver : test_driver +{ + tensor input; + tensor scale; + tensor shift; + batch_norm_spatial_nhwc_driver() + { + this->batch_factor = 4; + add(input, + "input", + get_bn_spatial_input_tensor( + tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 
5 : 17})); + } + + void run() + { + std::size_t n, c, h, w; + std::tie(n, c, h, w) = miopen::tien<4>(input.desc.GetLengths()); + + std::size_t ssn, ssc, ssh, ssw; + auto derivedBnDesc = miopen::TensorDescriptor(input.desc.GetType(), + std::vector{1, 1, 1, c}, + std::vector{c, c, c, 1}); + std::tie(ssn, ssh, ssw, ssc) = miopen::tien<4>(derivedBnDesc.GetLengths()); + + std::vector new_len = input.desc.GetLengths(); + std::vector new_str; + miopen::tensor_layout_to_strides(new_len, "NCHW", "NHWC", new_str); + input.desc = miopen::TensorDescriptor(miopen_type{}, new_len, new_str); + + if(input.desc.GetType() == miopenFloat) + { + scale = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); + shift = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); + } + else + { + srand(0); + scale = tensor{ssn, ssh, ssw, ssc}; + shift = tensor{ssn, ssh, ssw, ssc}; + for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++) + { + scale[i] = (((GET_RAND() % 2) == 1) ? -1 : 1) * 1e-4 * PREC_TYPE(GET_RAND() % 100); + shift[i] = (((GET_RAND() % 2) == 1) ? -1 : 1) * 1e-4 * PREC_TYPE(GET_RAND() % 100); + } + for(std::size_t i = 0; i < input.desc.GetElementSize(); i++) + { + input[i] = (((GET_RAND() % 2) == 1) ? 
-1 : 1) * (1e-5 * T(GET_RAND() % 100)); + } + } + + auto outpair = verify(verify_forward_train_bn_spatial{input, scale, shift}); + + auto dy_input = std::get<0>(outpair.second); + for(std::size_t bidx = 0; bidx < n; bidx++) + { + for(std::size_t cidx = 0; cidx < c; cidx++) + { + for(std::size_t row = 0; row < h; row++) + { + for(std::size_t column = 0; column < w; column++) + { + dy_input(bidx, cidx, row, column) *= 0.1; + } + } + } + } + this->tolerance = 80 * input.desc.GetElementSize(); + verify(verify_backward_bn_spatial_recalc{input, dy_input, scale}); + + auto savedMean = std::get<3>(outpair.second); + auto savedInvVar = std::get<4>(outpair.second); + verify(verify_backward_bn_spatial_use_saved{ + input, dy_input, scale, savedMean, savedInvVar}); + } +}; + +int main(int argc, const char* argv[]) +{ + test_drive(argc, argv); + exit(0); +} From 0cd8e89887da9af1d5cbd79f3274bed284b76b16 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Fri, 19 Nov 2021 11:07:41 -0800 Subject: [PATCH 03/10] Fix Tidy Issue --- test/bn_spatial_nhwc_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp index a1f0407402..4a72bfecf0 100644 --- a/test/bn_spatial_nhwc_test.cpp +++ b/test/bn_spatial_nhwc_test.cpp @@ -739,5 +739,5 @@ struct batch_norm_spatial_nhwc_driver : test_driver int main(int argc, const char* argv[]) { test_drive(argc, argv); - exit(0); + return 0; } From 8285fe87b06eec53783ef1921b9dd2b6b5a67136 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Mon, 13 Dec 2021 15:13:07 -0800 Subject: [PATCH 04/10] Fix tidy and review requests --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 20 ++---- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 13 ++-- .../batchnorm/forward_spatial_single.cpp | 69 ++++++++++--------- 3 files changed, 45 insertions(+), 57 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index d7bee08300..3556185f20 100644 --- 
a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -288,7 +288,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int index = 0; unsigned int lid = get_local_id(0); unsigned int grpid = get_group_id(0); -#if MIO_LAYOUT_NHWC == 0 +#if !MIO_LAYOUT_NHWC unsigned int chwid = grpid * MIO_BN_HW; #endif unsigned int nidx = 0; @@ -307,7 +307,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if(MIO_BN_USESAVED == 0) //==== CALC MEAN and VARIANCE ONCE AGAIN ======================= _FLOAT_PREC variance = (_FLOAT_PREC)0.; -#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) +#if !MIO_LAYOUT_NHWC && MIO_BN_HW >= 4096 _FLOAT4 read4; #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; @@ -426,23 +426,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, _FLOAT_PREC4 xhat4; #endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = -#if MIO_LAYOUT_NHWC == 1 - lid -#else - lid << 2 -#endif - ; + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = -#if MIO_LAYOUT_NHWC == 1 - lid -#else - lid << 2 -#endif - ; + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #endif diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index b0e6ab8322..f715709d5a 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -243,7 +243,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, uint index = 0; uint lid = get_local_id(0); uint grpid = get_group_id(0); -#if MIO_LAYOUT_NHWC == 0 +#if !MIO_LAYOUT_NHWC uint chwid = grpid * MIO_BN_HW; #endif 
uint nidx = 0; @@ -256,7 +256,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, } barrier(CLK_LOCAL_MEM_FENCE); -#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) +#if !MIO_LAYOUT_NHWC && MIO_BN_HW >= 4096 _FLOAT4 read4; __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; k += GRPRD) @@ -351,14 +351,13 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, pvbias = lcl_bias; #if(MIO_LAYOUT_NHWC == 1 || MIO_BN_REM == 0) - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < + const unsigned int k_limit = #if MIO_LAYOUT_NHWC == 1 - MIO_BN_NHW + MIO_BN_NHW; #else - MIO_BN_LESS + MIO_BN_LESS; #endif - ; - k += MIO_BN_GRP0) + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < k_limit; k += MIO_BN_GRP0) { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index 48ad737fe3..fdfa828623 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -147,41 +147,42 @@ BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, } else #endif - - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = 
ylocalsize / 64; - ldsnogcn = ylocalsize; + // clang-format off + if( (in_nhw < 33554432 && in_cstride > 1024) || + ((n >= 256) && (in_cstride > 60) && bfpmixparm) || + ((in_cstride > 512) && bfpmixparm)) + { + variant = 1; + } + else if(in_cstride <= 512) + { + variant = 0; + } + else + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + // clang-format on + + if((n > 768) && (in_cstride > 150) && bfp32parm) + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } } } From cd19418872e4a8da7904de96d7c94220f7360220 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Wed, 15 Dec 2021 16:46:24 -0800 Subject: [PATCH 05/10] Fix error and review requests --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 30 +++++++++++-------- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 16 ++++++---- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 3556185f20..671f5a3f49 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -233,7 +233,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #define MIO_LAYOUT_NHWC 0 #endif -#if MIO_LAYOUT_NHWC == 1 +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + +#if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK) @@ -362,7 +366,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + 
hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -377,7 +381,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -416,7 +420,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #endif -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC _FLOAT dyRead; _FLOAT xread; _FLOAT_PREC xhat_tmp; @@ -426,18 +430,18 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, _FLOAT_PREC4 xhat4; #endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); k < MIO_BN_LESS4; k += GRPRD) #endif { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; xread = *((const global _FLOAT*)(x_in + index)); dyRead = *((const global _FLOAT*)(dy_in + index)); @@ -465,7 +469,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if(MIO_BN_REM4) unsigned int remkey = -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC lid #else (lid << 2) @@ -474,7 +478,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); index = nidx * MIO_BN_CHW + -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC hwidx * MIO_BN_C + grpid; if(index < MIO_BN_NCHW) { @@ -549,7 +553,7 @@ MIOpenBatchNormBwdSpatial(const __global 
_FLOAT* __restrict x_in, unsigned int l = k + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -577,7 +581,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = k + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -597,7 +601,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -621,7 +625,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index f715709d5a..77b46a17ed 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -185,7 +185,11 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #define MIO_LAYOUT_NHWC 0 #endif -#if MIO_LAYOUT_NHWC == 1 +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + +#if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK) @@ -301,7 +305,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC 
index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -316,7 +320,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -350,9 +354,9 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, pvscale = lcl_scale; pvbias = lcl_bias; -#if(MIO_LAYOUT_NHWC == 1 || MIO_BN_REM == 0) +#if(MIO_LAYOUT_NHWC || MIO_BN_REM == 0) const unsigned int k_limit = -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC MIO_BN_NHW; #else MIO_BN_LESS; @@ -361,7 +365,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; From e4a74f4e4945fd000295e1ae512a245ce47b6b46 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 09:44:24 -0800 Subject: [PATCH 06/10] bugfix: define macro in branch results in error --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 16 ++++++++-------- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 671f5a3f49..5b148a9a14 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -41,6 +41,14 @@ #include "batchnorm_functions.h" #include "reduction_functions.h" +#ifndef MIO_LAYOUT_NHWC +#define MIO_LAYOUT_NHWC 0 +#endif + +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + #if(MIO_BN_VARIANT == 0) #define MIO_BN_SEGTMP_1 (MIO_BN_GRP0 / MIO_BN_HW) @@ 
-229,14 +237,6 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #elif(MIO_BN_VARIANT == 1) -#ifndef MIO_LAYOUT_NHWC -#define MIO_LAYOUT_NHWC 0 -#endif - -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 -#endif - #if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 77b46a17ed..0aa54d53e6 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -41,6 +41,14 @@ #include "batchnorm_functions.h" #include "reduction_functions.h" +#ifndef MIO_LAYOUT_NHWC +#define MIO_LAYOUT_NHWC 0 +#endif + +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + #if(MIO_BN_VARIANT == 0) #define MIO_BN_SEGTMP_1 (MIO_BN_GRP0 / MIO_BN_HW) @@ -181,14 +189,6 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, //=========== -#ifndef MIO_LAYOUT_NHWC -#define MIO_LAYOUT_NHWC 0 -#endif - -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 -#endif - #if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 From a2b1adabdc859e498138d1042db5873340e77568 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 12:41:58 -0800 Subject: [PATCH 07/10] fix ocl error macro --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 2 +- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 5b148a9a14..6bdfb2c2cd 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -46,7 +46,7 @@ #endif #if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 +#error "MIO_LAYOUT_NHWC must be 0 or 1" #endif #if(MIO_BN_VARIANT == 0) diff --git 
a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 0aa54d53e6..68054d4dcc 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -46,7 +46,7 @@ #endif #if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 +#error "MIO_LAYOUT_NHWC must be 0 or 1" #endif #if(MIO_BN_VARIANT == 0) From 2043ad28b422afd4547bc96216be658977da8073 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 15:36:48 -0800 Subject: [PATCH 08/10] Fix error on macro conditions --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 2 +- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 6bdfb2c2cd..8227d9dc60 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -45,7 +45,7 @@ #define MIO_LAYOUT_NHWC 0 #endif -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#if (MIO_LAYOUT_NHWC != 0) && (MIO_LAYOUT_NHWC != 1) #error "MIO_LAYOUT_NHWC must be 0 or 1" #endif diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 68054d4dcc..36e00db74a 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -45,7 +45,7 @@ #define MIO_LAYOUT_NHWC 0 #endif -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#if (MIO_LAYOUT_NHWC != 0) && (MIO_LAYOUT_NHWC != 1) #error "MIO_LAYOUT_NHWC must be 0 or 1" #endif From 3a0735bea4cf907024d5e2013dad7410a4cd6600 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 16:31:32 -0800 Subject: [PATCH 09/10] Error fix regarding format --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 24 +++++++------------ src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 4 ++-- 2 files changed, 11 insertions(+), 17 deletions(-) diff 
--git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 8227d9dc60..3185c56725 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -369,7 +369,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC in = (_FLOAT_PREC)(*(x_in + index)); mean += in; @@ -384,7 +384,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC in = (index < MIO_BN_NCHW) ? (_FLOAT_PREC)(*(x_in + index)) : (_FLOAT_PREC)0.; mean += in; @@ -430,11 +430,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, _FLOAT_PREC4 xhat4; #endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #endif @@ -468,13 +468,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, } #if(MIO_BN_REM4) - unsigned int remkey = -#if MIO_LAYOUT_NHWC - lid -#else - (lid << 2) -#endif - + MIO_BN_LESS4; + unsigned int remkey = lid << 2*(1 - MIO_LAYOUT_NHWC) + MIO_BN_LESS4; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); index = nidx * MIO_BN_CHW + @@ -556,7 +550,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = 
nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif dyvalue = (_FLOAT_PREC)(*(dy_in + index)); xhat = ((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance; @@ -584,7 +578,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif *(dx_out + index) = (_FLOAT)vals[j]; } @@ -604,7 +598,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif if(index < MIO_BN_NCHW) { @@ -628,7 +622,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif if(index < MIO_BN_NCHW) { diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 36e00db74a..fae80adbb6 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -308,7 +308,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC xin = (_FLOAT_PREC)(*(in + index)); mean += xin; @@ -323,7 +323,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC xin = (index 
< MIO_BN_NCHW) ? (_FLOAT_PREC)(*(in + index)) : (_FLOAT_PREC)0.; mean += xin; From f85ceddffc09fef221d1174589550f8adb34bdce Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 20:52:27 -0800 Subject: [PATCH 10/10] Error fix --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 3185c56725..6842b76a04 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -468,7 +468,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, } #if(MIO_BN_REM4) - unsigned int remkey = lid << 2*(1 - MIO_LAYOUT_NHWC) + MIO_BN_LESS4; + unsigned int remkey = (lid << 2*(1 - MIO_LAYOUT_NHWC)) + MIO_BN_LESS4; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); index = nidx * MIO_BN_CHW +