From f02b6c31b98a7a5ddf7c2e6c07fc122a719f71cb Mon Sep 17 00:00:00 2001 From: Jing Zhou Date: Wed, 27 Oct 2021 17:48:30 -0700 Subject: [PATCH 01/10] refactor ocl kernels for batchnorm nhwc --- .../miopen/batchnorm/problem_description.hpp | 28 +++- src/kernels/MIOpenBatchNormBwdSpatial.cl | 129 +++++++++++++---- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 49 +++++-- .../batchnorm/backward_spatial_multiple.cpp | 126 +++++++++-------- .../batchnorm/backward_spatial_single.cpp | 130 ++++++++++-------- .../batchnorm/forward_spatial_multiple.cpp | 54 ++++---- .../batchnorm/forward_spatial_single.cpp | 92 +++++++------ 7 files changed, 388 insertions(+), 220 deletions(-) diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index f64ac2d901..63582e3026 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -67,6 +67,8 @@ struct ProblemDescription resultsave(resultsave_), resultrunning(resultrunning_) { + in_layout = xDesc.GetLayout(xDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); + out_layout = yOrDyDesc.GetLayout(yOrDyDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); } // Backward @@ -86,6 +88,9 @@ struct ProblemDescription epsilon(epsilon_), useSaved(useSaved_) { + in_layout = xDesc.GetLayout(xDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); + out_layout = yOrDyDesc.GetLayout(yOrDyDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); + din_layout = dxDesc.GetLayout(dxDesc.GetLengths().size() == 4 ? "NCHW" : "NCDHW"); } Direction GetDirection() const { return direction; } @@ -140,6 +145,20 @@ struct ProblemDescription return useSaved; } + bool IsLayoutNHWC() const + { + if(direction == Direction::Backward) + { + return xDesc.GetLengths().size() == 4 + ? 
((in_layout == "NHWC") && (out_layout == "NHWC") && (din_layout == "NHWC")) + : ((in_layout == "NDHWC") && (out_layout == "NDHWC") && + (din_layout == "NDHWC")); + } + + return xDesc.GetLengths().size() == 4 ? ((in_layout == "NHWC") && (out_layout == "NHWC")) + : ((in_layout == "NDHWC") && (out_layout == "NDHWC")); + } + NetworkConfig MakeNetworkConfig() const; void Serialize(std::ostream& stream) const; @@ -159,9 +178,12 @@ struct ProblemDescription TensorDescriptor scaleBiasDesc; double expAvgFactor = 0; double epsilon; - bool resultsave = false; - bool resultrunning = false; - bool useSaved = false; + bool resultsave = false; + bool resultrunning = false; + bool useSaved = false; + std::string in_layout = "NCHW"; + std::string out_layout = "NCHW"; + std::string din_layout = "NCHW"; NetworkConfig MakeForwardTrainingNetworkConfig() const; NetworkConfig MakeForwardInferenceNetworkConfig() const; diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index a3a144ee33..961c543822 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -144,7 +144,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { variance = 0; } - invVariance = rsqrt(variance + epsilon); + invVariance = rsqrt(variance + epsilon); #endif // end -- Recalc mean and variance //------------------------------------------- @@ -229,9 +229,19 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #elif(MIO_BN_VARIANT == 1) +#ifndef MIO_LAYOUT_NHWC +#define MIO_LAYOUT_NHWC 0 +#endif + +#if MIO_LAYOUT_NHWC == 1 +#define MIO_MAX_READ 1 +#define RD_BLK 1 +#define GRPRD (MIO_BN_GRP0 * RD_BLK) +#else #define MIO_MAX_READ 2 #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK * 4) +#endif #define MIO_BN_REM4 (MIO_BN_NHW - ((MIO_BN_NHW / GRPRD) * GRPRD)) #define MIO_BN_LESS4 (MIO_BN_NHW - MIO_BN_REM4) #define MIO_BN_CHUNK4 (MIO_MAX_READ * GRPRD) @@ -278,7 +288,9 @@ 
MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int index = 0; unsigned int lid = get_local_id(0); unsigned int grpid = get_group_id(0); +#if MIO_LAYOUT_NHWC == 0 unsigned int chwid = grpid * MIO_BN_HW; +#endif unsigned int nidx = 0; unsigned int hwidx = 0; @@ -295,7 +307,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if(MIO_BN_USESAVED == 0) //==== CALC MEAN and VARIANCE ONCE AGAIN ======================= _FLOAT_PREC variance = (_FLOAT_PREC)0.; -#if(MIO_BN_HW >= 4096) +#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) _FLOAT4 read4; #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; @@ -350,7 +362,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC in = (_FLOAT_PREC)(*(x_in + index)); mean += in; variance = mad(in, in, variance); @@ -361,7 +377,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC in = (index < MIO_BN_NCHW) ? 
(_FLOAT_PREC)(*(x_in + index)) : (_FLOAT_PREC)0.; mean += in; variance = mad(in, in, variance); @@ -396,19 +416,47 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #endif +#if MIO_LAYOUT_NHWC == 1 + _FLOAT dyRead; + _FLOAT xread; + _FLOAT_PREC xhat; +#else _FLOAT4 dyRead4; _FLOAT4 xread4; _FLOAT_PREC4 xhat4; +#endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = +#if MIO_LAYOUT_NHWC == 1 + lid +#else + lid << 2 +#endif + ; + k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = +#if MIO_LAYOUT_NHWC == 1 + lid +#else + lid << 2 +#endif + ; + k < MIO_BN_LESS4; k += GRPRD) #endif { - nidx = k / MIO_BN_HW; - hwidx = k - (nidx * MIO_BN_HW); + nidx = k / MIO_BN_HW; + hwidx = k - (nidx * MIO_BN_HW); +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; + xread = *((const global _FLOAT*)(x_in + index)); + dyRead = *((const global _FLOAT*)(dy_in + index)); + xhat = ((_FLOAT_PREC)xread - mean) * invVariance; + db += (_FLOAT_PREC)dyRead; + ds = mad(xhat, (_FLOAT_PREC)dyRead, ds); +#else index = nidx * MIO_BN_CHW + chwid + hwidx; xread4 = *((const global _FLOAT4*)(x_in + index)); dyRead4 = *((const global _FLOAT4*)(dy_in + index)); @@ -424,13 +472,31 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, ds = mad(xhat4.y, (_FLOAT_PREC)dyRead4.y, ds); ds = mad(xhat4.z, (_FLOAT_PREC)dyRead4.z, ds); ds = mad(xhat4.w, (_FLOAT_PREC)dyRead4.w, ds); +#endif } #if(MIO_BN_REM4) - unsigned int remkey = (lid << 2) + MIO_BN_LESS4; - nidx = remkey / MIO_BN_HW; - hwidx = remkey - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; + unsigned int remkey = +#if MIO_LAYOUT_NHWC == 1 + lid +#else + (lid << 2) +#endif + + MIO_BN_LESS4; + nidx = 
remkey / MIO_BN_HW; + hwidx = remkey - (nidx * MIO_BN_HW); + index = nidx * MIO_BN_CHW + +#if MIO_LAYOUT_NHWC == 1 + hwidx * MIO_BN_C + grpid; + if(index < MIO_BN_NCHW) + { + xread = *((const global _FLOAT*)(x_in + index)); + dyRead = *((const global _FLOAT*)(dy_in + index)); + xhat = ((_FLOAT_PREC)xread - mean) * invVariance; + db += (_FLOAT_PREC)dyRead; + ds = mad(xhat.x, (_FLOAT_PREC)dyRead.x, ds); +#else + chwid + hwidx; if(index < (MIO_BN_NCHW - 3)) { xread4 = *((const global _FLOAT4*)(x_in + index)); @@ -447,6 +513,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, ds = mad(xhat4.y, (_FLOAT_PREC)dyRead4.y, ds); ds = mad(xhat4.z, (_FLOAT_PREC)dyRead4.z, ds); ds = mad(xhat4.w, (_FLOAT_PREC)dyRead4.w, ds); +#endif } #endif @@ -491,12 +558,16 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, for(unsigned int j = 0; j < MIO_MAX_READ; j++) #endif { - unsigned int l = k + j; - nidx = l / MIO_BN_HW; - hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; - dyvalue = (_FLOAT_PREC)(*(dy_in + index)); - xhat = ((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance; + unsigned int l = k + j; + nidx = l / MIO_BN_HW; + hwidx = l - (nidx * MIO_BN_HW); +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif + dyvalue = (_FLOAT_PREC)(*(dy_in + index)); + xhat = ((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance; #if MIOPEN_USE_FP16 == 1 float temp_tmp1 = mad((float)NHW, (float)dyvalue, -temp_db); float temp_tmp2 = -((float)xhat) * temp_ds; @@ -518,7 +589,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = k + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif *(dx_out + index) = (_FLOAT)vals[j]; } } @@ 
-534,7 +609,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif if(index < MIO_BN_NCHW) { dyvalue = (_FLOAT_PREC)(*(dy_in + index)); @@ -554,7 +633,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif if(index < MIO_BN_NCHW) { *(dx_out + index) = (_FLOAT_PREC)vals[j]; @@ -681,7 +764,7 @@ MIOpenBatchNormBwdSpatialDScaleDBias(const __global _FLOAT* x_in, const __global _FLOAT* savedMean, const __global _FLOAT* savedInvVariance #endif -) + ) { unsigned int xgid = get_global_id(0); @@ -995,7 +1078,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #else // maxn db += (_FLOAT_PREC)(*(dy_in + index)); _FLOAT_PREC xhat = (((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance); - ds = mad(xhat, (_FLOAT_PREC)(*(dy_in + index)), ds); + ds = mad(xhat, (_FLOAT_PREC)(*(dy_in + index)), ds); #endif } } diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index fafc08aa1a..b0e6ab8322 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -69,7 +69,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, __global _FLOAT_PREC* __restrict resultSaveMean, __global _FLOAT_PREC* __restrict resultSaveInvVariance #endif -) + ) { // SPATIAL @@ -181,6 +181,15 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, //=========== +#ifndef MIO_LAYOUT_NHWC 
+#define MIO_LAYOUT_NHWC 0 +#endif + +#if MIO_LAYOUT_NHWC == 1 +#define MIO_MAX_READ 1 +#define RD_BLK 1 +#define GRPRD (MIO_BN_GRP0 * RD_BLK) +#else #if(MIO_BN_HW >= 4096) #define MIO_MAX_READ 3 #else @@ -188,6 +197,8 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #endif #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK * 4) +#endif + #define MIO_BN_REM4 (MIO_BN_NHW - ((MIO_BN_NHW / GRPRD) * GRPRD)) #define MIO_BN_LESS4 (MIO_BN_NHW - MIO_BN_REM4) #define MIO_BN_CHUNK4 (MIO_MAX_READ * GRPRD) @@ -216,7 +227,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, __global _FLOAT_PREC* __restrict resultSaveMean, __global _FLOAT_PREC* __restrict resultSaveInvVariance #endif -) + ) { // SPATIAL @@ -232,7 +243,9 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, uint index = 0; uint lid = get_local_id(0); uint grpid = get_group_id(0); +#if MIO_LAYOUT_NHWC == 0 uint chwid = grpid * MIO_BN_HW; +#endif uint nidx = 0; uint hwidx = 0; @@ -243,7 +256,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, } barrier(CLK_LOCAL_MEM_FENCE); -#if(MIO_BN_HW >= 4096) +#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) _FLOAT4 read4; __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; k += GRPRD) @@ -288,7 +301,11 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC xin = (_FLOAT_PREC)(*(in + index)); mean += xin; variance = mad(xin, xin, variance); @@ -299,7 +316,11 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); - index = nidx * MIO_BN_CHW + chwid + hwidx; +#if 
MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else + index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif _FLOAT_PREC xin = (index < MIO_BN_NCHW) ? (_FLOAT_PREC)(*(in + index)) : (_FLOAT_PREC)0.; mean += xin; variance = mad(xin, xin, variance); @@ -329,13 +350,23 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, pvscale = lcl_scale; pvbias = lcl_bias; -#if(MIO_BN_REM == 0) - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < MIO_BN_LESS; +#if(MIO_LAYOUT_NHWC == 1 || MIO_BN_REM == 0) + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < +#if MIO_LAYOUT_NHWC == 1 + MIO_BN_NHW +#else + MIO_BN_LESS +#endif + ; k += MIO_BN_GRP0) { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); +#if MIO_LAYOUT_NHWC == 1 + index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; +#else index = nidx * MIO_BN_CHW + chwid + hwidx; +#endif out[index] = (_FLOAT)mad(pvscale, ((_FLOAT_PREC)(*(in + index)) - mean) * invVariance, pvbias); } // end for @@ -478,7 +509,7 @@ MIOpenBatchNormFwdTrainSpatialFinalMeanVariance( , __global _FLOAT* __restrict resultSaveInvVariance #endif -) + ) { _FLOAT variance = (_FLOAT)0.; _FLOAT invVariance = (_FLOAT)0.; @@ -613,7 +644,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, __global _FLOAT_PREC* __restrict resultSaveMean, __global _FLOAT_PREC* __restrict resultSaveInvVariance #endif -) + ) { // SPATIAL diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 70b4aef407..ff632ec057 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -90,83 +90,94 @@ ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( unsigned int ldsnogcn = 0; int variant = 1; - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 
implementation which parallelize - // work groups over channels and loop through NHW. - //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + if(problem.IsLayoutNHWC()) { - variant = 1; xlocalsize = 1024; xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) + else { - if((n > 64) && (in_cstride > 160)) + //************************************************************************************************* + // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize + // work groups over channels and loop through NHW. 
+ //************************************************************************************************* + if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) { - variant = 3; + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + //************************************************************************************************* + // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which + // parallelize + // work groups over channels and loop through N. + //************************************************************************************************* + else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) + { + variant = (n >= 32) ? 1 : 3; xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - else + //************************************************************************************************* + // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and + // H*W + //************************************************************************************************* + else if(in_cstride <= 512) { - variant = 0; - if(bfp32parm) + if((n > 64) && (in_cstride > 160)) { - xlocalsize = 1024; - xgridsize = 1024 * c; + variant = 3; + xlocalsize = + std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } else { - xlocalsize = 256; - xgridsize = 256 * c; + variant = 0; + if(bfp32parm) + { + xlocalsize = 1024; + xgridsize = 1024 * c; + } + else + { + xlocalsize = 256; + xgridsize = 256 * c; + } + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; } - } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 
implementation which parallelize - // work groups over channels and data segments. - //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; + //************************************************************************************************* + // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize + // work groups over channels and data segments. + //************************************************************************************************* + else + { + variant = 2; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) + { + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } } auto result = ConvSolution{miopenStatusSuccess}; @@ -199,6 +210,7 @@ ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, {"MIO_BN_GFX1030", ((handle.GetDeviceName() == "gfx1030") ? 
"1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, }; kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 2f3d82192b..cd94533335 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -99,85 +99,95 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, unsigned int ldsnogcn = 0; int variant = 1; - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize - // work groups over channels and loop through NHW. - //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + if(problem.IsLayoutNHWC()) { - variant = 1; xlocalsize = 1024; xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 
1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) + else { - if((n > 64) && (in_cstride > 160)) + //************************************************************************************************* + // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize + // work groups over channels and loop through NHW. + //************************************************************************************************* + if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + { + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + //************************************************************************************************* + // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which + // parallelize + // work groups over channels and loop through N. + //************************************************************************************************* + else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) { - variant = 3; + variant = (n >= 32) ? 
1 : 3; xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); xgridsize = c * xlocalsize; ldsgcn = xlocalsize / 64; ldsnogcn = xlocalsize; } - else + //************************************************************************************************* + // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and + // H*W + //************************************************************************************************* + else if(in_cstride <= 512) { - variant = 0; - if(bfp32parm) + if((n > 64) && (in_cstride > 160)) { - xlocalsize = 1024; - xgridsize = 1024 * c; + variant = 3; + xlocalsize = + std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } else { - xlocalsize = 256; - xgridsize = 256 * c; + variant = 0; + if(bfp32parm) + { + xlocalsize = 1024; + xgridsize = 1024 * c; + } + else + { + xlocalsize = 256; + xgridsize = 256 * c; + } + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; + } + //************************************************************************************************* + // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize + // work groups over channels and data segments. 
+ //************************************************************************************************* + else + { + variant = 2; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) + { + variant = 1; + xlocalsize = 1024; + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; } } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize - // work groups over channels and data segments. - //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - auto result = ConvSolution{miopenStatusSuccess}; { @@ -203,6 +213,7 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, {"MIO_BN_GRP0", xlocalsize}, {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, }; if((n > 64) && (n % 2 == 0) && (variant == 3) && (bfpmixparm) && (problem.UseSaved()) && @@ -210,7 +221,8 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, (StartsWith(handle.GetDeviceName(), "gfx8") || (StartsWith(handle.GetDeviceName(), "gfx9") #if WORKAROUND_ISSUE_1146 - && (handle.GetDeviceName() != "gfx90a") + && + (handle.GetDeviceName() != "gfx90a") #endif )) && (!handle.GetTargetProperties().Xnack() 
|| !*handle.GetTargetProperties().Xnack())) diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp index b7566ec105..7cf04cc4f4 100644 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ b/src/solver/batchnorm/forward_spatial_multiple.cpp @@ -93,24 +93,26 @@ ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( unsigned int ldsgcn = xlocalsize / 64; unsigned int ldsnogcn = xlocalsize; -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) + if(!problem.IsLayoutNHWC()) { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else +#if(WORKAROUND_SWDEV_253606 == 0) + if(n < 3) + { + variant = 4; + xlocalsize = 256; + xgridsize = c * xlocalsize; + ylocalsize = 1; + ygridsize = 1; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + else #endif - // clang-format off + // clang-format off if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) + ((n >= 256) && (in_cstride > 60) && bfpmixparm) || + ((in_cstride > 512) && bfpmixparm)) { variant = 1; } @@ -129,18 +131,19 @@ ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( ldsgcn = ylocalsize / 64; ldsnogcn = ylocalsize; } - // clang-format on + // clang-format on - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; + if((n > 768) && (in_cstride > 150) && bfp32parm) + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } } auto result = 
ConvSolution{miopenStatusSuccess}; @@ -174,6 +177,7 @@ ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, {"MIO_BN_GFX1030", ((handle.GetDeviceName() == "gfx1030") ? "1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, }; kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index c357a3b0f2..d36250eb7a 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -129,54 +129,57 @@ BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, unsigned int ldsgcn = xlocalsize / 64; unsigned int ldsnogcn = xlocalsize; -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) + if(!problem.IsLayoutNHWC()) { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else +#if(WORKAROUND_SWDEV_253606 == 0) + if(n < 3) + { + variant = 4; + xlocalsize = 256; + xgridsize = c * xlocalsize; + ylocalsize = 1; + ygridsize = 1; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } + else #endif - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on + // clang-format off + if((in_nhw < 33554432 && in_cstride > 1024) || + ((n >= 256) && (in_cstride > 60) && bfpmixparm) || + ((in_cstride > 512) && bfpmixparm)) + { + variant = 1; + } + else if(in_cstride <= 512) + { + variant = 0; + 
} + else + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + // clang-format on - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; + if((n > 768) && (in_cstride > 150) && bfp32parm) + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } } auto result = ConvSolution{miopenStatusSuccess}; @@ -204,6 +207,7 @@ BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, {"MIO_BN_GRP1", ylocalsize}, {"MIO_BN_GRP2", zlocalsize}, {"MIO_BN_GFX1030", ((handle.GetDeviceName() == "gfx1030") ? 
{"MIO_LAYOUT_NHWC", static_cast<int>(problem.IsLayoutNHWC())},
__restrict x_in, hwidx * MIO_BN_C + grpid; if(index < MIO_BN_NCHW) { - xread = *((const global _FLOAT*)(x_in + index)); - dyRead = *((const global _FLOAT*)(dy_in + index)); - xhat = ((_FLOAT_PREC)xread - mean) * invVariance; + xread = *((const global _FLOAT*)(x_in + index)); + dyRead = *((const global _FLOAT*)(dy_in + index)); + xhat_tmp = ((_FLOAT_PREC)xread - mean) * invVariance; db += (_FLOAT_PREC)dyRead; - ds = mad(xhat.x, (_FLOAT_PREC)dyRead.x, ds); + ds = mad(xhat_tmp, (_FLOAT_PREC)dyRead, ds); #else chwid + hwidx; if(index < (MIO_BN_NCHW - 3)) diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index cd94533335..623fdbf847 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -46,6 +46,9 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( problem.GetMode() != miopenBNSpatial) return false; + if(problem.IsLayoutNHWC()) + return true; + int n, c, h, w; std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index d36250eb7a..48ad737fe3 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -46,6 +46,9 @@ bool BnFwdTrainingSpatialSingle::IsApplicable( problem.GetMode() != miopenBNSpatial) return false; + if(problem.IsLayoutNHWC()) + return true; + int n, c, h, w; std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp new file mode 100644 index 0000000000..a1f0407402 --- /dev/null +++ b/test/bn_spatial_nhwc_test.cpp @@ -0,0 +1,743 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "driver.hpp" +#include "get_handle.hpp" +#include "tensor_holder.hpp" +#include "test.hpp" +#include "verify.hpp" +#include "random.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MIO_BN_TEST_EXPAVGFACTOR 0.1 +#define MIO_BN_TEST_EPSILON 1e-5 +#define MIO_BN_USE_MIX_PREC 1 +#if MIO_BN_USE_MIX_PREC == 1 +#define PREC_TYPE float +#else +#define PREC_TYPE T +#endif + +template +struct verify_forward_train_bn_spatial +{ + const tensor input; + const tensor scale; + const tensor shift; + std::tuple, tensor, tensor, tensor, tensor> cpu() const + { + double epsilon = MIO_BN_TEST_EPSILON; + double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + + std::size_t rs_n_batch, rs_channels, rs_height, rs_width; + auto derivedBnDesc = + miopen::TensorDescriptor(input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + tensor runMean; + tensor runVar; + if(input.desc.GetType() == miopenFloat) + { + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + } + else + { + srand(0); + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) + { + runMean[i] = (((GET_RAND() % 2) == 1) ? 
-1 : 1) * 1e-3 * U(GET_RAND() % 100); + runVar[i] = 1e-3 * U(GET_RAND() % 100); + } + } + auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + auto out = input; + std::fill(out.begin(), out.end(), 0); + + const auto nhw = double(height * width * n_batch); + par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + double variance_accum = 0.; + double mean_accum = 0.; + double invVar = 0.; + double newRunMean = 0.; + double adjust = 0.; + + std::vector variance_accum_arr(height, 0.0); + std::vector mean_accum_arr(height, 0.0); + std::vector dshift_accum_arr(height, 0.0); + std::vector dscale_accum_arr(height, 0.0); + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + mean_accum_arr[row] += input(bidx, cidx, row, column); + } + } + } + for(std::size_t i = 0; i < height; i++) + mean_accum += mean_accum_arr[i]; + + mean_accum /= nhw; + + elemStd = 0.; + variance_accum = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + out(bidx, cidx, row, column) = elemStd = + input(bidx, cidx, row, column) - mean_accum; + variance_accum_arr[row] += elemStd * elemStd; + } + } + } + for(std::size_t i = 0; i < height; i++) + variance_accum += variance_accum_arr[i]; + + variance_accum /= nhw; + invVar = 1.0 / sqrt(variance_accum + epsilon); + + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + out(bidx, cidx, row, column) = + scale(0, 0, 0, cidx) * (invVar * out(bidx, cidx, row, column)) + + shift(0, 0, 0, cidx); + } + } + } + + saveMean(0, 0, 0, cidx) = mean_accum; + saveInvVar(0, 0, 0, cidx) = invVar; + + newRunMean = runMean(0, 0, 0, 
cidx) * (1 - expAvgFactor); + runMean(0, 0, 0, cidx) = mean_accum * expAvgFactor + newRunMean; + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, 0, 0, cidx) = + (1 - expAvgFactor) * runVar(0, 0, 0, cidx) + expAvgFactor * adjust; + }); + + return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); + } + + std::tuple, tensor, tensor, tensor, tensor> gpu() const + { + auto&& handle = get_handle(); + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); + + auto out = input; + std::fill(out.begin(), out.end(), 0); + + std::size_t rs_n_batch, rs_channels, rs_height, rs_width; + auto derivedBnDesc = + miopen::TensorDescriptor(input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + tensor runMean; + tensor runVar; + if(input.desc.GetType() == miopenFloat) + { + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( + tensor_elem_gen_integer{17}); + } + else + { + srand(0); + runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) + { + runMean[i] = (((GET_RAND() % 2) == 1) ? 
-1 : 1) * 1e-3 * U(GET_RAND() % 100); + runVar[i] = 1e-3 * U(GET_RAND() % 100); + } + } + + auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; + + auto in_dev = handle.Write(input.data); + auto scale_dev = handle.Write(scale.data); + auto shift_dev = handle.Write(shift.data); + + auto runMean_dev = handle.Write(runMean.data); + auto runVar_dev = handle.Write(runVar.data); + auto saveMean_dev = handle.Create(channels); + auto saveInvVar_dev = handle.Create(channels); + auto out_dev = handle.Create(n_batch * channels * height * width); + + double epsilon = MIO_BN_TEST_EPSILON; + double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; + + float alpha = 1.0; + float beta = 0.0; + + miopen::BatchNormForwardTraining(handle, + miopenBNSpatial, + &alpha, + &beta, + input.desc, + in_dev.get(), + out.desc, + out_dev.get(), + scale.desc, + scale_dev.get(), + shift_dev.get(), + expAvgFactor, + runMean_dev.get(), + runVar_dev.get(), + epsilon, + saveMean_dev.get(), + saveInvVar_dev.get()); + + saveMean.data = handle.Read(saveMean_dev, saveMean.data.size()); + saveInvVar.data = handle.Read(saveInvVar_dev, saveInvVar.data.size()); + runMean.data = handle.Read(runMean_dev, runMean.data.size()); + runVar.data = handle.Read(runVar_dev, runVar.data.size()); + out.data = handle.Read(out_dev, out.data.size()); + + return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); + } + + void fail(int badtensor) const + { + std::cout << "Forward Train Spatial Batch Normalization: " << std::endl; + std::cout << "Input tensor: " << input.desc.ToString() << std::endl; + + switch(badtensor) + { + case(0): std::cout << "Output tensor output failed verification." << std::endl; break; + case(1): std::cout << "Running Mean output tensor failed verification." << std::endl; break; + case(2): + std::cout << "Running Variance output tensor failed verification." 
<< std::endl; + break; + case(3): std::cout << "Saved Mean tensor failed verification." << std::endl; break; + case(4): std::cout << "Saved Variance tensor failed verification." << std::endl; break; + default: break; + } + } +}; + +template +struct verify_backward_bn_spatial_recalc +{ + const tensor x_input; + const tensor dy_input; + const tensor scale; + + std::tuple, tensor, tensor> cpu() const + { + double epsilon = MIO_BN_TEST_EPSILON; + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + const auto nhw = double(height * width * n_batch); + + par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = 0.; + double invVar = 0.; + double dyelem = 0.; + double variance = 0.; + + std::vector xhat(height * width * n_batch, 0.0); + std::vector variance_accum_arr(height, 0.0); + std::vector mean_accum_arr(height, 0.0); + std::vector dshift_accum_arr(height, 0.0); + std::vector dscale_accum_arr(height, 0.0); + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + mean_accum_arr[row] += x_input(bidx, cidx, row, column); + } + } + } + for(std::size_t i = 0; i < height; i++) + mean += mean_accum_arr[i]; 
+ + mean /= nhw; + + elemStd = 0.; + variance = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + elemStd = x_input(bidx, cidx, row, column) - mean; + variance_accum_arr[row] += elemStd * elemStd; + } + } + } + for(std::size_t i = 0; i < height; i++) + variance += variance_accum_arr[i]; + + variance /= nhw; + invVar = 1. / double(sqrt(variance + epsilon)); + + dscale(0, cidx, 0, 0) = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + elemStd = x_input(bidx, cidx, row, column) - mean; + xhat[xhat_index] = elemStd * invVar; + dyelem = dy_input(bidx, cidx, row, column); + dshift_accum_arr[row] += dyelem; + dscale_accum_arr[row] += xhat[xhat_index] * dyelem; + } + } + } + for(std::size_t i = 0; i < height; i++) + { + dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; + dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; + } + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + + double tmp1 = + nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); + } + } + } + }); + + return std::make_tuple(dx_out, dscale, dshift); + } + + std::tuple, tensor, tensor> gpu() const + { + auto&& handle = get_handle(); + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + 
std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + float alpha = 1.0; + float beta = 0.0; + + auto xin_dev = handle.Write(x_input.data); + auto dyin_dev = handle.Write(dy_input.data); + auto scale_dev = handle.Write(scale.data); + auto dscale_dev = handle.Write(dscale.data); + auto dshift_dev = handle.Write(dshift.data); + auto dx_out_dev = handle.Write(dx_out.data); + + double epsilon = MIO_BN_TEST_EPSILON; + + miopen::BatchNormBackward(handle, + miopenBNSpatial, + &alpha, + &beta, + &alpha, + &beta, + x_input.desc, + xin_dev.get(), + dy_input.desc, + dyin_dev.get(), + dx_out.desc, + dx_out_dev.get(), + scale.desc, + scale_dev.get(), + dscale_dev.get(), + dshift_dev.get(), + epsilon, + nullptr, + nullptr); + + dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); + dscale.data = handle.Read(dscale_dev, dscale.data.size()); + dshift.data = handle.Read(dshift_dev, dshift.data.size()); + + return std::make_tuple(dx_out, dscale, dshift); + } + + void fail(int badtensor) const + { + std::cout << "Backward Batch Spatial Normalization Recalc Mean and Variance: " << std::endl; + std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; + std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; + switch(badtensor) + { + case(0): + std::cout << "Delta X output tensor output failed verification." << std::endl; + break; + case(1): std::cout << "Delta scale output tensor failed verification." 
<< std::endl; break; + case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; + default: break; + } + } +}; + +template +struct verify_backward_bn_spatial_use_saved +{ + const tensor x_input; + const tensor dy_input; + const tensor scale; + const tensor savedMean; + const tensor savedInvVar; + std::tuple, tensor, tensor> cpu() const + { + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + const auto nhw = double(height * width * n_batch); + + par_for(channels, 1, [&](int cidx) { + double elemStd = 0.; + unsigned int xhat_index; + double mean = savedMean(0, 0, 0, cidx); + double invVar = savedInvVar(0, 0, 0, cidx); + double dyelem = 0.; + + std::vector xhat(n_batch * height * width, 0.0); + std::vector dshift_accum_arr(height, 0.0); + std::vector dscale_accum_arr(height, 0.0); + dscale(0, cidx, 0, 0) = 0.; + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + elemStd = x_input(bidx, cidx, row, column) - mean; + xhat[xhat_index] = elemStd * invVar; + dyelem = dy_input(bidx, cidx, row, column); + dshift_accum_arr[row] += dyelem; + 
dscale_accum_arr[row] += xhat[xhat_index] * dyelem; + } + } + } + for(std::size_t i = 0; i < height; i++) + { + dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; + dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; + } + + for(std::size_t row = 0; row < height; row++) + { + for(std::size_t column = 0; column < width; column++) + { + for(std::size_t bidx = 0; bidx < n_batch; bidx++) + { + xhat_index = height * width * bidx + (width * row + column); + + double tmp1 = + nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); + double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); + double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); + } + } + } + }); + + return std::make_tuple(dx_out, dscale, dshift); + } + + std::tuple, tensor, tensor> gpu() const + { + auto&& handle = get_handle(); + + std::size_t n_batch, channels, height, width; + std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); + + auto dx_out = dy_input; + std::fill(dx_out.begin(), dx_out.end(), 0); + + std::size_t ss_n_batch, ss_channels, ss_height, ss_width; + auto derivedBnDesc = + miopen::TensorDescriptor(x_input.desc.GetType(), + std::vector{1, 1, 1, channels}, + std::vector{channels, channels, channels, 1}); + std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = + miopen::tien<4>(derivedBnDesc.GetLengths()); + + auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dscale.begin(), dscale.end(), 0); + + auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; + std::fill(dshift.begin(), dshift.end(), 0); + + float alpha = 1.0; + float beta = 0.0; + + auto xin_dev = handle.Write(x_input.data); + auto dyin_dev = handle.Write(dy_input.data); + auto scale_dev = handle.Write(scale.data); + auto dscale_dev = handle.Write(dscale.data); + auto dshift_dev = handle.Write(dshift.data); + auto dx_out_dev = handle.Write(dx_out.data); + auto savedMean_dev = 
handle.Write(savedMean.data); + auto savedInvVar_dev = handle.Write(savedInvVar.data); + + double epsilon = MIO_BN_TEST_EPSILON; + + miopen::BatchNormBackward(handle, + miopenBNSpatial, + &alpha, + &beta, + &alpha, + &beta, + x_input.desc, + xin_dev.get(), + dy_input.desc, + dyin_dev.get(), + dx_out.desc, + dx_out_dev.get(), + scale.desc, + scale_dev.get(), + dscale_dev.get(), + dshift_dev.get(), + epsilon, + savedMean_dev.get(), + savedInvVar_dev.get()); + + dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); + dscale.data = handle.Read(dscale_dev, dscale.data.size()); + dshift.data = handle.Read(dshift_dev, dshift.data.size()); + + return std::make_tuple(dx_out, dscale, dshift); + } + + void fail(int badtensor) const + { + std::cout << "Backward Batch Spatial Normalization Use Saved Mean and Variance: " + << std::endl; + std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; + std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; + switch(badtensor) + { + case(0): + std::cout << "Delta X output tensor output failed verification." << std::endl; + break; + case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break; + case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; + default: break; + } + } +}; + +template +struct batch_norm_spatial_nhwc_driver : test_driver +{ + tensor input; + tensor scale; + tensor shift; + batch_norm_spatial_nhwc_driver() + { + this->batch_factor = 4; + add(input, + "input", + get_bn_spatial_input_tensor( + tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 
5 : 17})); + } + + void run() + { + std::size_t n, c, h, w; + std::tie(n, c, h, w) = miopen::tien<4>(input.desc.GetLengths()); + + std::size_t ssn, ssc, ssh, ssw; + auto derivedBnDesc = miopen::TensorDescriptor(input.desc.GetType(), + std::vector{1, 1, 1, c}, + std::vector{c, c, c, 1}); + std::tie(ssn, ssh, ssw, ssc) = miopen::tien<4>(derivedBnDesc.GetLengths()); + + std::vector new_len = input.desc.GetLengths(); + std::vector new_str; + miopen::tensor_layout_to_strides(new_len, "NCHW", "NHWC", new_str); + input.desc = miopen::TensorDescriptor(miopen_type{}, new_len, new_str); + + if(input.desc.GetType() == miopenFloat) + { + scale = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); + shift = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); + } + else + { + srand(0); + scale = tensor{ssn, ssh, ssw, ssc}; + shift = tensor{ssn, ssh, ssw, ssc}; + for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++) + { + scale[i] = (((GET_RAND() % 2) == 1) ? -1 : 1) * 1e-4 * PREC_TYPE(GET_RAND() % 100); + shift[i] = (((GET_RAND() % 2) == 1) ? -1 : 1) * 1e-4 * PREC_TYPE(GET_RAND() % 100); + } + for(std::size_t i = 0; i < input.desc.GetElementSize(); i++) + { + input[i] = (((GET_RAND() % 2) == 1) ? 
-1 : 1) * (1e-5 * T(GET_RAND() % 100)); + } + } + + auto outpair = verify(verify_forward_train_bn_spatial{input, scale, shift}); + + auto dy_input = std::get<0>(outpair.second); + for(std::size_t bidx = 0; bidx < n; bidx++) + { + for(std::size_t cidx = 0; cidx < c; cidx++) + { + for(std::size_t row = 0; row < h; row++) + { + for(std::size_t column = 0; column < w; column++) + { + dy_input(bidx, cidx, row, column) *= 0.1; + } + } + } + } + this->tolerance = 80 * input.desc.GetElementSize(); + verify(verify_backward_bn_spatial_recalc{input, dy_input, scale}); + + auto savedMean = std::get<3>(outpair.second); + auto savedInvVar = std::get<4>(outpair.second); + verify(verify_backward_bn_spatial_use_saved{ + input, dy_input, scale, savedMean, savedInvVar}); + } +}; + +int main(int argc, const char* argv[]) +{ + test_drive(argc, argv); + exit(0); +} From 0cd8e89887da9af1d5cbd79f3274bed284b76b16 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Fri, 19 Nov 2021 11:07:41 -0800 Subject: [PATCH 03/10] Fix Tidy Issue --- test/bn_spatial_nhwc_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp index a1f0407402..4a72bfecf0 100644 --- a/test/bn_spatial_nhwc_test.cpp +++ b/test/bn_spatial_nhwc_test.cpp @@ -739,5 +739,5 @@ struct batch_norm_spatial_nhwc_driver : test_driver int main(int argc, const char* argv[]) { test_drive(argc, argv); - exit(0); + return 0; } From 8285fe87b06eec53783ef1921b9dd2b6b5a67136 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Mon, 13 Dec 2021 15:13:07 -0800 Subject: [PATCH 04/10] Fix tidy and review requests --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 20 ++---- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 13 ++-- .../batchnorm/forward_spatial_single.cpp | 69 ++++++++++--------- 3 files changed, 45 insertions(+), 57 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index d7bee08300..3556185f20 100644 --- 
a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -288,7 +288,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int index = 0; unsigned int lid = get_local_id(0); unsigned int grpid = get_group_id(0); -#if MIO_LAYOUT_NHWC == 0 +#if !MIO_LAYOUT_NHWC unsigned int chwid = grpid * MIO_BN_HW; #endif unsigned int nidx = 0; @@ -307,7 +307,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if(MIO_BN_USESAVED == 0) //==== CALC MEAN and VARIANCE ONCE AGAIN ======================= _FLOAT_PREC variance = (_FLOAT_PREC)0.; -#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) +#if !MIO_LAYOUT_NHWC && MIO_BN_HW >= 4096 _FLOAT4 read4; #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; @@ -426,23 +426,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, _FLOAT_PREC4 xhat4; #endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = -#if MIO_LAYOUT_NHWC == 1 - lid -#else - lid << 2 -#endif - ; + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = -#if MIO_LAYOUT_NHWC == 1 - lid -#else - lid << 2 -#endif - ; + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #endif diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index b0e6ab8322..f715709d5a 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -243,7 +243,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, uint index = 0; uint lid = get_local_id(0); uint grpid = get_group_id(0); -#if MIO_LAYOUT_NHWC == 0 +#if !MIO_LAYOUT_NHWC uint chwid = grpid * MIO_BN_HW; #endif 
uint nidx = 0; @@ -256,7 +256,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, } barrier(CLK_LOCAL_MEM_FENCE); -#if(MIO_LAYOUT_NHWC == 0 && MIO_BN_HW >= 4096) +#if !MIO_LAYOUT_NHWC && MIO_BN_HW >= 4096 _FLOAT4 read4; __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2; k < MIO_BN_LESS4; k += GRPRD) @@ -351,14 +351,13 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, pvbias = lcl_bias; #if(MIO_LAYOUT_NHWC == 1 || MIO_BN_REM == 0) - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < + const unsigned int k_limit = #if MIO_LAYOUT_NHWC == 1 - MIO_BN_NHW + MIO_BN_NHW; #else - MIO_BN_LESS + MIO_BN_LESS; #endif - ; - k += MIO_BN_GRP0) + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid; k < k_limit; k += MIO_BN_GRP0) { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp index 48ad737fe3..fdfa828623 100644 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ b/src/solver/batchnorm/forward_spatial_single.cpp @@ -147,41 +147,42 @@ BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, } else #endif - - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = 
ylocalsize / 64; - ldsnogcn = ylocalsize; + // clang-format off + if( (in_nhw < 33554432 && in_cstride > 1024) || + ((n >= 256) && (in_cstride > 60) && bfpmixparm) || + ((in_cstride > 512) && bfpmixparm)) + { + variant = 1; + } + else if(in_cstride <= 512) + { + variant = 0; + } + else + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } + // clang-format on + + if((n > 768) && (in_cstride > 150) && bfp32parm) + { + variant = 2; + xlocalsize = 1; + ylocalsize = 1024; + auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); + xgridsize = c; + ygridsize = segment * ylocalsize; + ldsgcn = ylocalsize / 64; + ldsnogcn = ylocalsize; + } } } From cd19418872e4a8da7904de96d7c94220f7360220 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Wed, 15 Dec 2021 16:46:24 -0800 Subject: [PATCH 05/10] Fix error and review requests --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 30 +++++++++++-------- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 16 ++++++---- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 3556185f20..671f5a3f49 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -233,7 +233,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #define MIO_LAYOUT_NHWC 0 #endif -#if MIO_LAYOUT_NHWC == 1 +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + +#if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK) @@ -362,7 +366,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + 
hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -377,7 +381,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -416,7 +420,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #endif -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC _FLOAT dyRead; _FLOAT xread; _FLOAT_PREC xhat_tmp; @@ -426,18 +430,18 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, _FLOAT_PREC4 xhat4; #endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << (2 * MIO_LAYOUT_NHWC); + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); k < MIO_BN_LESS4; k += GRPRD) #endif { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; xread = *((const global _FLOAT*)(x_in + index)); dyRead = *((const global _FLOAT*)(dy_in + index)); @@ -465,7 +469,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if(MIO_BN_REM4) unsigned int remkey = -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC lid #else (lid << 2) @@ -474,7 +478,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); index = nidx * MIO_BN_CHW + -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC hwidx * MIO_BN_C + grpid; if(index < MIO_BN_NCHW) { @@ -549,7 +553,7 @@ MIOpenBatchNormBwdSpatial(const __global 
_FLOAT* __restrict x_in, unsigned int l = k + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -577,7 +581,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = k + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -597,7 +601,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -621,7 +625,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, unsigned int l = remkeyout + j; nidx = l / MIO_BN_HW; hwidx = l - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index f715709d5a..77b46a17ed 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -185,7 +185,11 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #define MIO_LAYOUT_NHWC 0 #endif -#if MIO_LAYOUT_NHWC == 1 +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + +#if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 #define GRPRD (MIO_BN_GRP0 * RD_BLK) @@ -301,7 +305,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC 
index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -316,7 +320,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, unsigned int remkey = lid + MIO_BN_LESS; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; @@ -350,9 +354,9 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, pvscale = lcl_scale; pvbias = lcl_bias; -#if(MIO_LAYOUT_NHWC == 1 || MIO_BN_REM == 0) +#if(MIO_LAYOUT_NHWC || MIO_BN_REM == 0) const unsigned int k_limit = -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC MIO_BN_NHW; #else MIO_BN_LESS; @@ -361,7 +365,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, { nidx = k / MIO_BN_HW; hwidx = k - (nidx * MIO_BN_HW); -#if MIO_LAYOUT_NHWC == 1 +#if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else index = nidx * MIO_BN_CHW + chwid + hwidx; From e4a74f4e4945fd000295e1ae512a245ce47b6b46 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 09:44:24 -0800 Subject: [PATCH 06/10] bugfix: define macro in branch results in error --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 16 ++++++++-------- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 671f5a3f49..5b148a9a14 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -41,6 +41,14 @@ #include "batchnorm_functions.h" #include "reduction_functions.h" +#ifndef MIO_LAYOUT_NHWC +#define MIO_LAYOUT_NHWC 0 +#endif + +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + #if(MIO_BN_VARIANT == 0) #define MIO_BN_SEGTMP_1 (MIO_BN_GRP0 / MIO_BN_HW) @@ 
-229,14 +237,6 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #elif(MIO_BN_VARIANT == 1) -#ifndef MIO_LAYOUT_NHWC -#define MIO_LAYOUT_NHWC 0 -#endif - -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 -#endif - #if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 77b46a17ed..0aa54d53e6 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -41,6 +41,14 @@ #include "batchnorm_functions.h" #include "reduction_functions.h" +#ifndef MIO_LAYOUT_NHWC +#define MIO_LAYOUT_NHWC 0 +#endif + +#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#error MIO_LAYOUT_NHWC must be 0 or 1 +#endif + #if(MIO_BN_VARIANT == 0) #define MIO_BN_SEGTMP_1 (MIO_BN_GRP0 / MIO_BN_HW) @@ -181,14 +189,6 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, //=========== -#ifndef MIO_LAYOUT_NHWC -#define MIO_LAYOUT_NHWC 0 -#endif - -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 -#endif - #if MIO_LAYOUT_NHWC #define MIO_MAX_READ 1 #define RD_BLK 1 From a2b1adabdc859e498138d1042db5873340e77568 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 12:41:58 -0800 Subject: [PATCH 07/10] fix ocl error macro --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 2 +- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 5b148a9a14..6bdfb2c2cd 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -46,7 +46,7 @@ #endif #if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 +#error "MIO_LAYOUT_NHWC must be 0 or 1" #endif #if(MIO_BN_VARIANT == 0) diff --git 
a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 0aa54d53e6..68054d4dcc 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -46,7 +46,7 @@ #endif #if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) -#error MIO_LAYOUT_NHWC must be 0 or 1 +#error "MIO_LAYOUT_NHWC must be 0 or 1" #endif #if(MIO_BN_VARIANT == 0) From 2043ad28b422afd4547bc96216be658977da8073 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 15:36:48 -0800 Subject: [PATCH 08/10] Fix error on macro conditions --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 2 +- src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 6bdfb2c2cd..8227d9dc60 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -45,7 +45,7 @@ #define MIO_LAYOUT_NHWC 0 #endif -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#if (MIO_LAYOUT_NHWC != 0) && (MIO_LAYOUT_NHWC != 1) #error "MIO_LAYOUT_NHWC must be 0 or 1" #endif diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 68054d4dcc..36e00db74a 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -45,7 +45,7 @@ #define MIO_LAYOUT_NHWC 0 #endif -#if (MIO_LAYOUT_NHWC != 0) || (MIO_LAYOUT_NHWC != 1) +#if (MIO_LAYOUT_NHWC != 0) && (MIO_LAYOUT_NHWC != 1) #error "MIO_LAYOUT_NHWC must be 0 or 1" #endif From 3a0735bea4cf907024d5e2013dad7410a4cd6600 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 16:31:32 -0800 Subject: [PATCH 09/10] Error fix regarding format --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 24 +++++++------------ src/kernels/MIOpenBatchNormFwdTrainSpatial.cl | 4 ++-- 2 files changed, 11 insertions(+), 17 deletions(-) diff 
--git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 8227d9dc60..3185c56725 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -369,7 +369,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC in = (_FLOAT_PREC)(*(x_in + index)); mean += in; @@ -384,7 +384,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC in = (index < MIO_BN_NCHW) ? (_FLOAT_PREC)(*(x_in + index)) : (_FLOAT_PREC)0.; mean += in; @@ -430,11 +430,11 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, _FLOAT_PREC4 xhat4; #endif #if(MIO_BN_N > MIO_BN_LOOP_UNROLL_MAXN) - __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); + __attribute__((opencl_unroll_hint(4))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #else - __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC)); + __attribute__((opencl_unroll_hint(2))) for(unsigned int k = lid << 2*(1 - MIO_LAYOUT_NHWC); k < MIO_BN_LESS4; k += GRPRD) #endif @@ -468,13 +468,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, } #if(MIO_BN_REM4) - unsigned int remkey = -#if MIO_LAYOUT_NHWC - lid -#else - (lid << 2) -#endif - + MIO_BN_LESS4; + unsigned int remkey = lid << 2*(1 - MIO_LAYOUT_NHWC) + MIO_BN_LESS4; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); index = nidx * MIO_BN_CHW + @@ -556,7 +550,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = 
nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif dyvalue = (_FLOAT_PREC)(*(dy_in + index)); xhat = ((_FLOAT_PREC)(*(x_in + index)) - mean) * invVariance; @@ -584,7 +578,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif *(dx_out + index) = (_FLOAT)vals[j]; } @@ -604,7 +598,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif if(index < MIO_BN_NCHW) { @@ -628,7 +622,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif if(index < MIO_BN_NCHW) { diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index 36e00db74a..fae80adbb6 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -308,7 +308,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC xin = (_FLOAT_PREC)(*(in + index)); mean += xin; @@ -323,7 +323,7 @@ MIOpenBatchNormFwdTrainSpatial(const __global _FLOAT* __restrict in, #if MIO_LAYOUT_NHWC index = nidx * MIO_BN_CHW + hwidx * MIO_BN_C + grpid; #else - index = nidx * MIO_BN_CHW + chwid + hwidx; + index = nidx * MIO_BN_CHW + chwid + hwidx; #endif _FLOAT_PREC xin = (index 
< MIO_BN_NCHW) ? (_FLOAT_PREC)(*(in + index)) : (_FLOAT_PREC)0.; mean += xin; From f85ceddffc09fef221d1174589550f8adb34bdce Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Thu, 16 Dec 2021 20:52:27 -0800 Subject: [PATCH 10/10] Error fix --- src/kernels/MIOpenBatchNormBwdSpatial.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index 3185c56725..6842b76a04 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -468,7 +468,7 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, } #if(MIO_BN_REM4) - unsigned int remkey = lid << 2*(1 - MIO_LAYOUT_NHWC) + MIO_BN_LESS4; + unsigned int remkey = (lid << 2*(1 - MIO_LAYOUT_NHWC)) + MIO_BN_LESS4; nidx = remkey / MIO_BN_HW; hwidx = remkey - (nidx * MIO_BN_HW); index = nidx * MIO_BN_CHW +