From ea0d0b081cef37ae487876338e0b2db3c43c3665 Mon Sep 17 00:00:00 2001 From: Artem Tamazov Date: Sat, 4 Dec 2021 00:59:40 +0300 Subject: [PATCH 1/3] [HOTFIX][gfx90a][FP16] Disable ConvCkIgemmFwdV6r1DlopsNchw when "ALT FP16" kernel is required (#1320) * gfx90a-disable-nonfp16alt-leftover-01(01) ConvCkIgemmFwdV6r1DlopsNchw --- src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp index f8332fe4b9..42096a5333 100644 --- a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp +++ b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp @@ -97,6 +97,9 @@ bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx) co return false; if(ctx.group_counts != 1) return false; + if(ctx.GetStream().GetTargetProperties().Name() == "gfx90a" && + ctx.conv_problem.IsGfx90aFp16altRequired()) + return false; { // this kernel use int32_t for memory offset, which covers 2GB of memory maximum From 330bdd911d28365f2ecc2eaef359f301ebf0b934 Mon Sep 17 00:00:00 2001 From: Shaojie WANG Date: Sat, 4 Dec 2021 09:23:01 +0800 Subject: [PATCH 2/3] [NCHW][SWDEV-312112] fix local buffer alignment bug on transpose+nhwc kernel's solver (#1324) * [nchw]fix local buffer alignment bug on transpose+nhwc kernel's solver --- src/conv/invokers/impl_gemm_dynamic.cpp | 6 ++++-- src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 3 +++ src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 3 +++ src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp | 6 +++++- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 241fa4fd59..426b4850c0 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -579,7 +579,8 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory( trans_output_idx = idx++; } - const size_t cast_offset = is_nchw ? (trans_output_offset + trans_output_size) : 0; + // 4 bytes alignment to do atomic add + const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0; const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * k * ho * wo : 0; const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1; @@ -886,7 +887,8 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory( trans_output_idx = idx++; } - const size_t cast_offset = is_nchw ? (trans_output_offset + trans_output_size) : 0; + // 4 bytes alignment to do atomic add + const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0; const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * n * c * hi * wi : 0; const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 8695fdb8f4..0dfc549a95 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -927,6 +927,9 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo workspace_size += trans_weight.GetSize(); if(!trans_output.IsSkippable()) workspace_size += trans_output.GetSize(); + + // 4 bytes alignment to do atomic add + workspace_size = ((workspace_size + 3) >> 2) << 2; } if(!ctx.IsFp32()) diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index 78992c4ea2..4bc9d48658 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -777,6 +777,9 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo workspace_size += trans_weight.GetSize(); if(!trans_output.IsSkippable()) workspace_size += trans_output.GetSize(); + + // 4 bytes alignment to do atomic add + workspace_size = ((workspace_size + 3) >> 2) << 2; } if(!ctx.IsFp32()) diff --git a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp index d3236918e2..864852e020 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp @@ -903,6 +903,9 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo workspace_size += trans_weight.GetSize(); if(!trans_output.IsSkippable()) workspace_size += trans_output.GetSize(); + + // 4 bytes alignment to do atomic add + workspace_size = ((workspace_size + 3) >> 2) << 2; } if(!ctx.IsFp32()) @@ -1071,7 +1074,8 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution( MIOPEN_LOG_I2(SolverDbId(*this) << ": " << config.ToString() << msg.str()); - const size_t cast_offset = is_nchw ? (trans_output_offset + trans_output_size) : 0; + // 4 bytes alignment to do atomic add + const size_t cast_offset = is_nchw ? (((trans_output_offset + trans_output_size + 3) >> 2) << 2) : 0; const size_t cast_size = need_cast ? miopen::GetTypeSize(miopenFloat) * k * (c / group) * y * x : 0; From cd0d942bc29fe5a9f3b363c4076dc6ef6d5705ff Mon Sep 17 00:00:00 2001 From: Murali N <33875998+muralinr@users.noreply.github.com> Date: Sat, 4 Dec 2021 13:00:31 -0800 Subject: [PATCH 3/3] [Batchnorm][ASM] Disabling ASM kernel for unsupported target (gfx940) (#1312) --- src/solver/batchnorm/backward_spatial_single.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 2f3d82192b..5ae1c29704 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -210,9 +210,11 @@ BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, (StartsWith(handle.GetDeviceName(), "gfx8") || (StartsWith(handle.GetDeviceName(), "gfx9") #if WORKAROUND_ISSUE_1146 - && (handle.GetDeviceName() != "gfx90a") + && + (handle.GetDeviceName() != "gfx90a") #endif - )) && + && + (handle.GetDeviceName() != "gfx940"))) && (!handle.GetTargetProperties().Xnack() || !*handle.GetTargetProperties().Xnack())) { kernel.kernel_file = "gcnAsmBNBwdTrainSpatial.s";