From 26d801a0c2d7a83c253c1a47d849ffcd0085b504 Mon Sep 17 00:00:00 2001 From: JD Date: Tue, 19 Sep 2023 12:53:55 -0500 Subject: [PATCH 01/36] MIOpen FP8 and BFP8 enablement (#2251) --------- Co-authored-by: xinlipn Co-authored-by: Sujatha Phatak Co-authored-by: Evgenii Averin <86725875+averinevg@users.noreply.github.com> Co-authored-by: Jun Liu --- driver/conv_driver.hpp | 160 ++++- driver/driver.hpp | 34 +- driver/main.cpp | 8 + include/miopen/config.h.in | 2 + include/miopen/miopen.h | 29 +- src/CMakeLists.txt | 9 +- src/check_numerics.cpp | 43 +- src/conv/problem_description.cpp | 2 +- src/convolution.cpp | 13 + src/gemm_v2.cpp | 589 +++++++++++----- src/hipoc/hipoc_program.cpp | 9 +- .../miopen/conv/problem_description.hpp | 19 + src/include/miopen/convolution.hpp | 38 + src/include/miopen/datatype.hpp | 21 +- src/include/miopen/gemm_v2.hpp | 76 +- src/include/miopen/handle.hpp | 4 + src/include/miopen/hip_f8_impl.hpp | 1 + src/include/miopen/hip_float8.hpp | 1 + src/include/miopen/problem_description.hpp | 1 + .../miopen/solver/conv_direct_naive_conv.hpp | 6 +- .../problem_description_interpreter.hpp | 16 + src/include/miopen/tensor.hpp | 10 +- src/include/miopen/visit_float.hpp | 2 + src/kernels/MIOpenCheckNumerics.cpp | 205 ++++++ src/kernels/MIOpenIm2d2Col.cl | 10 +- src/kernels/MIOpenUtilKernels4.cl | 10 +- src/kernels/bfloat16_dev.hpp | 179 +++++ src/kernels/float_types.h | 45 ++ .../gpu_reference_kernel/fp8_kern_types.h | 63 ++ .../gpu_reference_kernel/fp8_naive_conv.cpp | 571 +++++++++++++++ src/kernels/hip_f8_impl.hpp | 361 ++++++++++ src/kernels/hip_float8.hpp | 651 ++++++++++++++++++ src/ocl/tensorocl.cpp | 14 +- src/pooling.cpp | 4 +- src/pooling_api.cpp | 2 + src/reducetensor.cpp | 11 +- src/solver/batchnorm/forward_inference_ck.cpp | 4 + src/solver/conv_MP_bidirectional_winograd.cpp | 3 + src/solver/conv_asm_1x1u.cpp | 6 + src/solver/conv_asm_1x1u_bias_activ_fused.cpp | 7 +- src/solver/conv_asm_1x1u_stride2.cpp | 6 + src/solver/conv_asm_3x3u.cpp | 6 + src/solver/conv_asm_5x10u2v2b1.cpp | 2 + src/solver/conv_asm_5x10u2v2f1.cpp | 3 + .../conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp | 3 + src/solver/conv_asm_dir_BwdWrW1x1.cpp | 3 + src/solver/conv_asm_dir_BwdWrW3x3.cpp | 3 + ...onv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp | 3 + src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp | 3 + .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 3 + src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp | 3 + .../conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp | 3 + .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 3 + .../conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp | 3 + .../conv_asm_implicit_gemm_v4r1_dynamic.cpp | 3 + ...m_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp | 3 + ...onv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp | 3 + src/solver/conv_bin_wino3x3U.cpp | 3 + src/solver/conv_bin_winoRxS.cpp | 3 + src/solver/conv_bin_winoRxS_fused.cpp | 3 + .../conv_ck_igemm_fwd_bias_activ_fused.cpp | 11 + .../conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp | 3 + src/solver/conv_direct_naive_conv.cpp | 57 +- src/solver/conv_direct_naive_conv_bwd.cpp | 93 ++- src/solver/conv_direct_naive_conv_fwd.cpp | 97 ++- src/solver/conv_direct_naive_conv_wrw.cpp | 93 ++- ...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 10 + ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 8 + ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 8 + ...conv_hip_implicit_gemm_bwd_data_xdlops.cpp | 11 + .../conv_hip_implicit_gemm_bwd_v1r1.cpp | 3 + ...conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp | 3 + .../conv_hip_implicit_gemm_bwd_v4r1.cpp | 3 + ...conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp | 
3 + .../conv_hip_implicit_gemm_fwd_v4r1.cpp | 7 + .../conv_hip_implicit_gemm_fwd_v4r4.cpp | 3 + ...conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp | 3 + ...licit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp | 3 + ...conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp | 3 + .../conv_hip_implicit_gemm_fwd_xdlops.cpp | 10 + ...v_hip_implicit_gemm_grouped_fwd_xdlops.cpp | 10 + .../conv_hip_implicit_gemm_wrw_v4r4.cpp | 3 + ...conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp | 3 + ...licit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp | 3 + src/solver/conv_mlir_igemm_bwd.cpp | 2 + src/solver/conv_mlir_igemm_bwd_xdlops.cpp | 2 + src/solver/conv_mlir_igemm_fwd.cpp | 2 + src/solver/conv_mlir_igemm_fwd_xdlops.cpp | 2 + src/solver/conv_mlir_igemm_wrw.cpp | 2 + src/solver/conv_mlir_igemm_wrw_xdlops.cpp | 2 + src/solver/conv_multipass_wino3x3WrW.cpp | 3 + src/solver/conv_ocl_dir2D11x11.cpp | 3 + src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp | 3 + src/solver/conv_ocl_dir2D_bwdWrW_2.cpp | 3 + src/solver/conv_ocl_dir2D_bwdWrW_53.cpp | 3 + src/solver/conv_ocl_dir2Dfwd.cpp | 3 + src/solver/conv_ocl_dir2Dfwd1x1.cpp | 3 + src/solver/conv_ocl_dir2Dfwdgen.cpp | 3 + src/solver/conv_winoRxS.cpp | 3 + src/solver/conv_winoRxS_fused.cpp | 3 + src/solver/gemm.cpp | 171 +++-- src/solver/gemm_bwd.cpp | 126 +++- src/solver/gemm_wrw.cpp | 101 ++- src/solver/mlir_common.cpp | 2 + src/tensor.cpp | 21 + src/tensor_api.cpp | 31 + test/conv_common.hpp | 99 ++- test/cpu_conv.hpp | 203 +++--- test/driver.hpp | 19 + test/gtest/api_convbiasactiv.cpp | 7 +- test/gtest/cba.hpp | 1 + test/gtest/conv_embed_db.cpp | 6 +- test/gtest/conv_hip_igemm_xdlops.cpp | 2 + test/gtest/conv_igemm_mlir.cpp | 7 +- test/gtest/conv_igemm_mlir_xdlops.cpp | 2 + test/gtest/conv_tensor_gen.hpp | 125 ++++ test/gtest/conv_test_base.hpp | 40 +- test/gtest/conv_trans.cpp | 2 + test/gtest/{solver.hpp => get_solver.hpp} | 51 +- test/gtest/solver_bwd.hpp | 185 +++++ test/gtest/solver_bwd_f8.cpp | 68 ++ test/gtest/solver_convasm3x3u.cpp | 55 +- test/gtest/solver_f8.hpp | 263 +++++++ test/gtest/solver_fwd.hpp | 125 ++++ test/gtest/solver_fwd_f8.cpp | 70 ++ test/gtest/solver_wrw.hpp | 186 +++++ test/gtest/solver_wrw_f8.cpp | 41 ++ test/gtest/tensor_api.cpp | 2 +- test/perf_models/resnet50_v1.5.sh | 2 +- test/tensor_holder.hpp | 24 + test/verify.hpp | 34 +- 131 files changed, 5222 insertions(+), 646 deletions(-) create mode 120000 src/include/miopen/hip_f8_impl.hpp create mode 120000 src/include/miopen/hip_float8.hpp create mode 100644 src/kernels/MIOpenCheckNumerics.cpp create mode 100644 src/kernels/gpu_reference_kernel/fp8_kern_types.h create mode 100644 src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp create mode 100644 src/kernels/hip_f8_impl.hpp create mode 100644 src/kernels/hip_float8.hpp create mode 100644 test/gtest/conv_tensor_gen.hpp rename test/gtest/{solver.hpp => get_solver.hpp} (59%) create mode 100644 test/gtest/solver_bwd.hpp create mode 100644 test/gtest/solver_bwd_f8.cpp create mode 100644 test/gtest/solver_f8.hpp create mode 100644 test/gtest/solver_fwd.hpp create mode 100644 test/gtest/solver_fwd_f8.cpp create mode 100644 test/gtest/solver_wrw.hpp create mode 100644 test/gtest/solver_wrw_f8.cpp mode change 100755 => 100644 test/perf_models/resnet50_v1.5.sh diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 868ebcdccc..5f67b83588 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -155,6 +155,26 @@ void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) } } +static inline miopenDataType_t DataTypeFromShortString(const std::string& 
type)
+{
+    static const std::unordered_map<std::string, miopenDataType_t> conv_map = {
+        {"fp32", miopenFloat},
+        {"fp16", miopenHalf},
+        {"bf16", miopenBFloat16},
+        {"fp8", miopenFloat8},
+        {"bf8", miopenBFloat8}};
+
+    const auto res = conv_map.find(type);
+    if(res != conv_map.end())
+    {
+        return res->second;
+    }
+    else
+    {
+        MIOPEN_THROW("Invalid compute/cast type shorthand supplied");
+    }
+}
+
 template <typename T>
 bool readBufferFromFile(T* data, size_t dataNumItems, const char* fileName)
 {
@@ -225,6 +245,7 @@ class ConvDriver : public Driver
     int ChkLayout_ShortName();
     int GetandSetData() override;
+    bool TensorsCasted() const;
     std::vector<int> GetInputTensorLengthsFromCmdLine();
     std::vector<int> GetWeightTensorLengthsFromCmdLine();
     std::vector<int> GetBiasTensorLengthsFromCmdLine();
@@ -381,9 +402,14 @@ class ConvDriver : public Driver
         // Computation error of fp16 is ~2^13 (=8192) bigger than
         // the one of fp32 because mantissa is shorter by 13 bits.
         auto tolerance = (sizeof(Tgpu) == 4 || sizeof(Tgpu) == 1) ? 1.5e-6 : 8.2e-3;
+        // bf16 mantissa has 7 bits, 3 bits shorter than fp16's.
         if(std::is_same<Tgpu, bfloat16>::value)
             tolerance *= 8.0;
+        constexpr bool is_fp8  = std::is_same<Tgpu, float8>::value;
+        constexpr bool is_bfp8 = std::is_same<Tgpu, bfloat8>::value;
+        if(is_bfp8 || is_fp8 || TensorsCasted())
+            tolerance *= 37.0;
         return tolerance;
     }
@@ -557,6 +583,34 @@ int ConvDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
     if(solution_value >= 0)
         immediate_solution = solution_value;
+    const std::set<std::string> valid_cast_types = {"fp32", "fp16", "bf16", "fp8", "bf8"};
+    if(inflags.GetValueStr("in_cast_type") != "-1")
+    {
+        const auto in_cast_type = inflags.GetValueStr("in_cast_type");
+        if(valid_cast_types.find(in_cast_type) == valid_cast_types.end())
+        {
+            std::cout << "Invalid value for in_cast_type argument: " << in_cast_type << std::endl;
+            return 1;
+        }
+    }
+    if(inflags.GetValueStr("wei_cast_type") != "-1")
+    {
+        const auto wei_cast_type = inflags.GetValueStr("wei_cast_type");
+        if(valid_cast_types.find(wei_cast_type) == valid_cast_types.end())
+        {
+            std::cout << "Invalid value for wei_cast_type argument: " << wei_cast_type << std::endl;
+            return 1;
+        }
+    }
+    if(inflags.GetValueStr("out_cast_type") != "-1")
+    {
+        const auto out_cast_type = inflags.GetValueStr("out_cast_type");
+        if(valid_cast_types.find(out_cast_type) == valid_cast_types.end())
+        {
+            std::cout << "Invalid value for out_cast_type argument: " << out_cast_type << std::endl;
+            return 1;
+        }
+    }
     return 0;
 }
@@ -619,6 +673,14 @@ int ConvDriver<Tgpu, Tref>::ChkLayout_ShortName()
     }
 }
+template <typename Tgpu, typename Tref>
+bool ConvDriver<Tgpu, Tref>::TensorsCasted() const
+{
+    return inflags.GetValueStr("in_cast_type") != "-1" ||
+           inflags.GetValueStr("wei_cast_type") != "-1" ||
+           inflags.GetValueStr("out_cast_type") != "-1";
+}
+
 template <typename Tgpu, typename Tref>
 int ConvDriver<Tgpu, Tref>::GetandSetData()
 {
     std::vector<int> in_len  = GetInputTensorLengthsFromCmdLine();
     std::vector<int> wei_len = GetWeightTensorLengthsFromCmdLine();
     SetTensorNd(inputTensor, in_len, inflags.GetValueStr("in_layout"), data_type);
+    if(inflags.GetValueStr("in_cast_type") != "-1")
+    {
+        const auto in_cast_type = DataTypeFromShortString(inflags.GetValueStr("in_cast_type"));
+        miopenSetTensorCastType(inputTensor, in_cast_type);
+    }
     SetTensorNd(weightTensor, wei_len, inflags.GetValueStr("fil_layout"), data_type);
+    if(inflags.GetValueStr("wei_cast_type") != "-1")
+    {
+        const auto wei_cast_type = DataTypeFromShortString(inflags.GetValueStr("wei_cast_type"));
+        miopenSetTensorCastType(weightTensor, wei_cast_type);
+    }
     if(inflags.GetValueInt("tensor_vect") == 1 && data_type == miopenInt8)
     {
@@ -658,6 +730,11 @@ int ConvDriver<Tgpu, Tref>::GetandSetData()
     miopenDataType_t y_type
= (data_type == miopenInt8 || data_type == miopenInt8x4) ? miopenInt32 : data_type; SetTensorNd(outputTensor, out_len, inflags.GetValueStr("out_layout"), y_type); + if(inflags.GetValueStr("out_cast_type") != "-1") + { + const auto out_cast_type = DataTypeFromShortString(inflags.GetValueStr("out_cast_type")); + miopenSetTensorCastType(outputTensor, out_cast_type); + } if(inflags.GetValueInt("bias") != 0) { @@ -821,6 +898,12 @@ int ConvDriver::AddCmdLineArgs() "\n Immediate mode, build and run specified solution" "\n Use Find() API", "string"); + inflags.AddInputFlag( + "in_cast_type", 'U', "-1", "Cast type for input tensor, default to not set", "string"); + inflags.AddInputFlag( + "out_cast_type", 'T', "-1", "Cast type for output tensor, default to not set", "string"); + inflags.AddInputFlag( + "wei_cast_type", 'R', "-1", "Cast type for weight tensor, default to not set", "string"); return 0; } @@ -1049,7 +1132,6 @@ int ConvDriver::SetConvDescriptorFromCmdLineArgs() convDesc, spatial_dim, pads.data(), conv_strides.data(), conv_dilations.data(), mode); miopenSetConvolutionGroupCount(convDesc, group_count); - if(mode == miopenTranspose) { miopenSetTransposeConvNdOutputPadding(convDesc, spatial_dim, trans_output_pads.data()); @@ -1109,6 +1191,32 @@ void RanGenSubnormBuffer(T* buf, size_t size, int percentage) }); } +template <> +float8 RanGenWeights() +{ + const auto tmp = + prng::gen_0_to_B(1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (prng::gen_0_to_B(1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + +template <> +bfloat8 RanGenWeights() +{ + const auto tmp = + prng::gen_0_to_B(1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (prng::gen_0_to_B(1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + } // namespace detail template @@ -1135,11 +1243,14 @@ int ConvDriver::AllocateBuffersAndCopy() } } - bool is_transform = IsInputTensorTransform(); - bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; - size_t in_sz = GetTensorSize(inputTensor); - size_t wei_sz = GetTensorSize(weightTensor); - size_t out_sz = GetTensorSize(outputTensor); + bool is_transform = IsInputTensorTransform(); + bool is_int8 = data_type == miopenInt8 || data_type == miopenInt8x4; + // Data generated for very low precision types follows the same constraints whether its fp8, + // bfp8 or even if the interim tensors are being casted + bool is_fp8 = data_type == miopenFloat8 || data_type == miopenBFloat8 || TensorsCasted(); + size_t in_sz = GetTensorSize(inputTensor); + size_t wei_sz = GetTensorSize(weightTensor); + size_t out_sz = GetTensorSize(outputTensor); auto subnorm_percentage = miopen::Value(MIOPEN_DRIVER_SUBNORM_PERCENTAGE{}); // Workaround: Pad buffers allocations to be a multiple of 2M @@ -1338,7 +1449,10 @@ int ConvDriver::AllocateBuffersAndCopy() if(!weiFileName.empty()) weiRead = readBufferFromFile(wei.data.data(), wei_sz, weiFileName.c_str()); - const Tgpu Data_scale = is_int8 ? static_cast(127) : static_cast(0.01); + const Tgpu Data_scale = is_int8 ? static_cast(127) + : (is_fp8 ? static_cast(1.0) : static_cast(0.01)); + const Tgpu Data_min = (is_fp8 ? 
static_cast(-1.0) : static_cast(0.0)); + const Tgpu Data_max = (is_fp8 ? static_cast(1.0) : static_cast(1.0)); if(is_int8) { if(inflags.GetValueInt("bias") != 0) @@ -1361,6 +1475,7 @@ int ConvDriver::AllocateBuffersAndCopy() } else { + bool doutRead = false; if(is_bwd || is_wrw) if(!doutFileName.empty()) @@ -1375,7 +1490,8 @@ int ConvDriver::AllocateBuffersAndCopy() /// initialization of input buffers regardless of which kinds of /// convolutions are currently selectedfor testing (see the "-F" option). /// Verification cache would be broken otherwise. - auto val = prng::gen_0_to_B(Data_scale); + auto val = + is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) : prng::gen_0_to_B(Data_scale); if(is_bwd || is_wrw) dout.data[i] = val; } @@ -1394,8 +1510,11 @@ int ConvDriver::AllocateBuffersAndCopy() db_host = tensor(miopen::deref(biasTensor)); for(int i = 0; i < b_sz; i++) { - b.data[i] = static_cast(i % 8) + prng::gen_canonical(); - db[i] = static_cast(i % 8) + prng::gen_canonical(); + b.data[i] = + static_cast(i % 8) + + (is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) : prng::gen_canonical()); + db[i] = static_cast(i % 8) + (is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) + : prng::gen_canonical()); } if(!biasFileName.empty()) @@ -1413,7 +1532,7 @@ int ConvDriver::AllocateBuffersAndCopy() for(int i = 0; i < in_sz; i++) { /// \ref move_rand - auto val = prng::gen_0_to_B(Data_scale); + auto val = is_fp8 ? prng::gen_A_to_B(Data_min, Data_max) : prng::gen_0_to_B(Data_scale); if(is_fwd || is_wrw) in.data[i] = val; } @@ -1475,7 +1594,12 @@ int ConvDriver::AllocateBuffersAndCopy() } if(is_fwd) { - out_dev = std::unique_ptr( + // TODO: For the temporary conversion to half, this is required, however, that would also + // need change elsewhere which has not yet been implemented out_dev = + // std::unique_ptr(new GPUMem( + // ctx, out_sz, is_int8 ? sizeof(float) : (is_fp8 ? sizeof(half) : sizeof(Tgpu)))); + std::ignore = is_fp8; + out_dev = std::unique_ptr( new GPUMem(ctx, out_sz, is_int8 ? sizeof(float) : sizeof(Tgpu))); status |= (is_int8 ? out_dev->ToGPU(q, out_int8.data()) : out_dev->ToGPU(q, out.data.data())); @@ -1496,7 +1620,8 @@ bool ConvDriver::UseGPUReference() { if((miopen_type{} == miopenFloat && (miopen_type{} == miopenFloat || miopen_type{} == miopenHalf || - miopen_type{} == miopenBFloat16)) || + miopen_type{} == miopenBFloat16 || miopen_type{} == miopenFloat8 || + miopen_type{} == miopenBFloat8)) || (miopen_type{} == miopenInt32 && miopen_type{} == miopenInt8)) return true; else @@ -3394,8 +3519,8 @@ int ConvDriver::VerifyBackward() else { std::cout << "Backward Convolution Data Verifies OK on " - << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error_data << ')' - << std::endl; + << (UseGPUReference() ? "GPU" : "CPU") << " reference (" << error_data + << " < " << tolerance << ')' << std::endl; } } @@ -3429,6 +3554,9 @@ int ConvDriver::VerifyBackward() else if(std::is_same::value) tolerance *= 5; } + // bfloat8 has very poor accuracy in wrw direction + if(std::is_same::value) + tolerance = tolerance * 2; auto error_weights = is_wrw_run_failed ? std::numeric_limits::max() : miopen::rms_range(dwei_host.data, dwei); @@ -3443,7 +3571,7 @@ int ConvDriver::VerifyBackward() { std::cout << "Backward Convolution Weights Verifies OK on " << (UseGPUReference() ? 
"GPU" : "CPU") << " reference (" << error_weights - << ')' << std::endl; + << " < " << tolerance << ')' << std::endl; } } diff --git a/driver/driver.hpp b/driver/driver.hpp index 0760a749e7..8e15894705 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -34,8 +34,6 @@ #include "random.hpp" -using float16 = half_float::half; - #include "InputFlags.hpp" #include #include @@ -44,6 +42,12 @@ using float16 = half_float::half; #include #include #include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +using float16 = half_float::half; +using float8 = miopen_f8::hip_f8; +using bfloat8 = miopen_f8::hip_f8; #include #include @@ -143,7 +147,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[gnu::noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16], CBAInfer[fp16], " + printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], " "tensorop[fp16], reduce[fp16,fp64]\n"); @@ -161,13 +165,13 @@ inline std::string ParseBaseArg(int argc, char* argv[]) std::string arg = argv[1]; if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && - arg != "CBAInfer" && arg != "CBAInferfp16" && arg != "pool" && arg != "poolfp16" && - arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && - arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && arg != "bnormfp16" && - arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && - arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" && arg != "dropout" && - arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && arg != "reduce" && - arg != "reducefp16" && arg != "reducefp64" && arg != "--version") + arg != "convfp8" && arg != "convbfp8" && arg != "CBAInfer" && arg != "CBAInferfp16" && + arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && + arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && + arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && + arg != "rnn_seqfp16" && arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" && + arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && + arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); @@ -249,6 +253,16 @@ inline void Driver::InitDataType() { data_type = miopenBFloat16; } +template <> +inline void Driver::InitDataType() +{ + data_type = miopenFloat8; +} +template <> +inline void Driver::InitDataType() +{ + data_type = miopenBFloat8; +} // "std::is_same{}" used to avoid "static_assert" compilation error, // which occurs when the condition does not depend in any way on the template parameters. 
template diff --git a/driver/main.cpp b/driver/main.cpp index 6db3952c12..abdefc34a3 100644 --- a/driver/main.cpp +++ b/driver/main.cpp @@ -81,6 +81,14 @@ int main(int argc, char* argv[]) { drv = new ConvDriver(); } + else if(base_arg == "convfp8") + { + drv = new ConvDriver(); + } + else if(base_arg == "convbfp8") + { + drv = new ConvDriver(); + } else if(base_arg == "CBAInfer") { drv = new CBAInferFusionDriver(); diff --git a/include/miopen/config.h.in b/include/miopen/config.h.in index 67edaeb284..74b0185fde 100644 --- a/include/miopen/config.h.in +++ b/include/miopen/config.h.in @@ -74,6 +74,8 @@ // remain in the future) perform final conversion (and rounding) of FP32 // to BF16 results. This affects the main functionality of the library. #cmakedefine01 MIOPEN_USE_RNE_BFLOAT16 +#cmakedefine01 MIOPEN_FP8_IEEE_EXPONENT_BIAS +#cmakedefine01 MIOPEN_FP8_CLIPPING // clang-format off #cmakedefine MIOPEN_DEFAULT_FIND_MODE @MIOPEN_DEFAULT_FIND_MODE@ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 7bc268620b..fcbc60a0b2 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -111,6 +111,12 @@ typedef enum miopenStatusVersionMismatch = 10, /*!< Version mismatch of the supplied binary data argment. */ } miopenStatus_t; +typedef enum +{ + miopenF8RoundingModeStandard = 0, + miopenF8RoundingModeStochastic = 1, +} miopenF8RoundingMode_t; + /*! @brief Get character string for an error code. * * A function which returns a NULL terminated character string of the error code. @@ -347,7 +353,9 @@ typedef enum 4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */ miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction) (Partially supported) */ - miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ + miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ + miopenFloat8 = 7, + miopenBFloat8 = 8 } miopenDataType_t; /*! @ingroup tensor @@ -581,6 +589,11 @@ typedef enum MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC = 1, /*!< Restrict MIOpen convolutions to kernels which produce numerically deterministic results. 0 - disabled (default), 1 - enabled >*/ + MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE = + 2, /*!*/ } miopenConvolutionAttrib_t; /** @addtogroup tensor @@ -698,7 +711,19 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t const int* dimsA, const int* stridesA); -/*! @brief Get size of N-dimensional tensor +/*! @brief Set the tensor cast type + * + * For tensors where the cast_type attribute is set, the tensor elements would be converted to the + * target type before the target operation is applied. Currently, only supported for convolution + * operations targeting the FP8 datatype + * + * @param tensorDesc Tensor descriptor type (input) + * @param cast_type MIOpen datatype (input) + */ +MIOPEN_EXPORT miopenStatus_t miopenSetTensorCastType(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t cast_type); + +/*! @brief Set shape of N-dimensional tensor * * Interface for querying tensor size. MIOpen has support for 1, 2, 3, 4, 5 dimensional tensor of * layout. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 95da2f345d..58ff101c33 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,10 +39,14 @@ endif() # remain in the future) perform final conversion (and rounding) of FP32 # to BF16 results. This affects the main functionality of the library. 
option( MIOPEN_USE_RNE_BFLOAT16 "Sets rounding scheme for bfloat16 type" ON )
+option( MIOPEN_FP8_IEEE_EXPONENT_BIAS "Sets the FP8 exponent bias to IEEE" ON)
+option( MIOPEN_FP8_CLIPPING "Sets the FP8 clipping" ON)
 set ( MIOPEN_DEFAULT_FIND_MODE "DynamicHybrid" CACHE STRING "Sets the default find mode")
 set_property(CACHE MIOPEN_DEFAULT_FIND_MODE PROPERTY STRINGS Normal Fast Hybrid FastHybrid DynamicHybrid)
 configure_file("${PROJECT_SOURCE_DIR}/include/miopen/config.h.in" "${PROJECT_BINARY_DIR}/include/miopen/config.h")
 # configure a header file to pass the CMake version settings to the source, and package the header files in the output archive
@@ -279,6 +283,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         ${COMPOSABLE_KERNEL_DYNAMIC_ASM_INCLUDE}
         ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_INCLUDE}
         include/miopen/implicitgemm_params.hpp
+        kernels/gpu_reference_kernel/fp8_kern_types.h
         kernels/Conv_Winograd_v13_3_12_fp16dot_stride1.inc
         kernels/Conv_Winograd_v13_3_12_fp16dot_stride2_dec.inc
         kernels/Conv_Winograd_v13_3_12_fp16dot_stride2_dil.inc
@@ -380,6 +385,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/bfloat16_dev.hpp
         kernels/float_types.h
         kernels/workaround_issue_1431.hpp
+        kernels/hip_f8_impl.hpp
+        kernels/hip_float8.hpp
         )
     set(MIOPEN_KERNELS
@@ -392,7 +399,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP}
         ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE}
         kernels/detect_llvm_amdgcn_buffer_atomic_fadd_f32_float.cpp
-        kernels/MIOpenCheckNumerics.cl
+        kernels/MIOpenCheckNumerics.cpp
         kernels/MIOpenBatchNormActivBwdPerAct.cl
         kernels/MIOpenBatchNormActivBwdSpatial.cl
         kernels/MIOpenBatchNormActivFwdTrainPerAct.cl
diff --git a/src/check_numerics.cpp b/src/check_numerics.cpp
index b7d6bc1eb8..b0882995e9 100644
--- a/src/check_numerics.cpp
+++ b/src/check_numerics.cpp
@@ -53,30 +53,41 @@ struct CheckNumericsResult
     int hasInf = 0;
 };
+std::string GetKernelName(miopenDataType_t data_type)
+{
+    switch(data_type)
+    {
+    case miopenFloat: return {"check_numerics_fp32"};
+    case miopenHalf: return {"check_numerics_fp16"};
+    case miopenBFloat16: return {"check_numerics_bf16"};
+    case miopenFloat8: return {"check_numerics_fp8"};
+    case miopenBFloat8: return {"check_numerics_bf8"};
+    case miopenInt32:
+    case miopenInt8:
+    case miopenInt8x4:
+    case miopenDouble:
+    default: return {""};
+    }
+}
+
 bool checkNumericsImpl(
     const Handle& handle, int mode, const TensorDescriptor& dDesc, ConstData_t data, bool isInput)
 {
     int numElements = dDesc.GetElementSize();
-
-    // TODO - some constants we should get from the device:
-    const int blockSize = 256;
-    const auto numBlocks = handle.GetMaxComputeUnits() * 6;
-    const size_t numGlobalWorkItems = blockSize * numBlocks;
-
-    const int computeStats = (mode & CheckNumerics::ComputeStats);
-
     CheckNumericsResult abnormal_h;
-
     auto abnormal_d = handle.Create(sizeof(CheckNumericsResult)); // TODO - someday avoid slow malloc/free here
     handle.WriteTo(&abnormal_h, abnormal_d, sizeof(CheckNumericsResult));
-
-    std::string params = GetDataTypeKernelParams(dDesc.GetType());
-    std::string program_name = "MIOpenCheckNumerics.cl";
-    std::string kernel_name = "MIOpenCheckNumerics";
-    const std::vector<size_t> vld = {size_t{blockSize}, size_t{1}, size_t{1}};
-    const std::vector<size_t> vgd = {numGlobalWorkItems, size_t{1}, size_t{1}};
-    handle.AddKernel("MIOpenCheckNumerics", "", program_name,
kernel_name, vld, vgd, params)( + const size_t threadsPerBlock = 256; + const size_t numBlocks = handle.GetMaxComputeUnits() * 6; + const int computeStats = (mode & CheckNumerics::ComputeStats); + // TODO - some constants we should get from the device: + std::string program_name = "MIOpenCheckNumerics.cpp"; + std::string kernel_name = GetKernelName(dDesc.GetType()); + const std::vector vld = {size_t{threadsPerBlock}, size_t{1}, size_t{1}}; + const std::vector vgd = {numBlocks, size_t{1}, size_t{1}}; + handle.AddKernel( + "MIOpenCheckNumerics", "MIOpenCheckNumerics", program_name, kernel_name, vld, vgd, "")( data, numElements, abnormal_d.get(), computeStats); handle.ReadTo(&abnormal_h, abnormal_d, sizeof(CheckNumericsResult)); diff --git a/src/conv/problem_description.cpp b/src/conv/problem_description.cpp index e44160b4d5..76c47cbcd9 100644 --- a/src/conv/problem_description.cpp +++ b/src/conv/problem_description.cpp @@ -224,7 +224,7 @@ bool ProblemDescription::IsNCHWc_CHWNc() const void ProblemDescription::SetupFloats(ExecutionContext& ctx) const { - if(IsFp32() || IsFp16() || IsBfp16() || IsInt8()) + if(IsFp32() || IsFp16() || IsBfp16() || IsInt8() || IsFp8() || IsBfp8()) { ctx.general_compile_options += GetDataTypeKernelParams(GetInDataType()); return; diff --git a/src/convolution.cpp b/src/convolution.cpp index cb6cde5eda..5f7539f70d 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -519,6 +519,17 @@ void ConvolutionAttribute::Set(miopenConvolutionAttrib_t attr, int value) std::to_string(value)); deterministic.value = value; } + else if(attr == MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE) + { + const auto rounding_mode = static_cast(value); + if(rounding_mode != miopenF8RoundingModeStochastic && + rounding_mode != miopenF8RoundingModeStandard) + MIOPEN_THROW(miopenStatusBadParm, + "[Set conv attribute] Error: Attempt to set invalid value for " + "MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE" + + std::to_string(value)); + fp8rounding_mode.rounding_mode = rounding_mode; + } else { MIOPEN_THROW(miopenStatusBadParm, @@ -531,6 +542,8 @@ int ConvolutionAttribute::Get(miopenConvolutionAttrib_t attr) const { if(attr == MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL) return gfx90aFp16alt.value; + else if(attr == MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE) + return static_cast(fp8rounding_mode.rounding_mode); else if(attr == MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC) return deterministic.value; MIOPEN_THROW(miopenStatusBadParm, diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index a75a58ee6b..804587aac0 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -35,6 +35,10 @@ #endif #if MIOPEN_USE_ROCBLAS +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-macros" +#define ROCBLAS_BETA_FEATURES_API 1 +#pragma clang diagnostic pop #if HIP_PACKAGE_VERSION_FLAT >= 5006000000ULL #include #else @@ -44,6 +48,7 @@ #include #else #include +#define USE_ROCBLAS_GEMM_EX3 ((MIOPEN_ROCBLAS_VERSION_FLAT >= 2047000) && ROCBLAS_BETA_FEATURES_API) #endif #include #endif @@ -68,14 +73,126 @@ /// Let's pass literal value as a workaround; there should be no harm. #define USE_GEMM_FLAGS_FP16_ALT_IMPL_242 (MIOPEN_ROCBLAS_VERSION_FLAT == 2042000) +static inline uint32_t +FlagsForRocblasFp32Fp16Call(const miopen::GemmDescriptor& desc) // bool gfx90aFp16Alt) +{ +#if USE_GEMM_FLAGS_FP16_ALT_IMPL + return desc.gfx90a_alt_impl ? rocblas_gemm_flags_fp16_alt_impl : 0; +#elif USE_GEMM_FLAGS_FP16_ALT_IMPL_242 + return desc.gfx90a_alt_impl ? 
0x4 : 0; +#else + std::ignore = desc; + MIOPEN_LOG_W("The gfx90aFp16Alt is not supported by rocBlas"); + return 0; +#endif +#if USE_GEMM_FLAGS_FP16_ALT_IMPL_242 // -warning: macro is not used +#endif +} + +#if USE_ROCBLAS_GEMM_EX3 +static inline rocblas_computetype rocBlasComputeType_ex3(const miopen::GemmDescriptor& desc) +{ + if(desc.a_cast_type == miopenFloat8 && desc.b_cast_type == miopenFloat8) + return rocblas_compute_type_f8_f8_f32; + else if(desc.a_cast_type == miopenFloat8 && desc.b_cast_type == miopenBFloat8) + return rocblas_compute_type_f8_bf8_f32; + else if(desc.a_cast_type == miopenBFloat8 && desc.b_cast_type == miopenFloat8) + return rocblas_compute_type_bf8_f8_f32; + else if(desc.a_cast_type == miopenBFloat8 && desc.b_cast_type == miopenBFloat8) + return rocblas_compute_type_bf8_bf8_f32; + else + return rocblas_compute_type_f32; +} +#endif + +static inline rocblas_datatype rocBlasComputeType(const miopen::GemmDescriptor& desc) +{ + // Complex compute types are only supported in newer version of the API + assert(desc.dataType == desc.a_cast_type && desc.dataType == desc.b_cast_type); + if(desc.dataType == miopenInt8 || desc.dataType == miopenInt8x4) + return rocblas_datatype::rocblas_datatype_i32_r; + else + return rocblas_datatype::rocblas_datatype_f32_r; +} + +auto rocBlasDataType(miopenDataType_t data_type) +{ + if(data_type == miopenFloat8) + return rocblas_datatype::rocblas_datatype_f8_r; + else if(data_type == miopenBFloat8) + return rocblas_datatype::rocblas_datatype_bf8_r; + else if(data_type == miopenHalf) + return rocblas_datatype::rocblas_datatype_f16_r; + MIOPEN_THROW(miopenStatusInternalError, "Invalid data type passed"); +} + +template +rocblas_status miopen_rocblas_gemm_ex3(const miopen::Handle& handle, + const miopen::GemmDescriptor& gemm_desc, + ConstData_t A, + int a_offset, + ConstData_t B, + int b_offset, + Data_t C, + int c_offset) +{ + rocblas_status rb_status = + rocblas_status::rocblas_status_internal_error; // cppcheck-suppress redundantInitialization +#if USE_ROCBLAS_GEMM_EX3 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdisabled-macro-expansion" +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; + auto flags = FlagsForRocblasFp32Fp16Call(gemm_desc); + if(gemm_desc.conv_attributes.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic) + flags = flags | rocblas_gemm_flags::rocblas_gemm_flags_stochastic_rounding; + + rb_status = // cppcheck-suppress redundantInitialization + rocblas_gemm_ex3(handle.rhandle().get(), + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.lda, + static_cast(B) + b_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.ldb, + &beta, + static_cast(C) + c_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.ldc, + static_cast(C) + c_offset, + rocBlasDataType(gemm_desc.dataType), + gemm_desc.ldc, + rocBlasComputeType_ex3(gemm_desc), + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + flags); // gfx90a_alt_impl)); +#pragma clang diagnostic pop +#endif + MIOPEN_THROW(miopenStatusBadParm, "An appropriate version of rocBLAS is required for this op"); + std::ignore = handle; + std::ignore = gemm_desc; + return rb_status; +} + template -auto miopen_rocblas_gemm_ex(Ts... 
xs) +auto miopen_rocblas_gemm_ex(const miopen::Handle& handle, + const miopen::GemmDescriptor& gemm_desc, + Ts... xs) { + std::ignore = handle; + std::ignore = gemm_desc; #if AVOID_ROCBLAS_WRAPPERS_204 - return (rocblas_gemm_ex)(xs...); + return (rocblas_gemm_ex)(handle.rhandle().get(), xs...); #else std::size_t zero = 0; - return rocblas_gemm_ex(xs..., &zero, nullptr); + return rocblas_gemm_ex(handle.rhandle().get(), xs..., &zero, nullptr); #endif } @@ -90,6 +207,33 @@ auto miopen_rocblas_gemm_strided_batched_ex(Ts... xs) #endif } +template +rocblas_status miopen_rocblas_gemm_strided_batched_ex3(const miopen::Handle& handle, + const miopen::GemmDescriptor& gemm_desc, + ConstData_t A, + int a_offset, + ConstData_t B, + int b_offset, + Data_t C, + int c_offset) +{ + rocblas_status rb_status = rocblas_status::rocblas_status_internal_error; + // Until there is a batched counter part to the ex3 rocBlas call we need to iterate over the + // batched GEMM + for(int bCount = 0; bCount < gemm_desc.batch_count; ++bCount) + { + rb_status = miopen_rocblas_gemm_ex3(handle, + gemm_desc, + A, + a_offset + (bCount * gemm_desc.strideA), + B, + b_offset + (bCount * gemm_desc.strideB), + C, + c_offset + (bCount * gemm_desc.strideC)); + } + return rb_status; +} + #endif // MIOPEN_USE_ROCBLAS MIOPEN_DECLARE_ENV_VAR(MIOPEN_GEMM_ENFORCE_BACKEND) @@ -114,7 +258,10 @@ std::ostream& operator<<(std::ostream& stream, const GemmDescriptor& gemm_desc) << "strideC " << gemm_desc.strideC << ", " << "alpha " << gemm_desc.alpha << ", " << "beta " << gemm_desc.beta << ", " - << "dataType " << gemm_desc.dataType << "} "; + << "dataType " << gemm_desc.dataType << "a_cast_type" << gemm_desc.a_cast_type + << ", " + << "b_cast_type" << gemm_desc.b_cast_type << ", " + << "} "; } #if MIOPEN_USE_ROCBLAS @@ -207,8 +354,7 @@ miopenStatus_t CallGemmTimeMeasure(const Handle& handle, int c_offset, bool time_precision, CallGemmType_t call_gemm_type, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { switch(call_gemm_type) { @@ -216,103 +362,37 @@ miopenStatus_t CallGemmTimeMeasure(const Handle& handle, if(time_precision) { // rocBLAS need a warm-up call for accurate timing - CallGemm(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + CallGemm(handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } - return CallGemm(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + return CallGemm(handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } case callGemmStridedBatched: { if(time_precision) { // rocBLAS need extra warm-up call for accurate timing - CallGemmStridedBatched(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + CallGemmStridedBatched( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } - return CallGemmStridedBatched(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + return CallGemmStridedBatched( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } case callGemmStridedBatchedSequential: { if(time_precision) { // rocBLAS need a warm-up call for accurate timing - CallGemmStridedBatchedSequential(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + CallGemmStridedBatchedSequential( + handle, gemm_desc, A, a_offset, B, b_offset, 
C, c_offset, gemm_backend); } - return CallGemmStridedBatchedSequential(handle, - gemm_desc, - A, - a_offset, - B, - b_offset, - C, - c_offset, - gemm_backend, - gfx90a_alt_impl); + return CallGemmStridedBatchedSequential( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset, gemm_backend); } } return miopenStatusNotImplemented; } -#if MIOPEN_USE_ROCBLAS -static inline uint32_t FlagsForRocblasFp32Fp16Call(const bool gfx90aFp16Alt) -{ -#if USE_GEMM_FLAGS_FP16_ALT_IMPL - return gfx90aFp16Alt ? rocblas_gemm_flags_fp16_alt_impl : 0; -#elif USE_GEMM_FLAGS_FP16_ALT_IMPL_242 - return gfx90aFp16Alt ? 0x4 : 0; -#else - std::ignore = gfx90aFp16Alt; - return 0; -#endif -#if USE_GEMM_FLAGS_FP16_ALT_IMPL_242 // -warning: macro is not used -#endif -} -#endif // MIOPEN_USE_ROCBLAS - miopenStatus_t CallGemm(const Handle& handle, GemmDescriptor gemm_desc, ConstData_t A, @@ -321,8 +401,7 @@ miopenStatus_t CallGemm(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { MIOPEN_LOG_I2("gemm_desc: " << gemm_desc); @@ -368,7 +447,8 @@ miopenStatus_t CallGemm(const Handle& handle, auto beta = int(gemm_desc.beta); rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -388,7 +468,7 @@ miopenStatus_t CallGemm(const Handle& handle, static_cast(C) + c_offset, rocblas_datatype::rocblas_datatype_i32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_i32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, #if USE_GEMM_FLAGS_PACK_INT8X4 @@ -401,35 +481,61 @@ miopenStatus_t CallGemm(const Handle& handle, break; case miopenInt32: break; case miopenHalf: { - - float alpha = gemm_desc.alpha; - float beta = gemm_desc.beta; - - rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), - gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.m, - gemm_desc.n, - gemm_desc.k, - &alpha, - static_cast(A) + a_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.lda, - static_cast(B) + b_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldb, - &beta, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, - rocblas_gemm_algo::rocblas_gemm_algo_standard, - 0, - FlagsForRocblasFp32Fp16Call(gfx90a_alt_impl)); + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + // We need ex3 API if any of the dataType or the cast type is an 8-bit floating type + const auto needs_ex3 = [&]() { + if((gemm_desc.dataType == miopenFloat8 || gemm_desc.dataType == miopenBFloat8) || + (gemm_desc.a_cast_type == miopenFloat8 || + gemm_desc.a_cast_type == miopenBFloat8) || + (gemm_desc.b_cast_type == miopenBFloat8 || + gemm_desc.b_cast_type == miopenFloat8)) + return true; + else + return false; + }(); + // ex3 API only works on the gfx94x ASIC; + if(needs_ex3) + { + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + } + else + { + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; + rb_status = miopen_rocblas_gemm_ex( + handle, + gemm_desc, + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.lda, + static_cast(B) + b_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldb, + &beta, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + rocBlasComputeType(gemm_desc), + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + FlagsForRocblasFp32Fp16Call(gemm_desc)); // gfx90a_alt_impl)); + } } break; @@ -439,7 +545,8 @@ miopenStatus_t CallGemm(const Handle& handle, float beta = gemm_desc.beta; rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -459,7 +566,7 @@ miopenStatus_t CallGemm(const Handle& handle, static_cast(C) + c_offset, rocblas_datatype::rocblas_datatype_bf16_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); @@ -471,7 +578,8 @@ miopenStatus_t CallGemm(const Handle& handle, float beta = gemm_desc.beta; rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -491,13 +599,27 @@ miopenStatus_t CallGemm(const Handle& handle, static_cast(C) + c_offset, rocblas_datatype::rocblas_datatype_f32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); } break; + case miopenFloat8: + case miopenBFloat8: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + }; + break; + case miopenDouble: { MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM."); @@ -531,8 +653,7 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { MIOPEN_LOG_I2("gemm_desc: " << gemm_desc); @@ -560,7 +681,6 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, HipEventPtr stop = nullptr; if(handle.IsProfilingEnabled()) { - ProfilingRecordStart(handle, start, stop); } rocblas_atomics_mode cur_mode = @@ -619,40 +739,67 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, case miopenInt32: break; case miopenHalf: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + // We need ex3 API if any of the dataType or the cast type is an 8-bit floating type + const auto needs_ex3 = [&]() { + if((gemm_desc.dataType == miopenFloat8 || gemm_desc.dataType == miopenBFloat8) || + (gemm_desc.a_cast_type == miopenFloat8 || + gemm_desc.a_cast_type == miopenBFloat8) || + (gemm_desc.b_cast_type == miopenBFloat8 || + gemm_desc.b_cast_type == miopenFloat8)) + return true; + else + return false; + }(); + // ex3 API only works on the gfx94x ASIC; + if(needs_ex3) + { + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + } + else + { - float alpha = gemm_desc.alpha; - float beta = gemm_desc.beta; + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; - rb_status = miopen_rocblas_gemm_strided_batched_ex( - handle.rhandle().get(), - gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.m, - gemm_desc.n, - gemm_desc.k, - &alpha, - static_cast(A) + a_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.lda, - gemm_desc.strideA, - static_cast(B) + b_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldb, - gemm_desc.strideB, - &beta, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - gemm_desc.strideC, - static_cast(C) + c_offset, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - gemm_desc.strideC, - gemm_desc.batch_count, - rocblas_datatype::rocblas_datatype_f32_r, - rocblas_gemm_algo::rocblas_gemm_algo_standard, - 0, - FlagsForRocblasFp32Fp16Call(gfx90a_alt_impl)); + rb_status = miopen_rocblas_gemm_strided_batched_ex( + handle.rhandle().get(), + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.lda, + gemm_desc.strideA, + static_cast(B) + b_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldb, + gemm_desc.strideB, + &beta, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + gemm_desc.strideC, + static_cast(C) + c_offset, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + gemm_desc.strideC, + gemm_desc.batch_count, + rocblas_datatype::rocblas_datatype_f32_r, + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + FlagsForRocblasFp32Fp16Call(gemm_desc)); + } } break; @@ -730,6 +877,21 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, } break; + case miopenFloat8: + case miopenBFloat8: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + + break; + } + case miopenDouble: { MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM."); @@ -764,8 +926,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend, - bool gfx90a_alt_impl) + GemmBackend_t gemm_backend) { MIOPEN_LOG_I2("gemm_desc: " << gemm_desc); @@ -816,7 +977,8 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, for(int i = 0; i < gemm_desc.batch_count; ++i) { rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -836,7 +998,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, static_cast(C) + c_offset + i * gemm_desc.strideC, rocblas_datatype::rocblas_datatype_i32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_i32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, #if USE_GEMM_FLAGS_PACK_INT8X4 @@ -850,37 +1012,65 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, break; case miopenInt32: break; case miopenHalf: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + // We need ex3 API if any of the dataType or the cast type is an 8-bit floating type + const auto needs_ex3 = [&]() { + if((gemm_desc.dataType == miopenFloat8 || gemm_desc.dataType == miopenBFloat8) || + (gemm_desc.a_cast_type == miopenFloat8 || + gemm_desc.a_cast_type == miopenBFloat8) || + (gemm_desc.b_cast_type == miopenBFloat8 || + gemm_desc.b_cast_type == miopenFloat8)) + return true; + else + return false; + }(); + // ex3 API only works on the gfx94x ASIC; + if(needs_ex3) + { + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + } + else + { - float alpha = gemm_desc.alpha; - float beta = gemm_desc.beta; + float alpha = gemm_desc.alpha; + float beta = gemm_desc.beta; - for(int i = 0; i < gemm_desc.batch_count; ++i) - { - rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), - gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, - gemm_desc.m, - gemm_desc.n, - gemm_desc.k, - &alpha, - static_cast(A) + a_offset + i * gemm_desc.strideA, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.lda, - static_cast(B) + b_offset + i * gemm_desc.strideB, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldb, - &beta, - static_cast(C) + c_offset + i * gemm_desc.strideC, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - static_cast(C) + c_offset + i * gemm_desc.strideC, - rocblas_datatype::rocblas_datatype_f16_r, - gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, - rocblas_gemm_algo::rocblas_gemm_algo_standard, - 0, - FlagsForRocblasFp32Fp16Call(gfx90a_alt_impl)); + for(int i = 0; i < gemm_desc.batch_count; ++i) + { + rb_status = miopen_rocblas_gemm_ex( + handle, + gemm_desc, + gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.transB ? 
rocblas_operation_transpose : rocblas_operation_none, + gemm_desc.m, + gemm_desc.n, + gemm_desc.k, + &alpha, + static_cast(A) + a_offset + i * gemm_desc.strideA, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.lda, + static_cast(B) + b_offset + i * gemm_desc.strideB, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldb, + &beta, + static_cast(C) + c_offset + i * gemm_desc.strideC, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + static_cast(C) + c_offset + i * gemm_desc.strideC, + rocblas_datatype::rocblas_datatype_f16_r, + gemm_desc.ldc, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, + rocblas_gemm_algo::rocblas_gemm_algo_standard, + 0, + FlagsForRocblasFp32Fp16Call(gemm_desc)); + } } } break; @@ -892,7 +1082,8 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, for(int i = 0; i < gemm_desc.batch_count; ++i) { rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -912,7 +1103,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, static_cast(C) + c_offset + i * gemm_desc.strideC, rocblas_datatype::rocblas_datatype_bf16_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); @@ -927,7 +1118,8 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, for(int i = 0; i < gemm_desc.batch_count; ++i) { rb_status = miopen_rocblas_gemm_ex( - handle.rhandle().get(), + handle, + gemm_desc, gemm_desc.transA ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.transB ? rocblas_operation_transpose : rocblas_operation_none, gemm_desc.m, @@ -947,7 +1139,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, static_cast(C) + c_offset + i * gemm_desc.strideC, rocblas_datatype::rocblas_datatype_f32_r, gemm_desc.ldc, - rocblas_datatype::rocblas_datatype_f32_r, + rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_f32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, 0); @@ -955,6 +1147,21 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, } break; + case miopenFloat8: + case miopenBFloat8: { + const auto is_gfx94x = miopen::StartsWith(handle.GetDeviceName(), "gfx94"); + if(is_gfx94x) + { + rb_status = miopen_rocblas_gemm_strided_batched_ex3( + handle, gemm_desc, A, a_offset, B, b_offset, C, c_offset); + } + else + MIOPEN_THROW(miopenStatusBadParm, + "8-bit floating types are only supported on gfx94x"); + + break; + } + case miopenDouble: { MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM."); diff --git a/src/hipoc/hipoc_program.cpp b/src/hipoc/hipoc_program.cpp index eea3b32291..8a87d0d3f7 100644 --- a/src/hipoc/hipoc_program.cpp +++ b/src/hipoc/hipoc_program.cpp @@ -321,23 +321,20 @@ void HIPOCProgramImpl::BuildCodeObject(std::string params, return GetKernelSrc(program); }(); +#if MIOPEN_BUILD_DEV if(miopen::EndsWith(filename, ".cpp")) { -#if MIOPEN_BUILD_DEV params += " -Werror" + HipKernelWarningsString(); -#else - params += " -Wno-everything"; -#endif } else if(miopen::EndsWith(filename, ".cl")) { -#if MIOPEN_BUILD_DEV params += " -Werror" + (is_kernel_str ? 
MiopengemmWarningsString() : OclKernelWarningsString());
+    }
 #else
+    if(miopen::EndsWith(filename, ".cpp") || miopen::EndsWith(filename, ".cl"))
         params += " -Wno-everything";
 #endif
-    }
 #if MIOPEN_USE_COMGR
     /// \todo Refactor when functionality stabilizes.
     BuildCodeObjectInMemory(params, src, filename);
diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index c458f3421d..438e1d306a 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -55,6 +55,8 @@ inline std::string GetDataTypeName(miopenDataType_t data_type)
     case miopenInt32: return "INT32";
     case miopenBFloat16: return "BF16";
     case miopenDouble: return "FP64";
+    case miopenFloat8: return "FP8";
+    case miopenBFloat8: return "BFP8";
     }
     return "Unknown(" + std::to_string(data_type) + ")";
@@ -190,6 +192,7 @@ struct ProblemDescription : ProblemDescriptionBase
     // In getters
     miopenDataType_t GetInDataType() const { return in.GetType(); }
+    std::optional<miopenDataType_t> GetInCastType() const { return in.GetCastType(); }
     unsigned GetInBatchSize_() const { return GetN5(GetSpatialDims(), in.GetLengths()); }
     unsigned GetBatchSize_() const { return GetInBatchSize_(); } // alias of GetInBatchSize_()
     unsigned GetInChannels_() const { return GetC5(GetSpatialDims(), in.GetLengths()); }
@@ -223,6 +226,7 @@ struct ProblemDescription : ProblemDescriptionBase
     // Out getters
     miopenDataType_t GetOutDataType() const { return out.GetType(); }
+    std::optional<miopenDataType_t> GetOutCastType() const { return out.GetCastType(); }
     unsigned GetOutBatchSize_() const { return GetN5(GetSpatialDims(), out.GetLengths()); }
     unsigned GetOutChannels_() const { return GetC5(GetSpatialDims(), out.GetLengths()); }
     unsigned GetOutDepth_() const { return GetD5(GetSpatialDims(), out.GetLengths()); }
@@ -255,6 +259,7 @@ struct ProblemDescription : ProblemDescriptionBase
     // Weights getters
     miopenDataType_t GetWeightsDataType() const { return weights.GetType(); }
+    std::optional<miopenDataType_t> GetWeightsCastType() const { return weights.GetCastType(); }
     unsigned GetWeightsDepth_() const { return GetD5(GetSpatialDims(), weights.GetLengths()); }
     unsigned GetWeightsHeight_() const
     {
@@ -343,6 +348,20 @@ struct ProblemDescription : ProblemDescriptionBase
         return GetInDataType() == miopenInt8 && GetWeightsDataType() == miopenInt8 &&
                (GetOutDataType() == miopenInt32 || GetOutDataType() == miopenFloat);
     }
+    bool IsFp8() const
+    {
+        return GetInDataType() == miopenFloat8 || GetWeightsDataType() == miopenFloat8 ||
+               GetOutDataType() == miopenFloat8;
+    }
+    bool IsBfp8() const
+    {
+        return GetInDataType() == miopenBFloat8 || GetWeightsDataType() == miopenBFloat8 ||
+               GetOutDataType() == miopenBFloat8;
+    }
+    bool IsTensorsCasted() const
+    {
+        return GetInCastType() || GetWeightsCastType() || GetOutCastType();
+    }
     // To be used in Solvers that do not implement ALT FP16 kernels.
     // Those Solvers must be non-applicable for gfx90a when this function returns true.
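[Editor's note] The three predicates added above (IsFp8, IsBfp8, IsTensorsCasted) are what the dozens of small per-solver hunks in this patch key off: solvers with no 8-bit-float kernels simply declare themselves non-applicable. A minimal sketch of that guard pattern follows; it is a free-function illustration under assumed naming, not the literal text of any solver hunk in this patch:

    // Sketch only: in the patch this logic lives inside each solver's
    // IsApplicable(); "SolverSupportsProblem" is a placeholder name.
    bool SolverSupportsProblem(const miopen::conv::ProblemDescription& problem)
    {
        // FP8/BFP8 problems, and problems whose tensors carry a cast type,
        // are served by the naive reference kernels and the GEMM path, so a
        // legacy solver bails out early.
        return !(problem.IsFp8() || problem.IsBfp8() || problem.IsTensorsCasted());
    }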
diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp
index c4b5b7ea46..5e0507ddb6 100644
--- a/src/include/miopen/convolution.hpp
+++ b/src/include/miopen/convolution.hpp
@@ -45,9 +45,12 @@
 #include
 #include
 #include
+#include

 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL)
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_DETERMINISTIC)
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE)
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED)

 namespace miopen {

@@ -77,6 +80,9 @@ using ExtraKernelArgs = std::tuple;

+struct ConvFwdTensors;
+struct ConvWrwTensors;
+
 struct ConvolutionAttribute
 {
     class Gfx90aFp16alt
@@ -100,6 +106,38 @@ struct ConvolutionAttribute
         friend void from_json(const nlohmann::json& json, Gfx90aFp16alt& attribute);
     } gfx90aFp16alt;

+    struct FP8RoundingMode
+    {
+        inline uint32_t InitSeed()
+        {
+            std::random_device rd;
+            std::mt19937 gen(rd());
+            std::uniform_int_distribution distribution(0, 0xFFFFFFFF);
+            return distribution(gen);
+        }
+        miopenF8RoundingMode_t rounding_mode = miopenF8RoundingModeStochastic;
+        uint32_t seed = InitSeed();
+        friend struct ConvolutionAttribute;
+
+        inline miopenF8RoundingMode_t Get() const
+        {
+            if(nullptr != miopen::GetStringEnv(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE{}))
+                return static_cast(
+                    miopen::Value(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE{}));
+            return rounding_mode;
+        }
+
+        inline uint32_t GetSeed() const
+        {
+            // assert(rounding_mode == miopenF8RoundingModeStochastic);
+            if(nullptr != miopen::GetStringEnv(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED{}))
+                return miopen::Value(MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP8_ROUNDING_SEED{});
+            return seed;
+        }
+
+        inline void SetSeed(const uint32_t s) { seed = s; }
+    } fp8rounding_mode;
+
     class Deterministic
     {
         int value = 0;
diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 2a0cb48ef4..485bdb3d67 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -66,6 +66,14 @@ inline std::string GetDataType(miopenDataType_t type)
         type_str = "double";
     }
     break;
+    case miopenFloat8: {
+        type_str = "float8";
+    }
+    break;
+    case miopenBFloat8: {
+        type_str = "bfloat8";
+    }
+    break;
     }
     return type_str;
 }
@@ -133,6 +141,8 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     int use_int32 = 0;
     int use_bfp16 = 0;
     int use_fp64 = 0;
+    int use_fp8 = 0;
+    int use_bfp8 = 0;
     const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16;

     switch(type)
@@ -144,8 +154,11 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     case miopenBFloat16: use_bfp16 = 1; break;
     case miopenInt32: use_int32 = 1; break;
     case miopenDouble: use_fp64 = 1; break;
+    case miopenFloat8: use_fp8 = 1; break;
+    case miopenBFloat8: use_bfp8 = 1; break;
     default:
-        MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
+        MIOPEN_THROW(
+            "Only float, half, bfloat16, int8, int8x4, float8, bfloat8 data type is supported.");
         break;
     }

@@ -159,9 +172,15 @@
         {"MIOPEN_USE_BFP16", use_bfp16},
         {"MIOPEN_USE_INT32", use_int32},
         {"MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16},
+        {"MIOPEN_FP8_IEEE_EXPONENT_BIAS", MIOPEN_FP8_IEEE_EXPONENT_BIAS},
+        {"MIOPEN_FP8_CLIPPING", MIOPEN_FP8_CLIPPING},
     };
     if(use_fp64 != 0)
         kbp.Define("MIOPEN_USE_FP64", use_fp64);
+    if(use_fp8 != 0)
+        kbp.Define("MIOPEN_USE_FP8", use_fp8);
+    if(use_bfp8 != 0)
+        kbp.Define("MIOPEN_USE_BFP8",
use_bfp8); return kbp; } diff --git a/src/include/miopen/gemm_v2.hpp b/src/include/miopen/gemm_v2.hpp index f206771c89..9300ffa29b 100644 --- a/src/include/miopen/gemm_v2.hpp +++ b/src/include/miopen/gemm_v2.hpp @@ -27,6 +27,7 @@ #define GUARD_MIOPEN_GEMM_V2_HPP_ #include +#include #include namespace miopen { @@ -72,6 +73,50 @@ struct GemmDescriptor float alpha, beta; miopenDataType_t dataType; bool deterministic; + bool gfx90a_alt_impl; + miopenDataType_t a_cast_type; + miopenDataType_t b_cast_type; + ConvolutionAttribute conv_attributes; + GemmDescriptor() {} + GemmDescriptor(bool isColMajor_, + bool transA_, + bool transB_, + int m_, + int n_, + int k_, + int lda_, + int ldb_, + int ldc_, + int batch_count_, + long long int strideA_, + long long int strideB_, + long long int strideC_, + float alpha_, + float beta_, + miopenDataType_t dataType_, + bool deterministic_) + : isColMajor(isColMajor_), + transA(transA_), + transB(transB_), + m(m_), + n(n_), + k(k_), + lda(lda_), + ldb(ldb_), + ldc(ldc_), + batch_count(batch_count_), + strideA(strideA_), + strideB(strideB_), + strideC(strideC_), + alpha(alpha_), + beta(beta_), + dataType(dataType_), + deterministic(deterministic_), + gfx90a_alt_impl(false), + a_cast_type(dataType), + b_cast_type(dataType) + { + } friend std::ostream& operator<<(std::ostream& stream, const GemmDescriptor& gemm_desc); }; @@ -86,8 +131,7 @@ miopenStatus_t CallGemmTimeMeasure(const Handle& handle, int c_offset, bool time_precision, CallGemmType_t call_gemm_type, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); miopenStatus_t CallGemm(const Handle& handle, GemmDescriptor gemm_desc, @@ -97,8 +141,7 @@ miopenStatus_t CallGemm(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); miopenStatus_t CallGemmStridedBatched(const Handle& handle, GemmDescriptor gemm_desc, @@ -108,19 +151,18 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, int b_offset, Data_t C, int c_offset, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); - -miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, - GemmDescriptor gemm_desc, - ConstData_t A, - int a_offset, - ConstData_t B, - int b_offset, - Data_t C, - int c_offset, - GemmBackend_t gemm_backend = GemmBackend_t::rocblas, - bool gfx90a_alt_impl = false); + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); + +miopenStatus_t +CallGemmStridedBatchedSequential(const Handle& handle, + GemmDescriptor gemm_desc, + ConstData_t A, + int a_offset, + ConstData_t B, + int b_offset, + Data_t C, + int c_offset, + GemmBackend_t gemm_backend = GemmBackend_t::rocblas); // GEMM parameters for Convolution (using Im2Col) Fwd // y = w * Im2Col(x) diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp index ee26d7985f..7d1bb79a37 100644 --- a/src/include/miopen/handle.hpp +++ b/src/include/miopen/handle.hpp @@ -51,6 +51,10 @@ #include #if MIOPEN_USE_ROCBLAS +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-macros" +#define ROCBLAS_BETA_FEATURES_API 1 +#pragma clang diagnostic pop #include #if MIOPEN_ROCBLAS_VERSION_FLAT < 2045000 #include diff --git a/src/include/miopen/hip_f8_impl.hpp b/src/include/miopen/hip_f8_impl.hpp new file mode 120000 index 0000000000..22052778a0 --- /dev/null +++ 
b/src/include/miopen/hip_f8_impl.hpp @@ -0,0 +1 @@ +../../kernels/hip_f8_impl.hpp \ No newline at end of file diff --git a/src/include/miopen/hip_float8.hpp b/src/include/miopen/hip_float8.hpp new file mode 120000 index 0000000000..5e16a70c91 --- /dev/null +++ b/src/include/miopen/hip_float8.hpp @@ -0,0 +1 @@ +../../kernels/hip_float8.hpp \ No newline at end of file diff --git a/src/include/miopen/problem_description.hpp b/src/include/miopen/problem_description.hpp index bc781d4b1e..b8e70db5ff 100644 --- a/src/include/miopen/problem_description.hpp +++ b/src/include/miopen/problem_description.hpp @@ -198,6 +198,7 @@ struct ProblemDescriptionCompatTemporary /* * set bot tensor */ + void setBotDescr(const std::string& layout, miopenDataType_t data_type, int batch, diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index e5ed014831..f05bbdf712 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -34,8 +34,10 @@ namespace solver { bool ConvDirectNaiveConvIsAssemblyKernel(const ExecutionContext&, const ProblemDescription&); std::string ConvDirectNaiveConvKernelName(const ProblemDescription&); -std::string ConvDirectNaiveConvKernelFile(); -std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx); +std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx, + const ProblemDescription& problem); +std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx, + const ProblemDescription& problem); bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext&, const ProblemDescription&); diff --git a/src/include/miopen/solver/problem_description_interpreter.hpp b/src/include/miopen/solver/problem_description_interpreter.hpp index 3e9e7fb3de..0690d3d36e 100644 --- a/src/include/miopen/solver/problem_description_interpreter.hpp +++ b/src/include/miopen/solver/problem_description_interpreter.hpp @@ -105,6 +105,14 @@ struct ProblemInterpreter return problem.GetOutWidth_(); } + static auto GetInputCastType(const ProblemDescription& problem) + { + if(problem.direction.IsForward()) + return problem.GetInCastType(); + else + return problem.GetOutCastType(); + } + static int GetOutputDepthDo(const ProblemDescription& problem) { if(problem.direction.IsForward()) @@ -129,6 +137,14 @@ struct ProblemInterpreter return problem.GetInWidth_(); } + static auto GetOutputCastType(const ProblemDescription& problem) + { + if(problem.direction.IsForward()) + return problem.GetOutCastType(); + else + return problem.GetInCastType(); + } + static auto GetOutputDataType(const ProblemDescription& problem) { return problem.direction.IsForward() ? 
problem.GetOutDataType() : problem.GetInDataType(); diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index e27622bc4a..b8d72df67c 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -41,6 +41,7 @@ #include #include #include +#include namespace miopen { @@ -101,7 +102,9 @@ inline std::size_t GetTypeSize(miopenDataType_t d) case miopenHalf: case miopenBFloat16: return 2; case miopenInt8x4: - case miopenInt8: return 1; + case miopenInt8: + case miopenFloat8: + case miopenBFloat8: return 1; case miopenDouble: return 8; } MIOPEN_THROW("Unknown data type"); @@ -185,6 +188,8 @@ struct TensorDescriptor : miopenTensorDescriptor std::string GetLayout_str() const; std::size_t GetVectorLength() const; + std::optional GetCastType() const; + void SetCastType(miopenDataType_t cast_type_); std::size_t GetElementSize() const; @@ -280,7 +285,8 @@ struct TensorDescriptor : miopenTensorDescriptor bool packed; std::size_t vector_length = 1; - miopenDataType_t type = miopenFloat; + miopenDataType_t type = miopenFloat; + std::optional cast_type; miopenTensorLayout_t tensorLayout = GetDefaultLayout(); }; diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp index 8170ce5478..d26afba9dd 100644 --- a/src/include/miopen/visit_float.hpp +++ b/src/include/miopen/visit_float.hpp @@ -77,6 +77,8 @@ void visit_float(miopenDataType_t t, F f) f(as_float{}); break; } + case miopenFloat8: + case miopenBFloat8: case miopenInt8x4: case miopenInt8: { f(as_float{}); diff --git a/src/kernels/MIOpenCheckNumerics.cpp b/src/kernels/MIOpenCheckNumerics.cpp new file mode 100644 index 0000000000..827f4d1397 --- /dev/null +++ b/src/kernels/MIOpenCheckNumerics.cpp @@ -0,0 +1,205 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include + +// Copied over from naive_conv.cpp +#ifdef __HIPCC_RTC__ +#ifdef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE +/// Definitions from , conflict with +/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h. 
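The check-numerics kernels defined below fold per-thread sum/absSum/min/max statistics into a CheckNumericsResult and raise its hasZero/hasNan/hasInf flags. A minimal host-side sketch of driving the fp8 variant (illustrative only, not part of the patch; the helper name, launch geometry, and device allocations are assumptions):

    // Hypothetical helper: devBuf and devRes are device allocations owned by the caller.
    bool HasAbnormalFp8(const void* devBuf, size_t elements, CheckNumericsResult* devRes)
    {
        hipMemset(devRes, 0, sizeof(CheckNumericsResult));
        // 256 threads per block matches the kernel's fixed shared-memory reduction size.
        hipLaunchKernelGGL(check_numerics_fp8, dim3(1024), dim3(256), 0, 0,
                           devBuf, elements, devRes, /*computeStats=*/false);
        CheckNumericsResult res{};
        hipMemcpy(&res, devRes, sizeof(res), hipMemcpyDeviceToHost);
        return res.hasNan != 0 || res.hasInf != 0;
    }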
+ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef signed short int16_t; +typedef unsigned short uint16_t; +typedef float float_t; + +// std::conditional requires type_traits which has a few other things +// which result in collision with amd_hip_vector_types.h + +namespace std { +template +struct conditional; + +template +struct conditional +{ + using type = X; +}; + +template +struct conditional +{ + using type = Y; +}; + +template +using conditional_t = typename conditional::type; +} // namespace std +#else +#include // int8_t, int16_t +#include // float_t +#endif +#endif // __HIPCC_RTC__ + +#include // std::numeric_limits + +#define MIOPEN_ENABLE_F8_DEVICE_CODE 1 +#include "hip_float8.hpp" + +struct Numerics +{ + float sum; + float absSum; + float min; + float max; +}; + +struct CheckNumericsResult +{ + Numerics n; + + int hasZero; + int hasNan; + int hasInf; +}; + +__device__ void thread_redux(Numerics* stats, size_t wid) +{ + const auto lid = threadIdx.x; + if(lid < wid) + { + stats[lid].sum += stats[lid + wid].sum; + stats[lid].absSum += stats[lid + wid].absSum; + stats[lid].min = fmin(stats[lid].min, stats[lid + wid].min); + stats[lid].max = fmax(stats[lid].max, stats[lid + wid].max); + } +} + +template +__device__ void +check_numerics(const T* C_d, size_t sz, CheckNumericsResult* abnormal, bool computeStats) +{ + __shared__ Numerics stats[256]; + U sum = 0; + U absSum = 0; + T minV = std::numeric_limits::max(); + T maxV = std::numeric_limits::min(); + + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for(size_t i = offset; i < sz; i += stride) + { + T val = C_d[i]; + sum += static_cast(val); + const auto abs_val = fabs(static_cast(val)); + absSum += abs_val; + minV = min(minV, val); + maxV = max(maxV, val); + if(abs_val <= static_cast(0.0f)) + abnormal->hasZero = 1; + if(isnan(static_cast(val))) + abnormal->hasNan = 1; + if(isinf(static_cast(val))) + abnormal->hasInf = 1; + } + if(computeStats) + { + stats[threadIdx.x].sum = static_cast(sum); + stats[threadIdx.x].absSum = static_cast(absSum); + stats[threadIdx.x].min = static_cast(minV); + stats[threadIdx.x].max = static_cast(maxV); + __syncthreads(); + for(int idx = 128; idx > 0; idx = idx >> 1) + { + thread_redux(stats, idx); + __syncthreads(); + } + if(threadIdx.x == 0) + { + atomicAdd(&abnormal->n.sum, stats[0].sum); + atomicAdd(&abnormal->n.absSum, stats[0].absSum); + atomicMin(&abnormal->n.min, stats[0].min); + atomicMax(&abnormal->n.max, stats[0].max); + } + } +} + +extern "C" __global__ void check_numerics_fp32(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics(reinterpret_cast(C_d), sz, abnormal, computeStats); +} + +extern "C" __global__ void check_numerics_fp16(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics<_Float16, float>( + reinterpret_cast(C_d), sz, abnormal, computeStats); +} + +extern "C" __global__ void check_numerics_bf16(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics( + reinterpret_cast(C_d), sz, abnormal, computeStats); +} + +extern "C" __global__ void check_numerics_fp8(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics, float>( + reinterpret_cast*>(C_d), + sz, + abnormal, + computeStats); +} + +extern "C" __global__ 
void check_numerics_bf8(const void* __restrict__ C_d, + size_t sz, + CheckNumericsResult* __restrict__ abnormal, + bool computeStats) +{ + check_numerics, float>( + reinterpret_cast*>(C_d), + sz, + abnormal, + computeStats); +} diff --git a/src/kernels/MIOpenIm2d2Col.cl b/src/kernels/MIOpenIm2d2Col.cl index c64f14dad5..7b1522db6f 100644 --- a/src/kernels/MIOpenIm2d2Col.cl +++ b/src/kernels/MIOpenIm2d2Col.cl @@ -48,7 +48,15 @@ #define MIOPEN_USE_INT32 0 #endif -#if MIOPEN_USE_INT8 +#ifndef MIOPEN_USE_FP8 +#define MIOPEN_USE_FP8 0 +#endif + +#ifndef MIOPEN_USE_BFP8 +#define MIOPEN_USE_BFP8 0 +#endif + +#if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; #elif MIOPEN_USE_INT8x4 typedef uint data_t; diff --git a/src/kernels/MIOpenUtilKernels4.cl b/src/kernels/MIOpenUtilKernels4.cl index d7e0d3f789..861e563012 100644 --- a/src/kernels/MIOpenUtilKernels4.cl +++ b/src/kernels/MIOpenUtilKernels4.cl @@ -48,7 +48,15 @@ #define MIOPEN_USE_INT32 0 #endif -#if MIOPEN_USE_INT8 +#ifndef MIOPEN_USE_FP8 +#define MIOPEN_USE_FP8 0 +#endif + +#ifndef MIOPEN_USE_BFP8 +#define MIOPEN_USE_BFP8 0 +#endif + +#if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; #elif MIOPEN_USE_INT8x4 typedef uint data_t; diff --git a/src/kernels/bfloat16_dev.hpp b/src/kernels/bfloat16_dev.hpp index 84346b5d36..c1a77c90db 100644 --- a/src/kernels/bfloat16_dev.hpp +++ b/src/kernels/bfloat16_dev.hpp @@ -118,6 +118,185 @@ EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val) #endif // MIOPEN_BACKEND_HIP } +#ifndef MIOPEN_USE_FP8 +#define MIOPEN_USE_FP8 0 +#endif + +#ifndef MIOPEN_USE_BFP8 +#define MIOPEN_USE_BFP8 0 +#endif + +#if MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 +// TODO: Convert the Col2Im kernels from OpenCL to HIP and remove the following +// functions which are rewrites of the f8 header impl functions +EXECUTION_SPECIFIER float fp8_to_float_impl(uchar x, const int wm, const int we) +{ + bool negative_zero_nan = MIOPEN_FP8_IEEE_EXPONENT_BIAS ? false : true; + + const int weo = 8; + const int wmo = 23; + + float fInf, fNegInf, fNaN, fNeg0; + const uint ifInf = 0x7F800000; + const uint ifNegInf = 0xFF800000; + const uint ifNaN = 0x7F800001; + const uint ifNeg0 = 0x80000000; + fInf = *((const float*)(&ifInf)); + fNegInf = *((const float*)(&ifNegInf)); + fNaN = *((const float*)(&ifNaN)); + fNeg0 = *((const float*)(&ifNeg0)); + + if(x == 0) + return (float)(0); + + uint sign = x >> 7; + uint mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if(negative_zero_nan) + { + if(x == 0x80) + return fNaN; + } + else + { + if(x == 0x80) + return fNeg0; + if(exponent == ((1 << we) - 1)) + return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + uint retval; + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0);
+
+    // subnormal input
+    if(exponent == 0)
+    {
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        // TODO: verify __builtin_clz and OpenCL's clz do the same thing
+        int sh = 1 + clz(mantissa) - (32 - wm);
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        /*
+        exponent++;
+        while(mantissa<(1<<wm))
+        {
+            mantissa <<= 1;
+            exponent--;
+        }
+        */
+        mantissa &= ((1 << wm) - 1);
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+
+    // subnormal output
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    retval = (sign << 31) | (exponent << 23) | mantissa;
+    return *((const float*)(&retval));
+}
+
+EXECUTION_SPECIFIER float fp8_to_float(uchar x) { return fp8_to_float_impl(x, 3, 4); }
+
+EXECUTION_SPECIFIER float bfp8_to_float(uchar x) { return fp8_to_float_impl(x, 2, 5); }
+
+inline uchar float_to_fp8_impl(float _x, const int wm, const int we) // bool stoch, uint rng)
+{
+    bool negative_zero_nan = MIOPEN_FP8_IEEE_EXPONENT_BIAS ? false : true;
+    bool clip = MIOPEN_FP8_CLIPPING;
+
+    // Conserve the logic for stochastic rounding:
+    bool stoch = false;
+    uint rng = 0;
+    const int mfmt = 23;
+    uint x;
+    x = *((uint*)(&_x));
+
+    uint head, mantissa;
+    int exponent;
+    uint sign;
+
+    head = x & 0xFF800000;
+    mantissa = x & 0x7FFFFF;
+    exponent = (head >> 23) & 0xFF;
+    sign = head >> 31;
+
+    uint signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    if(negative_zero_nan)
+    {
+        if((x & 0x7F800000) == 0x7F800000)
+            return 0x80;
+    }
+    else
+    {
+        if((x & 0x7F800000) == 0x7F800000)
+            return signed_inf + (mantissa != 0 ? 1 : 0);
+    }
+    if(x == 0)
+        return 0;
+
+    uint drop_mask = (1 << (mfmt - wm)) - 1;
+    const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
+    const int exp_low_cutoff = (128) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+
+    exponent -= exp_low_cutoff - 1;
+    if(exponent <= 0)
+        drop_mask = (1 << (mfmt - wm + 1 - exponent)) - 1;
+    mantissa += 1 << mfmt;
+    mantissa += (stoch ? rng : mantissa) & drop_mask;
+    if(mantissa >= (2 << mfmt))
+    {
+        mantissa >>= 1;
+        exponent++;
+    }
+    mantissa >>= (mfmt - wm);
+
+    if(exponent <= 0)
+    {
+        if(x == 0) // cppcheck-suppress identicalConditionAfterEarlyExit
+            return 0;
+        else
+        {
+            // subnormal range; represented by a subnormal float8 (exponent 0)
+            // and involves loss of accuracy
+            mantissa >>= 1 - exponent;
+            exponent = 0;
+        }
+    }
+    // above range: quantize to maximum possible float of the same sign
+    else if(exponent > max_exp)
+    {
+        if(clip)
+        {
+            mantissa = (1 << wm) - 1;
+            exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+    if(exponent == 0 && mantissa == 0)
+        return negative_zero_nan ?
0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (exponent << wm) | mantissa; +} + +EXECUTION_SPECIFIER uchar float_to_fp8(float _x) // bool stoch, uint rng) +{ + return float_to_fp8_impl(_x, 3, 4); +} + +EXECUTION_SPECIFIER uchar float_to_bfp8(float _x) // bool stoch, uint rng) +{ + return float_to_fp8_impl(_x, 2, 5); +} +#endif // MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 + #ifdef __cplusplus } #endif diff --git a/src/kernels/float_types.h b/src/kernels/float_types.h index a4c3b654ca..897e95ca67 100644 --- a/src/kernels/float_types.h +++ b/src/kernels/float_types.h @@ -33,6 +33,51 @@ #define TWO 2 #define FOUR 4 #define EIGHT 8 +#if MIOPEN_USE_FP8 == 1 +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT hip_f8 +#define FLOAT_ACCUM float +// HIP implements the correct operators for conversion + +#else +#define _FLOAT uchar +#define _FLOAT_ACCUM float +// OpenCL requires explicit functions +#define CVT_FLOAT2ACCUM(x) fp8_to_float(x) +#define CVT_ACCUM2FLOAT(x) float_to_fp8(x) +#endif +#define SIZEOF_FLOAT 1 +// Max value for the main datatype +#define MAX_VAL 0x7F +// Max value for accumulator +// #ifndef FLT_MAX +// #define MAX_VAL_ACCUM 3.402823466e+38F +// #else +// #define MAX_VAL_ACCUM FLT_MAX +// #endif +#endif // MIOPEN_USE_FP8 + +#if MIOPEN_USE_BFP8 == 1 +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT hip_f8 +#define FLOAT_ACCUM float +#else +#define _FLOAT uchar +#define _FLOAT_ACCUM float +// OpenCL requires explicit functions +#define CVT_FLOAT2ACCUM(x) bfp8_to_float(x) +#define CVT_ACCUM2FLOAT(x) float_to_bfp8(x) +#endif +#define SIZEOF_FLOAT 1 +// Max value for the main datatype +#define MAX_VAL 0x7F +// Max value for accumulator +// #ifndef FLT_MAX +// #define MAX_VAL_ACCUM 3.402823466e+38F +// #else +// #define MAX_VAL_ACCUM FLT_MAX +// #endif +#endif // MIOPEN_USE_BFP8 #ifndef __HIP_PLATFORM_HCC__ #define _FLOAT2 PPCAT(_FLOAT, TWO) diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h new file mode 100644 index 0000000000..3bac0a31f7 --- /dev/null +++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h @@ -0,0 +1,63 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#define CAT_I(a, b) a##b +#define CAT(a, b) CAT_I(a, b) + +#ifndef INPUT_TYPE +#define INPUT_TYPE half +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE half +#endif + +#ifndef WEIGHTS_TYPE +#define WEIGHTS_TYPE half +#endif + +#ifndef INPUT_CAST_TYPE +#define INPUT_CAST_TYPE float8 +#endif + +#ifndef WEIGHTS_CAST_TYPE +#define WEIGHTS_CAST_TYPE float8 +#endif + +#ifndef OUTPUT_CAST_TYPE +#define OUTPUT_CAST_TYPE float8 +#endif + +#ifndef ACCUMULATOR_TYPE +#define ACCUMULATOR_TYPE double +#endif + +#define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE)) + +#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX) +#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX) +#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX) diff --git a/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp new file mode 100644 index 0000000000..e6b2945beb --- /dev/null +++ b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp @@ -0,0 +1,571 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#include +#include +#endif + +// Copied over from naive_conv.cpp +#ifdef __HIPCC_RTC__ +#ifdef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE +/// Definitions from , conflict with +/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h. 
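For orientation, a worked expansion of the fp8_kern_types.h macros above (the -D values are illustrative; the build system, not this header, chooses them): compiling with -DINPUT_TYPE=float8 -DWEIGHTS_TYPE=float8 -DOUTPUT_TYPE=float8 yields KERNEL_NAME_SUFFIX = float8_float8_float8, so the exported entry points become:

    FWD_KERNEL_NAME -> naive_conv_fwd_nchw_float8_float8_float8
    BWD_KERNEL_NAME -> naive_conv_bwd_nchw_float8_float8_float8
    WRW_KERNEL_NAME -> naive_conv_wrw_nchw_float8_float8_float8

The two-level CAT/CAT_I pair forces the arguments to be macro-expanded before token pasting; a single-level ## would paste the literal tokens INPUT_TYPE and OUTPUT_TYPE instead.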
+
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef unsigned short uint16_t;
+typedef float float_t;
+
+// std::conditional requires type_traits which has a few other things
+// which result in collision with amd_hip_vector_types.h
+
+namespace std {
+template
+struct conditional;
+
+template
+struct conditional
+{
+    using type = X;
+};
+
+template
+struct conditional
+{
+    using type = Y;
+};
+
+template
+using conditional_t = typename conditional::type;
+} // namespace std
+#else
+#include // int8_t, int16_t
+#include // float_t
+#endif
+#endif // __HIPCC_RTC__
+
+#include // std::numeric_limits
+
+#define MIOPEN_ENABLE_F8_DEVICE_CODE 1
+#include "hip_float8.hpp"
+
+#include "fp8_kern_types.h"
+
+using float8 = miopen_f8::hip_f8;
+using bfloat8 = miopen_f8::hip_f8;
+
+template
+inline __device__ uint32_t draft_rng(T x, uint32_t seed)
+{
+    int i = threadIdx.x + blockIdx.x * blockDim.x;
+    typedef typename std::conditional::type IT;
+    IT tmp = *(reinterpret_cast(&x));
+    uint32_t drop_bits = uint32_t(tmp) & 0xFFFFu;
+    if(sizeof(tmp) == 4)
+        drop_bits ^= tmp >> 16;
+    drop_bits = ((drop_bits & 31) << 11) | (drop_bits >> 5);
+    drop_bits *= 0x7000149;
+    uint32_t rng = (drop_bits ^ 0x13371337 ^ (i * 229791) ^ seed);
+    return rng;
+}
+
+template
+inline __device__ TO cast_number(const TI input, miopen_f8::hip_f8_rounding_mode mode, uint32_t rng)
+{
+    if(std::is_same::value)
+    {
+        return input;
+    }
+    if(sizeof(TI) == sizeof(TO))
+    {
+        const auto tmp = static_cast(input);
+        return TO{tmp, mode, rng};
+    }
+    else if(sizeof(TO) > sizeof(TI))
+    {
+        return static_cast(input);
+    }
+    else
+    {
+        return TO{input, mode, rng};
+    }
+}
+
+template
+inline __device__ void naive_conv_fwd_nchw(const TI* __restrict__ p_in,
+                                           const TW* __restrict__ p_wei,
+                                           TO* __restrict__ p_out,
+                                           const int hi,
+                                           const int wi,
+                                           const int n,
+                                           const int k_per_group,
+                                           const int c_per_group,
+                                           const int ho,
+                                           const int wo,
+                                           const int sy,
+                                           const int sx,
+                                           const int dy,
+                                           const int dx,
+                                           const int py,
+                                           const int px,
+                                           const int fy,
+                                           const int fx,
+                                           const int group,
+                                           bool stoch,
+                                           uint32_t seed)
+{
+    /*
+     * need to compute total output pixel: `group * n * k_per_group * ho * wo`.
+     * to distribute this workload, let one workgroup compute `ho * wo` pixel,
+     * hence need `group * n * k_per_group` workgroups (grid_size).
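     * For a concrete sense of scale (illustrative numbers, not taken from the
     * patch): with group=4, n=2, k_per_group=8 and a 28x28 output, the launch
     * needs 4*2*8 = 64 workgroups, and the 256 threads of each workgroup stride
     * over the 28*28 = 784 output pixels of its (group, batch, k) slice.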
+ */ + int k = k_per_group * group; + int c = c_per_group * group; + int thread_length = ho * wo; + int bid = blockIdx.x; + int ik = bid % k_per_group; + int in = (bid / k_per_group) % n; + int ig = bid / (n * k_per_group); + + p_in += static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; + p_out += static_cast(in) * k * ho * wo + + static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + + for(int tid = threadIdx.x; tid < thread_length; tid += 256) + { + int iho = tid / wo; + int iwo = tid % wo; + + TACC value = .0f; + + for(int ic = 0; ic < c_per_group; ic++) + { + for(int iy = 0; iy < fy; iy++) + { + int valid_h = 1; + int cur_h = sy * iho - py + dy * iy; + if(cur_h < 0 || cur_h >= hi) + valid_h &= 0; + for(int ix = 0; ix < fx; ix++) + { + int valid_w = 1; + int cur_w = sx * iwo - px + dx * ix; + if(cur_w < 0 || cur_w >= wi) + valid_w &= 0; + + if(valid_w & valid_h) + { + size_t i_idx = static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + static_cast(cur_w); + size_t f_idx = static_cast(ic) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + uint32_t rng1 = 0; + uint32_t rng2 = 0; + auto rnd_mode = miopen_f8::hip_f8_rounding_mode::standard; + if(stoch) + { + rng1 = draft_rng(p_in[i_idx], seed); + rng2 = draft_rng(p_in[f_idx], seed); + rnd_mode = miopen_f8::hip_f8_rounding_mode::stochastic; + } + const auto item_in = in_cast_type(p_in[i_idx], rnd_mode, rng1); + const auto item_wei = wei_cast_type(p_wei[f_idx], rnd_mode, rng2); + value += static_cast(item_in) * static_cast(item_wei); + } + } + } + } + size_t o_idx = static_cast(iho) * wo + static_cast(iwo); + // p_out[o_idx] = __float2half(static_cast(value)); + p_out[o_idx] = static_cast(value); + } +} + +extern "C" __global__ void FWD_KERNEL_NAME(const INPUT_TYPE* __restrict__ p_in, + + const WEIGHTS_TYPE* __restrict__ p_wei, + OUTPUT_TYPE* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stochastic, + uint32_t seed) +{ + // instantiate the kernel + naive_conv_fwd_nchw(p_in, + p_wei, + p_out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + stochastic, + seed); +} + +template +inline __device__ void naive_conv_bwd_nchw(TI* __restrict__ p_in, + const TW* __restrict__ p_wei, + const TO* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stoch, + uint32_t seed) +{ + /* + * need to compute total input pixel: `group * n * c_per_group * hi * wi`. + * to distribute this workload, let one workgroup compute `hi * wi` pixel, + * hence need `group * n * c_per_group` workgroups (grid_size). 
+ */ + int k = k_per_group * group; + int c = c_per_group * group; + int thread_length = hi * wi; + int bid = blockIdx.x; + int ic = bid % c_per_group; + int in = (bid / c_per_group) % n; + int ig = bid / (n * c_per_group); + + p_in += static_cast(in) * c * hi * wi + + static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ic) * fy * fx; + p_out += + static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + + for(int tid = threadIdx.x; tid < thread_length; tid += 256) + { + int ihi = tid / wi; + int iwi = tid % wi; + + TACC value = .0f; + + for(int ik = 0; ik < k_per_group; ik++) + { + for(int iy = 0; iy < fy; iy++) + { + int valid_h = 1; + int cur_ho = ihi + py - dy * iy; // cur_h = sy*iho-py+dy*iy; + if(cur_ho < 0 || cur_ho % sy) + valid_h &= 0; + cur_ho /= sy; + if(cur_ho >= ho) + valid_h &= 0; + for(int ix = 0; ix < fx; ix++) + { + int valid_w = 1; + int cur_wo = iwi + px - dx * ix; // cur_w = sx*iwo-px+dx*ix; + if(cur_wo < 0 || cur_wo % sx) + valid_w &= 0; + cur_wo /= sx; + if(cur_wo >= wo) + valid_w &= 0; + + if(valid_h & valid_w) + { + size_t o_idx = static_cast(ik) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + size_t f_idx = static_cast(ik) * c_per_group * fy * fx + + static_cast(iy) * fx + static_cast(ix); + uint32_t rng1 = 0; + uint32_t rng2 = 0; + auto rnd_mode = miopen_f8::hip_f8_rounding_mode::standard; + if(stoch) + { + rng1 = draft_rng(p_out[o_idx], seed); + rng2 = draft_rng(p_wei[f_idx], seed); + rnd_mode = miopen_f8::hip_f8_rounding_mode::stochastic; + } + const auto item_out = out_cast_type(p_out[o_idx], rnd_mode, rng1); + const auto item_wei = wei_cast_type(p_wei[f_idx], rnd_mode, rng2); + value += static_cast(item_out) * static_cast(item_wei); + } + } + } + } + size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); + p_in[i_idx] = static_cast(value); + } +} + +extern "C" __global__ void BWD_KERNEL_NAME(INPUT_TYPE* __restrict__ p_in, + const WEIGHTS_TYPE* __restrict__ p_wei, + const OUTPUT_TYPE* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stochastic, + uint32_t seed) +{ + // instantiate template + naive_conv_bwd_nchw(p_in, + p_wei, + p_out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + stochastic, + seed); +} + +template +inline __device__ void naive_conv_wrw_nchw(const TI* __restrict__ p_in, + TW* __restrict__ p_wei, + const TO* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stoch, + uint32_t seed) +{ + /* + * need to compute total filter pixel: `group * k_per_group * c_per_group * + * fy * fx`. + * to distribute this workload, let one workgroup compute `c_per_group * fy + * * fx` pixel, + * hence need `group * k_per_group` workgroups (grid_size). 
+ */ + int k = k_per_group * group; + int c = c_per_group * group; + int thread_length = c_per_group * fy * fx; + int bid = blockIdx.x; + int ik = bid % k_per_group; + int ig = bid / k_per_group; + + p_in += static_cast(ig) * c_per_group * hi * wi; + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; + p_out += static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + + for(int tid = threadIdx.x; tid < thread_length; tid += 256) + { + int ix = tid % fx; + int iy = (tid / fx) % fy; + int ic = tid / (fx * fy); + + TACC value = .0f; + + for(int in = 0; in < n; in++) + { + for(int iho = 0; iho < ho; iho++) + { + int valid_h = 1; + int cur_h = sy * iho - py + dy * iy; + if(cur_h < 0 || cur_h >= hi) + valid_h &= 0; + for(int iwo = 0; iwo < wo; iwo++) + { + int valid_w = 1; + int cur_w = sx * iwo - px + dx * ix; + if(cur_w < 0 || cur_w >= wi) + valid_w &= 0; + + if(valid_h & valid_w) + { + size_t i_idx = static_cast(in) * c * hi * wi + + static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + static_cast(cur_w); + size_t o_idx = static_cast(in) * k * ho * wo + + static_cast(iho) * wo + static_cast(iwo); + uint32_t rng1 = 0; + uint32_t rng2 = 0; + auto rnd_mode = miopen_f8::hip_f8_rounding_mode::standard; + if(stoch) + { + rng1 = draft_rng(p_in[i_idx], seed); + rng2 = draft_rng(p_out[o_idx], seed); + rnd_mode = miopen_f8::hip_f8_rounding_mode::stochastic; + } + const auto item_in = in_cast_type(p_in[i_idx], rnd_mode, rng1); + const auto item_out = out_cast_type(p_out[o_idx], rnd_mode, rng2); + value += static_cast(item_in) * static_cast(item_out); + } + } + } + } + size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + p_wei[f_idx] = static_cast(value); + } +} + +extern "C" __global__ void WRW_KERNEL_NAME(const INPUT_TYPE* __restrict__ p_in, + WEIGHTS_TYPE* __restrict__ p_wei, + const OUTPUT_TYPE* __restrict__ p_out, + int hi, + int wi, + int n, + int k_per_group, + int c_per_group, + int ho, + int wo, + int sy, + int sx, + int dy, + int dx, + int py, + int px, + int fy, + int fx, + int group, + bool stochastic, + uint32_t seed) +{ + naive_conv_wrw_nchw(p_in, + p_wei, + p_out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + stochastic, + seed); +} diff --git a/src/kernels/hip_f8_impl.hpp b/src/kernels/hip_f8_impl.hpp new file mode 100644 index 0000000000..c7a62f9f72 --- /dev/null +++ b/src/kernels/hip_f8_impl.hpp @@ -0,0 +1,361 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+// #include
+// #include
+namespace miopen_hip_f8_impl {
+
+#ifndef __HIP_PLATFORM_HCC__
+using hip_bfloat16 = bfloat16;
+using half = half_float::half;
+#endif
+
+template
+MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8_no_range_reduce(T _x,
+                                                          bool stoch = false,
+                                                          uint32_t rng = 0)
+{
+    static_assert(we == 5, "we==5");
+    static_assert(sizeof(T) == 2, "no_range_reduce only works for float16");
+
+    uint32_t x = *(reinterpret_cast(&_x));
+
+    uint32_t head, mantissa, exponent;
+    uint32_t sign;
+
+    const int mfmt = 10;
+    head = x & 0xFC00;
+    mantissa = x & 0x3FF;
+    exponent = (head >> 10) & 0x1F;
+    sign = head >> 15;
+    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    if((x & 0x7FFF) == 0x7C00)
+        return signed_inf;
+    if((x & 0x7C00) == 0x7C00)
+        return signed_inf + 1;
+    if(x == 0)
+        return 0;
+    if(x == 0x8000)
+        return 0x80;
+
+    // uint32_t nextbit = 1<<(mfmt-wm-1);
+    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+
+    // const int max_exp = (1<<we) - 1;
+    mantissa += (stoch ? rng : mantissa) & drop_mask;
+    if(mantissa >= (2 << mfmt))
+    {
+        mantissa >>= 1;
+        exponent++;
+    }
+    else if(mantissa >= (1 << mfmt) && exponent == 0)
+    {
+        exponent++;
+    }
+    mantissa >>= (mfmt - wm);
+    mantissa &= (1 << wm) - 1;
+    if(exponent == 31)
+        return (sign << 7) | 0x7B;
+    return (sign << 7) | (exponent << wm) | mantissa;
+}
+
+template
+MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch, uint32_t rng)
+{
+    constexpr bool is_half  = std::is_same::value;
+    constexpr bool is_float = std::is_same::value;
+    static_assert(wm + we == 7, "wm+we==7");
+    static_assert(is_half || is_float, "Only half and float can be cast to f8");
+
+    if(sizeof(T) == 2 && we == 5 && !negative_zero_nan)
+        return cast_to_f8_no_range_reduce<2, 5, half>(static_cast(_x), stoch, rng);
+
+    const int mfmt = (sizeof(T) == 4) ? 23 : 10;
+    uint32_t x;
+    if(sizeof(T) == 4)
+        x = *(reinterpret_cast(&_x)); // cppcheck-suppress invalidPointerCast
+    else
+        x = *(reinterpret_cast(&_x)); // cppcheck-suppress invalidPointerCast
+
+    uint32_t head, mantissa;
+    int exponent, bias;
+    uint32_t sign;
+
+    if(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+    }
+
+    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    if(negative_zero_nan)
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return 0x80;
+        }
+        else
+        {
+            // if(__hisinf(x) || __hisnan(x))
+            if((x & 0x7C00) == 0x7C00)
+                return 0x80;
+        }
+    }
+    else
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+        else
+        {
+            if((x & 0x7C00) == 0x7C00)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+    }
+    if(x == 0)
+        return 0;
+    // First need to check if it is normal or denorm as there is a difference of implicit 1.
+    // Then need to adjust the exponent to align with the F8 exponent and, in the meanwhile,
+    // shift the mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And
+    // for RNE, no need to add rng. Then probably need to check whether there is carry and
+    // adjust exponent and mantissa again.
+
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
+    const int f8_bias                = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here; we are mostly
+           concerned with fp16. In this case, f8 is usually in denormal. But there could be
+           exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has exponent
+           bias 16. It means that there are some numbers in fp16 denormal but they are bf8
+           (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers where
+           exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO)
+           normal. In this case, the fp16 mantissa should be shifted left by 1 */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
+               For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+               actual exponent is -7, it is actually larger due to the implicit 1.
+               Therefore it needs to be adjusted to -6 and the mantissa shifted right by 1.
+               So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        { // both fp32/fp16 and f8 are in normal range
+            exponent_diff =
+                0; // exponent_diff=0 does not mean there is no difference for this case,
+                   // act_exponent could be larger. Just that it does not need to shift mantissa
+        }
+        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
+    }
+    const long tmp = (mfmt - wm + exponent_diff);
+    if(tmp == 33)
+        printf("Gotcha");
+
+    bool midpoint = (mantissa & ((static_cast(1) << (mfmt - wm + exponent_diff)) - 1)) ==
+                    (static_cast(1) << (mfmt - wm + exponent_diff - 1));
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before
+       we shift right, as shifting right could rip off some residual part and make something
+       that is not midpoint look like midpoint. For example, the fp16 number 0x1002
+       (0 00100 0000000010) is larger than midpoint, but after shift right by 4 bits, it would
+       look like midpoint.
+    */
+
+    if(exponent_diff > 0)
+        mantissa >>= exponent_diff;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1 << mfmt);
+    // if there is no implicit 1, it means the f8 is denormal and we need to adjust to the
+    // denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+
+    // Now we have the exponent and mantissa adjusted
+    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1 << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa += (stoch ? rng : (midpoint ? (odd ?
mantissa : mantissa - 1) : mantissa)) & drop_mask; + + // Now we deal with overflow + if(f8_exponent == 0) + { + if((1 << mfmt) & mantissa) + { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } + else + { + if((1 << (mfmt + 1)) & mantissa) + { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); + if(f8_exponent > max_exp) + { + if(clip) + { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } + else + { + return signed_inf; + } + } + + if(f8_exponent == 0 && mantissa == 0) + return negative_zero_nan ? 0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +template +MIOPEN_HIP_HOST_DEVICE T cast_from_f8(uint8_t x) +{ + constexpr bool is_half = std::is_same::value; + constexpr bool is_float = std::is_same::value; + static_assert(is_half || is_float, "only half and float are supported"); + + constexpr int weo = is_half ? 5 : 8; + constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7); + + T fInf, fNegInf, fNaN, fNeg0; + if(is_half) + { + const uint16_t ihInf = 0x7C00; + const uint16_t ihNegInf = 0xFC00; + const uint16_t ihNaN = 0x7C01; + const uint16_t ihNeg0 = 0x8000; + fInf = *(reinterpret_cast(&ihInf)); + fNegInf = *(reinterpret_cast(&ihNegInf)); + fNaN = *(reinterpret_cast(&ihNaN)); + fNeg0 = *(reinterpret_cast(&ihNeg0)); + } + else if(is_float) + { + const uint32_t ifInf = 0x7F800000; + const uint32_t ifNegInf = 0xFF800000; + const uint32_t ifNaN = 0x7F800001; + const uint32_t ifNeg0 = 0x80000000; + fInf = *(reinterpret_cast(&ifInf)); // cppcheck-suppress invalidPointerCast + fNegInf = + *(reinterpret_cast(&ifNegInf)); // cppcheck-suppress invalidPointerCast + fNaN = *(reinterpret_cast(&ifNaN)); // cppcheck-suppress invalidPointerCast + fNeg0 = *(reinterpret_cast(&ifNeg0)); // cppcheck-suppress invalidPointerCast + } + + if(x == 0) + return static_cast(0); + + uint32_t sign = x >> 7; + uint32_t mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if(negative_zero_nan) + { + if(x == 0x80) + return fNaN; + } + else + { + if(x == 0x80) + return fNeg0; + if(exponent == ((1 << we) - 1)) + return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + typename std::conditional::type retval; + if(we == 5 && is_half && !negative_zero_nan) + { + retval = x << 8; + return *(reinterpret_cast(&retval)); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0); + + // subnormal input + if(exponent == 0) + { + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + __builtin_clz(mantissa) - (32 - wm); + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1 << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if(exponent <= 0) + { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if(sizeof(T) == 2) + retval = (sign << 15) | (exponent << 10) | mantissa; + else + retval = (sign << 31) | (exponent << 23) | mantissa; + return *(reinterpret_cast(&retval)); +} + +} // namespace miopen_hip_f8_impl diff --git a/src/kernels/hip_float8.hpp b/src/kernels/hip_float8.hpp new file mode 100644 index 0000000000..dd57c9ca5b --- /dev/null +++ b/src/kernels/hip_float8.hpp @@ -0,0 +1,651 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once +#ifndef MIOPEN_ENABLE_F8_DEVICE_CODE +#define MIOPEN_ENABLE_F8_DEVICE_CODE 0 +#endif + +// FP8 header version 0.4, 2021/05/11 +#if defined __HIP_PLATFORM_HCC__ && MIOPEN_ENABLE_F8_DEVICE_CODE +// MIOpen by default does not have device code in the regular compilation paths, +// therefore, when this file is used from the host side, compilation takes much +// longer. By guarding the __device__ directive we can control that such compilation +// only happens for kernels which include this file. 
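Before the implementation details, an illustrative host-side sketch of the type this header defines further below (not part of the patch; the helper name and rng source are assumptions): quantize a float to fp8 with stochastic rounding, then widen it back.

    #include "hip_float8.hpp" // MIOPEN_ENABLE_F8_DEVICE_CODE stays 0 in a host-only TU

    float quantize_roundtrip_fp8(float x, uint32_t rng)
    {
        using fp8 = miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>;
        const fp8 q(x, miopen_f8::hip_f8_rounding_mode::stochastic, rng);
        return static_cast<float>(q); // widens back through operator float()
    }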
+#define MIOPEN_HIP_HOST_DEVICE __host__ __device__ +#else +#define MIOPEN_HIP_HOST_DEVICE +#endif + +#define USE_SIMPLER_HIP_F8x8 0 + +#ifndef MIOPEN_FP8_CLIPPING +#define MIOPEN_FP8_CLIPPING 1 +#endif + +#ifndef MIOPEN_FP8_IEEE_EXPONENT_BIAS +#define MIOPEN_FP8_IEEE_EXPONENT_BIAS 1 +#endif + +namespace miopen_hip_f8_impl { + +template +MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch = false, uint32_t rng = 0); + +template +MIOPEN_HIP_HOST_DEVICE T cast_from_f8(uint8_t x); + +} // namespace miopen_hip_f8_impl + +#include "hip_f8_impl.hpp" + +namespace miopen_f8 { +enum class hip_f8_type +{ + bf8 = 0, // 1:5:2 + fp8 = 1 // 1:4:3 +}; + +enum class hip_f8_rounding_mode +{ + standard, + stochastic +}; + +inline MIOPEN_HIP_HOST_DEVICE bool get_hip_f8_bias_mode() +{ +#if MIOPEN_FP8_IEEE_EXPONENT_BIAS + return false; +#else + return true; +#endif +} + +template +struct hip_f8 +{ + uint8_t data; + + // default constructor + MIOPEN_HIP_HOST_DEVICE hip_f8() = default; + + MIOPEN_HIP_HOST_DEVICE hip_f8(hip_f8 const&) = default; + + // constructor from bits + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(uint8_t v) { data = v; } + + // constructor from in + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(int v) : hip_f8(static_cast(v)) {} + + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(double v) : hip_f8(static_cast(v)) {} + + // constructor from float + explicit MIOPEN_HIP_HOST_DEVICE + hip_f8(float v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0) + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + float, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + float, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + float, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + float, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + } + + // constructor from half + explicit MIOPEN_HIP_HOST_DEVICE + hip_f8(half v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0) + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + half, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<2, + 5, + half, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + half, + true /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + else + { + data = miopen_hip_f8_impl::cast_to_f8<3, + 4, + half, + false /*negative_zero_nan*/, + MIOPEN_FP8_CLIPPING /*clip*/>( + v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng); + } + } + } + template + explicit MIOPEN_HIP_HOST_DEVICE + 
hip_f8(hip_f8 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0) + { + if(T == U) + { + data = v.data; + } + else + { + const auto tmp = static_cast(v); + const auto tmp2 = hip_f8(tmp, rm, rng); + data = tmp2.data; + } + } + + explicit MIOPEN_HIP_HOST_DEVICE hip_f8(hip_f8 v, hip_f8_rounding_mode, uint32_t) + { + this->data = v.data; + } + + // constructor from hip_bfloat16 + explicit MIOPEN_HIP_HOST_DEVICE + hip_f8(hip_bfloat16 v, + hip_f8_rounding_mode r = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + + MIOPEN_HIP_HOST_DEVICE + hip_f8& operator*=(const hip_f8& rhs) + { + const auto tmp = static_cast(*this) * static_cast(rhs); + *this = static_cast(tmp); + return *this; + } + + MIOPEN_HIP_HOST_DEVICE + hip_f8& operator+=(const hip_f8& rhs) + { + const auto tmp = static_cast(*this) + static_cast(rhs); + *this = static_cast(tmp); + return *this; + } + + MIOPEN_HIP_HOST_DEVICE + hip_f8& operator-=(const hip_f8& rhs) + { + const auto tmp = static_cast(*this) - static_cast(rhs); + *this = static_cast(tmp); + return *this; + } + + inline MIOPEN_HIP_HOST_DEVICE hip_f8& operator=(const hip_f8& rhs) + { + if(&rhs != this) + this->data = rhs.data; + return *this; + } + + inline MIOPEN_HIP_HOST_DEVICE bool operator==(const hip_f8& rhs) const + { + if((rhs.is_zero() && this->is_zero()) || + (fabs(rhs - *this) < std::numeric_limits>::epsilon())) + return true; + else if(rhs.is_nan() || rhs.is_inf() || this->is_nan() || this->is_inf()) + return false; + + return false; + } + + inline MIOPEN_HIP_HOST_DEVICE bool operator<(const hip_f8& rhs) const + { + const auto we = static_cast(*this); + const auto them = static_cast(rhs); + return we < them; + } + + inline MIOPEN_HIP_HOST_DEVICE bool operator>(const hip_f8& rhs) const + { + const auto we = static_cast(*this); + const auto them = static_cast(rhs); + return we > them; + } +#if 0 + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator double() + { + // float tmp = static_cast(*this); + // return tmp; + } + + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator double() const + { + // float tmp = static_cast(*this); + // return tmp; + } +#endif + // convert to float + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator float() const + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, float, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, float, false /*negative_zero_nan*/>( + data); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, float, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, float, false /*negative_zero_nan*/>( + data); + } + } + } + + // convert to half + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator half() const + { + if(T == hip_f8_type::bf8) + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, half, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<2, 5, half, false /*negative_zero_nan*/>( + data); + } + } + else /* fp8*/ + { + if(get_hip_f8_bias_mode()) + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, half, true /*negative_zero_nan*/>( + data); + } + else + { + return miopen_hip_f8_impl::cast_from_f8<3, 4, half, false /*negative_zero_nan*/>( + data); + } + } + } + + // convert to hip_bfloat16 + /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator hip_bfloat16() 
const;
+
+    // check for zero
+    inline MIOPEN_HIP_HOST_DEVICE bool is_zero() const
+    {
+        if(get_hip_f8_bias_mode())
+        {
+            return data == 0x00;
+        }
+        else
+        {
+            return (data == 0x00) || (data == 0x80);
+        }
+    }
+
+    // check for nan
+    inline MIOPEN_HIP_HOST_DEVICE bool is_nan() const
+    {
+        if(get_hip_f8_bias_mode())
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == hip_f8_type::bf8)
+            {
+                return (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+            else
+            {
+                return (data == 0x79) || (data == 0x7a) || (data == 0x7b) || (data == 0x7c) ||
+                       (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xf9) ||
+                       (data == 0xfa) || (data == 0xfb) || (data == 0xfc) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+        }
+    }
+
+    // check for inf
+    inline MIOPEN_HIP_HOST_DEVICE bool is_inf() const
+    {
+        if(get_hip_f8_bias_mode())
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == hip_f8_type::bf8)
+            {
+                return (data == 0x7c) || (data == 0xfc);
+            }
+            else
+            {
+                return (data == 0x78) || (data == 0xf8);
+            }
+        }
+    }
+}; // end of class hip_f8
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator*(miopen_f8::hip_f8<T> lhs,
+                                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    lhs *= rhs;
+    return lhs;
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator+(miopen_f8::hip_f8<T> lhs,
+                                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    lhs += rhs;
+    return lhs;
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator-(miopen_f8::hip_f8<T> lhs,
+                                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    lhs -= rhs;
+    return lhs;
+}
+
+template <miopen_f8::hip_f8_type T, typename U>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator-(U lhs, const miopen_f8::hip_f8<T>& rhs)
+{
+    const auto tmp = static_cast<U>(rhs);
+    return static_cast<miopen_f8::hip_f8<T>>(lhs - tmp);
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE bool operator<(const miopen_f8::hip_f8<T>& lhs,
+                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    return static_cast<float>(lhs) < static_cast<float>(rhs);
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE bool operator>(const miopen_f8::hip_f8<T>& lhs,
+                                             const miopen_f8::hip_f8<T>& rhs)
+{
+    return static_cast<float>(lhs) > static_cast<float>(rhs);
+}
+
+template <miopen_f8::hip_f8_type T>
+inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> fabs(miopen_f8::hip_f8<T> v)
+{
+    v.data = v.data & 0x7f;
+    return v;
+}
+template <class T>
+MIOPEN_HIP_HOST_DEVICE T F8_Max()
+{
+    union
+    {
+        uint8_t bits;
+        T value;
+    } x;
+
+    x.bits = 0x7F;
+    return x.value;
+}
+} // namespace miopen_f8
+
+// define numeric limits for the new data type
+namespace std {
+inline bool isfinite(miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> x) // NOLINT
+{
+    // finite means neither inf nor nan
+    return !(x.is_inf() || x.is_nan());
+}
+
+inline bool isfinite(miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> x) // NOLINT
+{
+    // finite means neither inf nor nan
+    return !(x.is_inf() || x.is_nan());
+}
+
+template <>
+class numeric_limits<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>
+{
+public:
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> epsilon()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(float(0.0625));
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> quiet_NaN()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(
+            static_cast<uint8_t>(miopen_f8::get_hip_f8_bias_mode() ? 0X80 : 0x79));
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> max()
+    {
+        return miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>();
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> min()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(-1.0f) *
+               miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>();
+    }
+};
+
+template <>
+class numeric_limits<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>
+{
+public:
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> epsilon()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(float(0.125));
+    }
+
+    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> quiet_NaN()
+    {
+        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(
+            static_cast<uint8_t>(miopen_f8::get_hip_f8_bias_mode() ?
0X80 : 0x7d)); + } + + static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8 max() + { + return static_cast>( + miopen_f8::F8_Max>()); + } + static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8 min() + { + return static_cast>(-1.0f) * + miopen_f8::F8_Max>(); + } +}; + +} // namespace std + +template +struct hip_f8x4 +{ + // define some convenience types + using float32x2 = float __attribute__((ext_vector_type(2))); + using float32x4 = float __attribute__((ext_vector_type(4))); + + using halfx2 = _Float16 __attribute__((ext_vector_type(2))); + using halfx4 = _Float16 __attribute__((ext_vector_type(4))); + + using hip_bfloat16x2 = uint16_t __attribute__((ext_vector_type(2))); + using hip_bfloat16x4 = uint16_t __attribute__((ext_vector_type(4))); + + uint32_t data; + + // default constructor + MIOPEN_HIP_HOST_DEVICE hip_f8x4() = default; + + // constructor from bits + MIOPEN_HIP_HOST_DEVICE hip_f8x4(uint32_t v); + + // constructor from float + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(float v0, + float v1 = 0, + float v2 = 0, + float v3 = 0, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(float32x2 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(float32x4 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + + // constructor from half + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(half v0, + half v1 = {}, + half v2 = {}, + half v3 = {}, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(halfx2 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + MIOPEN_HIP_HOST_DEVICE + hip_f8x4(halfx4 v, + miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard, + uint32_t rng = 0); + + // convert to float32x4 + inline MIOPEN_HIP_HOST_DEVICE operator float32x4() const; + + // convert to halfx4 + inline MIOPEN_HIP_HOST_DEVICE operator halfx4() const; +}; + +template +struct hip_f8x8 +{ + // define some convenience types + using f8x8 = hip_f8x4 __attribute__((ext_vector_type(2))); + + f8x8 data; + + // default constructor + MIOPEN_HIP_HOST_DEVICE hip_f8x8() = default; + + // do we need to define other constructors or any conversion routines here? +}; + +// If we do not end up needing either any constructors or conversion routines for the above type, +// then we can simplify the above type to the following +#if USE_SIMPLER_HIP_F8x8 +template +using hip_f8x8 = hip_f8x4 __attribute__((ext_vector_type(2))); +#endif + +using hip_float32x4 = float __attribute__((ext_vector_type(4))); +using hip_float32x16 = float __attribute__((ext_vector_type(16))); + +// these are device-specific and we don't expect them to exist unless we're compiling with hip-clang +// for MI300. +template +__device__ hip_float32x4 mfma_f32_16x16x32(hip_f8x8 a, hip_f8x8 b, hip_float32x4 c); + +template +__device__ hip_float32x16 mfma_f32_32x32x16(hip_f8x8 a, hip_f8x8 b, hip_float32x16 c); + +using float8 = miopen_f8::hip_f8; +using bfloat8 = miopen_f8::hip_f8; diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp index 8d63f21054..9c7bff6992 100644 --- a/src/ocl/tensorocl.cpp +++ b/src/ocl/tensorocl.cpp @@ -1472,12 +1472,12 @@ void SetTensor(const Handle& handle, std::multiplies()); std::size_t wld = 256 < wgd ? 
256 : wgd; - - std::string parms = "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" + - GetDataTypeKernelParams(dataType); + std::stringstream ss; + ss << "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_SET" + << GetDataTypeKernelParams(dataType); for(int i = 0; i < yDim_flat; ++i) { - parms += " -DWORK_LENGTH_" + std::to_string(i) + "=" + std::to_string(worker_sizes[i]); + ss << " -DWORK_LENGTH_" << std::to_string(i) << "=" << std::to_string(worker_sizes[i]); } kernel = handle.AddKernel(kernel_name, @@ -1486,7 +1486,7 @@ void SetTensor(const Handle& handle, kernel_name, {wld, 1, 1}, {wgd, 1, 1}, - parms); + ss.str()); } switch(yDim_flat) @@ -1934,6 +1934,10 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio case miopenHalf: return option += "2"; case miopenFloat: return option += "3"; case miopenBFloat16: return option += "4"; + case miopenFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenFloat8 data type not supported in cast tensor."); + case miopenBFloat8: + MIOPEN_THROW(miopenStatusBadParm, "miopenBFloat8 data type not supported in cast tensor."); case miopenDouble: // TODO MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); diff --git a/src/pooling.cpp b/src/pooling.cpp index ab928ffd81..91a27f324a 100644 --- a/src/pooling.cpp +++ b/src/pooling.cpp @@ -131,8 +131,8 @@ PoolingDescriptor::GetForwardOutputDim(const TensorDescriptor& xDesc) const assert(stride_h > 0); assert(stride_w > 0); - assert(window_h < (input_h + 2 * pad_h)); - assert(window_w < (input_w + 2 * pad_w)); + assert(window_h < (input_h + static_cast(2) * pad_h)); + assert(window_w < (input_w + static_cast(2) * pad_w)); auto output_h = std::max( 1, ((input_h + 2 * static_cast(pad_h) - window_h) / stride_h + 1)); diff --git a/src/pooling_api.cpp b/src/pooling_api.cpp index 8bfd1ac64d..ef526804cf 100644 --- a/src/pooling_api.cpp +++ b/src/pooling_api.cpp @@ -53,6 +53,8 @@ inline void Pooling_logging_cmd(const miopenPoolingDescriptor_t poolDesc, case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_LOG_W( "Pooing cmd args logging is not implemented properly for " + diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp index 6cb00bc82b..6cf29c2f64 100644 --- a/src/reducetensor.cpp +++ b/src/reducetensor.cpp @@ -208,6 +208,8 @@ inline int GetDataTypeSize(miopenDataType_t t) case miopenHalf: return (2); case miopenFloat: return (4); case miopenDouble: return (8); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: return (1); case miopenInt8x4: return (4); case miopenBFloat16: return (2); @@ -268,8 +270,10 @@ inline int GetDataTypeId(miopenDataType_t t) case miopenDouble: return (static_cast('D')); case miopenInt8: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenInt32: return (static_cast('O')); - default: MIOPEN_THROW("Only float, half, bfloat16 data type is supported."); + default: MIOPEN_THROW("Only float, half, bfloat16, float8, bfloat8 data type is supported."); }; }; @@ -307,6 +311,8 @@ static ck::DataTypeEnum_t mapDataTypeId(miopenDataType_t t) case miopenInt8: return DataTypeEnum_t::Int8; case miopenInt8x4: return DataTypeEnum_t::Int8x4; case miopenInt32: return DataTypeEnum_t::Int32; + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW("Only float, half, double data type is supported."); }; }; @@ -720,6 +726,9 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle, " -DCK_PARAM_DST_DATATYPE=" + 
std::to_string(detailStatic::GetDataTypeId(dstDataType)); param += " -DCK_PARAM_REDUCE_COMPTYPE=" + std::to_string(detailStatic::GetDataTypeId(compType)); + param += + " -DMIOPEN_FP8_IEEE_EXPONENT_BIAS=" + std::to_string(MIOPEN_FP8_IEEE_EXPONENT_BIAS); + param += " -DMIOPEN_FP8_CLIPPING" + std::to_string(MIOPEN_FP8_CLIPPING); param += " -DCK_PARAM_SRC_DESC_LENGTHS="; for(int i = 0; i < inDescLengths.size(); i++) diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 5de1c43c3c..186bc28ff2 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -201,6 +201,8 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context, case miopenInt32: case miopenInt8: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); } return false; @@ -243,6 +245,8 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context, case miopenInt8: case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); } }; diff --git a/src/solver/conv_MP_bidirectional_winograd.cpp b/src/solver/conv_MP_bidirectional_winograd.cpp index fc2d769520..5d638469d5 100644 --- a/src/solver/conv_MP_bidirectional_winograd.cpp +++ b/src/solver/conv_MP_bidirectional_winograd.cpp @@ -329,6 +329,9 @@ bool ConvMPBidirectWinograd::IsA return false; } + if(problem.IsTensorsCasted()) + return false; + if(!IsApplicableGEMM(problem)) return false; diff --git a/src/solver/conv_asm_1x1u.cpp b/src/solver/conv_asm_1x1u.cpp index b71d195667..4da4f6dc02 100644 --- a/src/solver/conv_asm_1x1u.cpp +++ b/src/solver/conv_asm_1x1u.cpp @@ -537,6 +537,9 @@ bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx, if(!(problem.IsFp32() || problem.IsFp16())) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; @@ -551,6 +554,9 @@ bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; + if(name == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; diff --git a/src/solver/conv_asm_1x1u_bias_activ_fused.cpp b/src/solver/conv_asm_1x1u_bias_activ_fused.cpp index c4ae30f859..c935a2aff6 100644 --- a/src/solver/conv_asm_1x1u_bias_activ_fused.cpp +++ b/src/solver/conv_asm_1x1u_bias_activ_fused.cpp @@ -94,9 +94,9 @@ bool ConvBiasActivAsm1x1U::IsValidPerformanceConfig( PerformanceConfigConvBiasActivAsm1x1U ConvBiasActivAsm1x1U::Search(const FusionContext& context, const FusionDescription& problem, - const AnyInvokeParams& invoke_ctx) const + const AnyInvokeParams& invoke_params) const { - return GenericSearch(*this, context, problem, invoke_ctx); + return GenericSearch(*this, context, problem, invoke_params); } ConvSolution @@ -256,6 +256,9 @@ bool ConvBiasActivAsm1x1U::IsApplicable(const FusionContext& context, if(conv_problem.GetDilationH() != 1) return false; + if(conv_problem.IsTensorsCasted()) + return false; + // Check if the conovlution part is applicable return sol.IsApplicable(conv_ctx, conv_problem); } diff --git a/src/solver/conv_asm_1x1u_stride2.cpp b/src/solver/conv_asm_1x1u_stride2.cpp index 9b3dd0462d..ba23b9d9dc 100644 --- a/src/solver/conv_asm_1x1u_stride2.cpp +++ b/src/solver/conv_asm_1x1u_stride2.cpp @@ -496,6 +496,9 @@ bool ConvAsm1x1UV2::IsApplicable(const 
ConvolutionContext& ctx, if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; @@ -510,6 +513,9 @@ bool ConvAsm1x1UV2::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; + const auto elements_in_dword = 4 / GetTypeSize(problem.GetInDataType()); // clang-format off const auto img_hw = problem.GetOutHeight_() * problem.GetOutWidth_(); diff --git a/src/solver/conv_asm_3x3u.cpp b/src/solver/conv_asm_3x3u.cpp index 1ebb39c84f..b185b959af 100644 --- a/src/solver/conv_asm_3x3u.cpp +++ b/src/solver/conv_asm_3x3u.cpp @@ -188,6 +188,9 @@ bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx, if(target.Xnack() && *target.Xnack()) return false; + if(problem.IsTensorsCasted()) + return false; + const std::string name = ctx.GetStream().GetDeviceName(); if(!(StartsWith(name, "gfx8") || StartsWith(name, "gfx90"))) return false; @@ -196,6 +199,9 @@ bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; + constexpr auto GIB = static_cast(1024) * 1024 * 1024; constexpr auto TIB = GIB * 1024; constexpr auto ELEM_SZ = static_cast(sizeof(float)); diff --git a/src/solver/conv_asm_5x10u2v2b1.cpp b/src/solver/conv_asm_5x10u2v2b1.cpp index fe0f0f42a3..6da4863f6d 100644 --- a/src/solver/conv_asm_5x10u2v2b1.cpp +++ b/src/solver/conv_asm_5x10u2v2b1.cpp @@ -74,6 +74,8 @@ bool ConvAsm5x10u2v2b1::IsApplicable(const ExecutionContext& ctx, { return false; } + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Min image + padding shall be not smaller than filter matrix. const int min_out_width = 138; diff --git a/src/solver/conv_asm_5x10u2v2f1.cpp b/src/solver/conv_asm_5x10u2v2f1.cpp index 09e2d2abed..74301fe5fd 100644 --- a/src/solver/conv_asm_5x10u2v2f1.cpp +++ b/src/solver/conv_asm_5x10u2v2f1.cpp @@ -76,6 +76,9 @@ bool ConvAsm5x10u2v2f1::IsApplicable(const ExecutionContext& ctx, return false; } + if(problem.IsTensorsCasted()) + return false; + // Min image + padding shall be not smaller than filter matrix. 
const int min_in_width = static_cast(problem.GetWeightsWidth_()) - problem.GetPadW() * 2; const int min_in_height = static_cast(problem.GetWeightsHeight_()) - problem.GetPadH() * 2; diff --git a/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp b/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp index 4310a87fd6..4426a3eeca 100644 --- a/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp +++ b/src/solver/conv_asm_7x7c3h224w224k64u2v2p3q3f1.cpp @@ -51,6 +51,9 @@ bool ConvAsm7x7c3h224w224k64u2v2p3q3f1::IsApplicable(const ExecutionContext& ctx if(!ctx.rmv.IsV2orV3()) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; diff --git a/src/solver/conv_asm_dir_BwdWrW1x1.cpp b/src/solver/conv_asm_dir_BwdWrW1x1.cpp index 4cd78b7357..b6b2458157 100644 --- a/src/solver/conv_asm_dir_BwdWrW1x1.cpp +++ b/src/solver/conv_asm_dir_BwdWrW1x1.cpp @@ -484,6 +484,9 @@ bool ConvAsmBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx, if(!ctx.rmv.IsV2orV3()) return false; + if(problem.IsTensorsCasted()) + return false; + const auto target = ctx.GetStream().GetTargetProperties(); if(target.Xnack() && *target.Xnack()) return false; diff --git a/src/solver/conv_asm_dir_BwdWrW3x3.cpp b/src/solver/conv_asm_dir_BwdWrW3x3.cpp index 2781d25d07..7b0b0567d8 100644 --- a/src/solver/conv_asm_dir_BwdWrW3x3.cpp +++ b/src/solver/conv_asm_dir_BwdWrW3x3.cpp @@ -378,6 +378,9 @@ bool ConvAsmBwdWrW3x3::IsApplicable(const ConvolutionContext& ctx, { return false; } + + if(problem.IsTensorsCasted()) + return false; #if WORKAROUND_ISSUE_532 if(StartsWith(name, "gfx9") && (problem.GetKernelStrideW() > 1 || problem.GetKernelStrideH() > 1)) diff --git a/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp b/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp index dbfdb0b69c..9e41d56c82 100644 --- a/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp +++ b/src/solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp @@ -149,6 +149,9 @@ bool ConvAsmImplicitGemmV4R1DynamicBwd::IsApplicable(const ExecutionContext& ctx if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp index caa26d1f23..ee6b16d38b 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd.cpp @@ -995,6 +995,9 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlops::IsApplicable(const ExecutionContext if(!problem.IsFp32() && !problem.IsFp16()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp index 41a2b018fa..71c53f61a9 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp @@ -942,6 +942,9 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable( !(problem.IsBfp16() && (device_name == "gfx90a" || StartsWith(device_name, "gfx94")))) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp index 081e12a532..32b50167cf 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd.cpp @@ -1521,6 +1521,9 @@ bool 
ConvAsmImplicitGemmGTCDynamicFwdXdlops::IsApplicable(const ExecutionContext if(!problem.IsFp32() && !problem.IsFp16()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp index c52372b6d2..bbedf8d680 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp @@ -567,6 +567,9 @@ bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsApplicable( !(problem.IsFp16() && problem.GetVectorLength() == 8)) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp index 601b2e1211..e315fd0895 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp @@ -880,6 +880,9 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable( !(problem.IsBfp16() && (device_name == "gfx90a" || StartsWith(device_name, "gfx94")))) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp index 35de228c45..c8dee39a79 100644 --- a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp +++ b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp @@ -870,6 +870,9 @@ bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable( !(problem.IsBfp16() && (device_name == "gfx90a" || StartsWith(device_name, "gfx94")))) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp b/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp index 7846179475..8e1450c7a3 100644 --- a/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp +++ b/src/solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp @@ -295,6 +295,9 @@ bool ConvAsmImplicitGemmV4R1DynamicFwd::IsApplicable(const ExecutionContext& ctx if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp b/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp index 3594b26277..a5d056178d 100644 --- a/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp +++ b/src/solver/conv_asm_implicit_gemm_wrw_gtc_dynamic_xdlops.cpp @@ -836,6 +836,9 @@ bool ConvAsmImplicitGemmGTCDynamicWrwXdlops::IsApplicable(const ExecutionContext if(!problem.IsFp32() && !problem.IsFp16()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp b/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp index 35020df9b6..fb5f0caf7c 100644 --- a/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp +++ b/src/solver/conv_asm_implicit_gemm_wrw_v4r1_dynamic.cpp @@ -318,6 +318,9 @@ bool ConvAsmImplicitGemmV4R1DynamicWrw::IsApplicable(const ExecutionContext& ctx if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(!ctx.rmv.IsV3()) return false; diff --git a/src/solver/conv_bin_wino3x3U.cpp b/src/solver/conv_bin_wino3x3U.cpp index e42aab012c..c8508cf372 100644 --- a/src/solver/conv_bin_wino3x3U.cpp +++ b/src/solver/conv_bin_wino3x3U.cpp @@ -69,6 +69,9 @@ bool 
ConvBinWinograd3x3U::IsApplicable(const ExecutionContext& ctx, return false; } + if(problem.IsTensorsCasted()) + return false; + // clang-format off return problem.GetPadW() == 1 && problem.GetPadH() == 1 diff --git a/src/solver/conv_bin_winoRxS.cpp b/src/solver/conv_bin_winoRxS.cpp index 8b42bf5899..eb4d7386f1 100644 --- a/src/solver/conv_bin_winoRxS.cpp +++ b/src/solver/conv_bin_winoRxS.cpp @@ -222,6 +222,9 @@ bool ConvBinWinogradRxS::IsApplicable(const ExecutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS{})) return false; if(problem.direction.IsBackwardWrW()) diff --git a/src/solver/conv_bin_winoRxS_fused.cpp b/src/solver/conv_bin_winoRxS_fused.cpp index f184b0e291..f11edc368e 100644 --- a/src/solver/conv_bin_winoRxS_fused.cpp +++ b/src/solver/conv_bin_winoRxS_fused.cpp @@ -87,6 +87,9 @@ bool ConvBinWinogradRxSFused::IsApplicable(const FusionContext& context, size_t padded_y = 0; size_t padded_x = 0; + + if(conv_problem.IsTensorsCasted()) + return false; if(conv_problem.GetKernelStrideH() == 1) { if(y <= 3) diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp index cdb4225b88..ed975285ee 100644 --- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp +++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp @@ -282,6 +282,8 @@ void PerformanceConfigConvCKIgemmFwdBiasActivFused::HeuristicInit( switch(conv_problem.GetInDataType()) { case miopenHalf: Init(conv_problem); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: @@ -335,6 +337,8 @@ bool PerformanceConfigConvCKIgemmFwdBiasActivFused::IsValid( switch(conv_problem.GetInDataType()) { case miopenHalf: return CheckIsSupportCKArgs(conv_problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: @@ -406,6 +410,9 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx, if(activ_op.activMode != miopenActivationRELU) return false; const auto conv_problem = fdesc_problem.GetConvProblem(0, conv::Direction::Forward); + + if(conv_problem.IsTensorsCasted()) + return false; if(conv_problem.GetConv().attribute.deterministic) return false; if(conv_problem.GetInDataType() != conv_problem.GetWeightsDataType() || @@ -422,6 +429,8 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx, switch(conv_problem.GetInDataType()) { case miopenHalf: return CheckCKApplicability(conv_problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: @@ -454,6 +463,8 @@ ConvSolution ConvCKIgemmFwdBiasActivFused::GetSolution( case miopenHalf: RunCKSolution(handle, primitive_parameters, conv_problem, config); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenFloat: case miopenInt32: diff --git a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp index 0c743ee9d7..7ddf2e3049 100644 --- a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp +++ b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp @@ -105,6 +105,9 @@ bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() or problem.IsFp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() != 1) return false; if(ctx.GetStream().GetTargetProperties().Name() == "gfx90a" && diff 
--git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 51e7f0a641..64c95257e6 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include namespace miopen { @@ -133,7 +135,14 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) else MIOPEN_THROW("unsupported tensor layout"); - if(IsInputFp32(problem)) + if(problem.IsFp8() || problem.IsTensorsCasted() || problem.IsBfp8()) + { + kernel_name << miopen::GetDataType(ProblemInterpreter::GetInputDataType(problem)); + kernel_name << "_" << miopen::GetDataType(problem.GetWeightsDataType()); + kernel_name << "_" << miopen::GetDataType(ProblemInterpreter::GetOutputDataType(problem)); + return kernel_name.str(); + } + else if(IsInputFp32(problem)) kernel_name << "float_"; else if(IsInputFp16(problem)) kernel_name << "half_"; @@ -167,18 +176,56 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) return kernel_name.str(); } -std::string ConvDirectNaiveConvKernelFile() { return "naive_conv.cpp"; } +std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx, + const ProblemDescription& problem) +{ + const auto device_name = ctx.GetStream().GetDeviceName(); + // The above function, ConvDirectNaiveConvKernelName is not in sync for the asm kernel, + // resulting in empty code objects. This happens for systems with COv3 as the default type. + // if(device_name == "gfx906" || device_name == "gfx908") + // { + // if(ctx.rmv.IsV3() && problem.IsLayoutDefault() && !problem.IsFp8() && + // !problem.IsTensorsCasted() && !problem.IsBfp8()) + // return "naive_conv_gcn.s"; + // } + if(problem.IsFp8() || problem.IsTensorsCasted() || problem.IsBfp8()) + return "fp8_naive_conv.cpp"; + return "naive_conv.cpp"; +} -std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx) +std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx, + const ProblemDescription& problem) { - std::string filename = ConvDirectNaiveConvKernelFile(); + std::string filename = ConvDirectNaiveConvKernelFile(ctx, problem); if(miopen::EndsWith(filename, ".s")) { std::ostringstream options; GenerateClangDefsym(options, "ROCM_METADATA_VERSION", 5); return options.str(); } - return ctx.general_compile_options; + std::ostringstream ss; + ss << ctx.general_compile_options; + if(problem.IsFp8() || problem.IsTensorsCasted() || problem.IsBfp8()) + { + ss << " -DINPUT_TYPE=" + << miopen::GetDataType(ProblemInterpreter::GetInputDataType(problem)); + ss << " -DWEIGHTS_TYPE=" << miopen::GetDataType(problem.GetWeightsDataType()); + ss << " -DOUTPUT_TYPE=" + << miopen::GetDataType(ProblemInterpreter::GetOutputDataType(problem)); + const auto in_cast_type = problem.GetInCastType(); + if(in_cast_type) + ss << " -DINPUT_CAST_TYPE=" << miopen::GetDataType(*in_cast_type); + const auto wei_cast_type = problem.GetWeightsCastType(); + if(wei_cast_type) + ss << " -DWEIGHTS_CAST_TYPE=" << miopen::GetDataType(*(wei_cast_type)); + const auto out_cast_type = ProblemInterpreter::GetOutputCastType(problem); + if(out_cast_type) + ss << " -DOUTPUT_CAST_TYPE=" << miopen::GetDataType(*out_cast_type); + ss << " -DMIOPEN_FP8_CLIPPING=" << MIOPEN_FP8_CLIPPING; + ss << " -DMIOPEN_FP8_IEEE_EXPONENT_BIAS=" << MIOPEN_FP8_IEEE_EXPONENT_BIAS; + // Let the kernel choose its accumulator (double for naive kernels ) + } + return ss.str(); } bool ConvDirectNaiveConvIsApplicableByKernelType(const 
ExecutionContext& ctx, diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index 077fe550bc..c5d793860c 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -47,11 +47,29 @@ bool ConvDirectNaiveConvBwd::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsLayoutDefault() && !problem.IsLayoutNHWC()) return false; - if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) + if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsFp8() || + problem.IsBfp8())) return false; if(!problem.direction.IsBackwardData()) return false; + if(problem.IsTensorsCasted()) + { + auto test_cast = [&](const TensorDescriptor& desc) { + if(desc.GetCastType()) + { + const auto cast_type = *desc.GetCastType(); + if(cast_type == miopenFloat8 || cast_type == miopenBFloat8) + return false; + } + // all tested tensors must have cast type set + return true; + }; + if(test_cast(problem.GetOut())) + return false; + if(test_cast(problem.GetWeights())) + return false; + } return true; } @@ -104,7 +122,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; - kernel.kernel_file = ConvDirectNaiveConvKernelFile(); + kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -116,7 +134,13 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx); + const auto is_f8 = [&]() { + if(kernel.kernel_file == "fp8_naive_conv.cpp") + return true; + else + return false; + }(); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); if(problem.Is2d()) result.invoker_factory = [=](const std::vector& kernels) { @@ -125,26 +149,49 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - - handle.Run(kern)(tensors.out, - tensors.w, - tensors.in, - hi, - wi, - n, - k_per_group, - c_per_group, - ho, - wo, - sy, - sx, - dy, - dx, - py, - px, - fy, - fx, - group); + if(is_f8) + handle.Run(kern)(tensors.out, + tensors.w, + tensors.in, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + problem.GetConv().attribute.fp8rounding_mode.Get() == + miopenF8RoundingModeStochastic, + problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + else + handle.Run(kern)(tensors.out, + tensors.w, + tensors.in, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group); if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index b3b2da870c..fc8d8e77fd 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -47,12 +47,30 @@ bool ConvDirectNaiveConvFwd::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsLayoutDefault() && !problem.IsLayoutNHWC()) return false; - if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsInt8())) + if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsInt8() || + problem.IsFp8() || problem.IsBfp8())) return false; 
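// For orientation: the fp8 path of these naive solvers is specialized twice.
// At compile time, ConvDirectNaiveConvCompileOption (previous file) emits
// -DINPUT_TYPE / -DWEIGHTS_TYPE / -DOUTPUT_TYPE, the optional *_CAST_TYPE
// defines, and the MIOPEN_FP8_CLIPPING / MIOPEN_FP8_IEEE_EXPONENT_BIAS macros.
// At run time, the is_f8 invoker branches above extend the 19 arguments of the
// non-f8 call (3 tensor pointers plus 16 geometry ints) with two trailing
// parameters: a bool selecting stochastic rounding and the RNG seed. A sketch
// of the matching device entry point in fp8_naive_conv.cpp (the name and
// parameter names are illustrative, not copied from this patch):
//
//   extern "C" __global__ void
//   naive_conv_fwd_nchw_float8_float8_float(
//       /* 3 tensor pointers and 16 geometry ints, as in the non-f8 kernels */
//       bool stochastic, // fp8rounding_mode.Get() == miopenF8RoundingModeStochastic
//       uint32_t seed);  // fp8rounding_mode.GetSeed()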
if(!problem.direction.IsForward()) return false; + if(problem.IsTensorsCasted()) + { + auto test_cast = [&](const TensorDescriptor& desc) { + if(desc.GetCastType()) + { + const auto cast_type = *desc.GetCastType(); + if(cast_type == miopenFloat8 || cast_type == miopenBFloat8) + return false; + } + // all tested tensors must have cast type set + return true; + }; + if(test_cast(problem.GetIn())) + return false; + if(test_cast(problem.GetWeights())) + return false; + } return true; } @@ -104,7 +122,13 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; - kernel.kernel_file = ConvDirectNaiveConvKernelFile(); + kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); + const auto is_f8 = [&]() { + if(kernel.kernel_file == "fp8_naive_conv.cpp") + return true; + else + return false; + }(); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -116,7 +140,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); if(problem.Is2d()) result.invoker_factory = [=](const std::vector& kernels) { @@ -125,26 +149,53 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - - handle.Run(kern)(tensors.in, - tensors.w, - tensors.out, - hi, - wi, - n, - k_per_group, - c_per_group, - ho, - wo, - sy, - sx, - dy, - dx, - py, - px, - fy, - fx, - group); + if(is_f8) + { + handle.Run(kern)(tensors.in, + tensors.w, + tensors.out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + problem.GetConv().attribute.fp8rounding_mode.Get() == + miopenF8RoundingModeStochastic, + problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } + else + { + handle.Run(kern)(tensors.in, + tensors.w, + tensors.out, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index f25d3a3baa..2c85949ad4 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -47,11 +47,29 @@ bool ConvDirectNaiveConvWrw::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsLayoutDefault() && !problem.IsLayoutNHWC()) return false; - if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) + if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16() || problem.IsFp8() || + problem.IsBfp8())) return false; if(!problem.direction.IsBackwardWrW()) return false; + if(problem.IsTensorsCasted()) + { + auto test_cast = [&](const TensorDescriptor& desc) { + if(desc.GetCastType()) + { + const auto cast_type = *desc.GetCastType(); + if(cast_type == miopenFloat8 || cast_type == miopenBFloat8) + return false; + } + // all tested tensors must have cast type set + return true; + }; + if(test_cast(problem.GetIn())) + return false; + if(test_cast(problem.GetOut())) + return false; + } return true; } @@ -91,7 +109,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, KernelInfo kernel; - kernel.kernel_file = ConvDirectNaiveConvKernelFile(); + 
kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -103,7 +121,13 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + const auto is_f8 = [&]() { + if(kernel.kernel_file == "fp8_naive_conv.cpp") + return true; + else + return false; + }(); if(problem.Is2d()) result.invoker_factory = [=](const std::vector& kernels) { @@ -112,26 +136,49 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx, decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; - - handle.Run(kern)(tensors.x, - tensors.dw, - tensors.dy, - hi, - wi, - n, - k_per_group, - c_per_group, - ho, - wo, - sy, - sx, - dy, - dx, - py, - px, - fy, - fx, - group); + if(is_f8) + handle.Run(kern)(tensors.x, + tensors.dw, + tensors.dy, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group, + problem.GetConv().attribute.fp8rounding_mode.Get() == + miopenF8RoundingModeStochastic, + problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + else + handle.Run(kern)(tensors.x, + tensors.dw, + tensors.dy, + hi, + wi, + n, + k_per_group, + c_per_group, + ho, + wo, + sy, + sx, + dy, + dx, + py, + px, + fy, + fx, + group); if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp index 0b310ea71a..0b880b2fc8 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp @@ -212,6 +212,8 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -253,6 +255,8 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -303,6 +307,8 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( problem.GetWeightsDataType() != problem.GetOutDataType() || problem.GetInDataType() != problem.GetOutDataType()) return false; + if(problem.IsTensorsCasted()) + return false; if(!problem.direction.IsBackwardData()) return false; if(!problem.Is3d()) @@ -320,6 +326,8 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -347,6 +355,8 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmBwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 9b7079df1d..80b969fcbd 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -211,6 +211,8 @@ void 
PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( case miopenInt8: Init(problem); break; case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenBFloat16: case miopenDouble: break; } @@ -252,6 +254,8 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid( case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenBFloat16: case miopenDouble: break; } @@ -319,6 +323,8 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: case miopenInt8x4: + case miopenFloat8: + case miopenBFloat8: case miopenBFloat16: case miopenDouble: break; } @@ -347,6 +353,8 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 89081fa339..7292828f69 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -208,6 +208,8 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -249,6 +251,8 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -318,6 +322,8 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -344,6 +350,8 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: default: MIOPEN_THROW(miopenStatusInternalError, diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp index 11b3d4bfbb..7e380f6289 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp @@ -168,6 +168,8 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit( { case miopenHalf: Init(problem); break; case miopenFloat: Init(problem); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt32: case miopenInt8x4: @@ -208,6 +210,8 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid( { case miopenHalf: return CheckIsSupportCKArgs(problem); case miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt32: case miopenInt8x4: @@ -262,6 +266,9 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable( problem.GetWeightsDataType() != problem.GetOutDataType() || problem.GetInDataType() != problem.GetOutDataType()) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.direction.IsBackwardData()) return false; if(!problem.Is2d()) @@ -283,6 +290,8 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable( { case miopenHalf: return 
CheckCKApplicability(problem); case miopenFloat: return CheckCKApplicability(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt32: case miopenInt8x4: @@ -312,6 +321,8 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp index ec3ea2fc65..c389cb0cee 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp @@ -649,6 +649,9 @@ bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ConvolutionContext& ctx, if(!(problem.IsFp32() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() != 1) return false; if(!IsIndexRangeLargeEnough(problem)) diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp index 1075556d39..5b533d72ee 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp @@ -784,6 +784,9 @@ bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ConvolutionContext if(!problem.Is2d()) return false; + if(problem.IsTensorsCasted()) + return false; + if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp index 79250a6b61..65f8cf6675 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp @@ -754,6 +754,9 @@ bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ConvolutionContext& ctx, if(!problem.IsFp32()) return false; + if(problem.IsTensorsCasted()) + return false; + if(problem.GetGroupCount() != 1) return false; diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp index 8a147585ab..3e85e4c966 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp @@ -840,6 +840,9 @@ bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsApplicable(const ConvolutionContext return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!IsApplicableXdlops(ctx, problem)) return false; if(!IsIndexRangeLargeEnough(problem)) diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp index 99edfd139d..773f5a1d32 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp @@ -66,6 +66,9 @@ bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ConvolutionContext& ctx, if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; + if(problem.IsTensorsCasted()) + return false; + std::size_t n = problem.GetBatchSize_(); std::size_t k = problem.GetOutChannels_() / problem.GetGroupCount(); std::size_t c = problem.GetInChannels_() / problem.GetGroupCount(); @@ -88,6 +91,8 @@ bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1{})) return false; + 
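// For reference, the IsTensorsCasted() predicate consulted throughout these
// IsApplicable checks is added to the convolution problem description by this
// patch. A minimal sketch of its likely shape, mirroring the truthiness tests
// on GetCastType() used elsewhere in this diff (illustrative, not copied from
// the patch):
//
//   bool ProblemDescription::IsTensorsCasted() const
//   {
//       return GetIn().GetCastType() || GetWeights().GetCastType() ||
//              GetOut().GetCastType();
//   }
//
// Solvers without FP8-aware kernels reject such problems up front.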
if(ThisSolverIsDeprecatedStatic::IsDisabled(ctx)) + return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; if(!problem.direction.IsBackwardWrW()) @@ -104,6 +109,8 @@ bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, return false; if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; + if(problem.IsTensorsCasted()) + return false; // retrieve dimension from ProblemDescription // remember: ProblemDescription has swapped some dimensions for you! diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp index 15f247e7d3..07fbae436e 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp @@ -598,6 +598,9 @@ bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ConvolutionContext& ctx, if(!IsIndexRangeLargeEnough(problem)) return false; + if(problem.IsTensorsCasted()) + return false; + int gemm_m = 0; int gemm_n = 0; int gemm_k = 0; diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp index edcce82e68..795e3d1704 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp @@ -990,6 +990,9 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ConvolutionContext if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + if(problem.IsTensorsCasted()) + return false; + if(!problem.direction.IsForward()) return false; diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp index 95fed60757..7fa139d21a 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp @@ -1047,6 +1047,9 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsApplicable( if(!ctx.use_hip_kernels) return false; + if(problem.IsTensorsCasted()) + return false; + if(!IsComposableKernelSupportedHardware(ctx)) return false; diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp index 71f97ff3b2..9bd59d36b9 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp @@ -1020,6 +1020,9 @@ bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ConvolutionContext if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + if(problem.IsTensorsCasted()) + return false; + const auto y = ProblemInterpreter::GetFilterHeightY(problem); const auto x = ProblemInterpreter::GetFilterWidthX(problem); diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp index 762893bf9e..ae2395dd0a 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp @@ -169,6 +169,8 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit( case miopenInt8: Init(problem); break; case miopenHalf: Init(problem); break; case miopenFloat: Init(problem); break; + case miopenFloat8: + case miopenBFloat8: case miopenInt32: case miopenInt8x4: case miopenBFloat16: @@ -210,6 +212,8 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid( case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenHalf: return CheckIsSupportCKArgs(problem); case 
miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt32: case miopenInt8x4: case miopenBFloat16: @@ -278,6 +282,8 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() > 1) return false; switch(problem.GetInDataType()) @@ -285,6 +291,8 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable( case miopenInt8: return CheckCKApplicability(problem); case miopenHalf: return CheckCKApplicability(problem); case miopenFloat: return CheckCKApplicability(problem); + case miopenFloat8: + case miopenBFloat8: case miopenInt32: case miopenInt8x4: case miopenBFloat16: @@ -315,6 +323,8 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp index 1763d30d6b..b51031780f 100644 --- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp @@ -200,6 +200,8 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::HeuristicInit( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -240,6 +242,8 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::IsValid( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -284,6 +288,8 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable( #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL if(miopen::IsDisabled(MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS{})) return false; + if(problem.IsTensorsCasted()) + return false; if(problem.GetConv().attribute.deterministic) return false; if(problem.GetInDataType() != problem.GetWeightsDataType() || @@ -307,6 +313,8 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable( case miopenInt32: case miopenInt8x4: case miopenBFloat16: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: break; } #endif @@ -334,6 +342,8 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution( case miopenInt8x4: case miopenBFloat16: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: default: MIOPEN_THROW(miopenStatusInternalError, "ConvHipImplicitGemmFwdXdlops operation not implemented for this data type"); diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp index 331e8a14c2..0a33f611c0 100644 --- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp +++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp @@ -596,6 +596,9 @@ bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ConvolutionContext& ctx, return false; if(!problem.IsFp32()) return false; + + if(problem.IsTensorsCasted()) + return false; if(problem.GetGroupCount() != 1) return false; if(!IsIndexRangeLargeEnough(problem)) diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp index f6c4847551..070ad3615f 100644 --- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp @@ -1072,6 +1072,9 @@ bool 
ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ConvolutionContext& ct if(!IsIndexRangeLargeEnough(problem)) return false; + if(problem.IsTensorsCasted()) + return false; + if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp index 9f46af0245..7b4295df35 100644 --- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp +++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp @@ -1132,6 +1132,9 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsApplicable( if(!problem.Is2d()) return false; + if(problem.IsTensorsCasted()) + return false; + if(ctx.GetStream().GetDeviceName() == "gfx90a" && problem.IsGfx90aFp16altRequired()) return false; diff --git a/src/solver/conv_mlir_igemm_bwd.cpp b/src/solver/conv_mlir_igemm_bwd.cpp index 6fa2b2e7f4..58787c2532 100644 --- a/src/solver/conv_mlir_igemm_bwd.cpp +++ b/src/solver/conv_mlir_igemm_bwd.cpp @@ -49,6 +49,8 @@ bool ConvMlirIgemmBwd::IsApplicable(const ConvolutionContext& ctx, return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Note: ConvMlirIgemmBwd can run on a machine with xdlops support, however, it is // guaranteed to be slower than its xdlops alternative, therefore disabling it to // save compilation overhead diff --git a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp index c55d89464a..a4a9549db8 100644 --- a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp @@ -50,6 +50,8 @@ bool ConvMlirIgemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!problem.direction.IsBackwardData()) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; diff --git a/src/solver/conv_mlir_igemm_fwd.cpp b/src/solver/conv_mlir_igemm_fwd.cpp index 3a1eb3068d..c4983852b1 100644 --- a/src/solver/conv_mlir_igemm_fwd.cpp +++ b/src/solver/conv_mlir_igemm_fwd.cpp @@ -169,6 +169,8 @@ bool ConvMlirIgemmFwd::IsApplicable(const ConvolutionContext& ctx, return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Note: ConvMlirIgemmFwd can run on a machine with xdlops support, however, it is // guaranteed to be slower than its xdlops alternative, therefore disabling it to // save compilation overhead diff --git a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp index 692b2aeba2..8256213623 100644 --- a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp @@ -66,6 +66,8 @@ bool ConvMlirIgemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; return MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, problem, true)); #else std::ignore = ctx; diff --git a/src/solver/conv_mlir_igemm_wrw.cpp b/src/solver/conv_mlir_igemm_wrw.cpp index bb1e1229b2..d5f88dcaea 100644 --- a/src/solver/conv_mlir_igemm_wrw.cpp +++ b/src/solver/conv_mlir_igemm_wrw.cpp @@ -50,6 +50,8 @@ bool ConvMlirIgemmWrW::IsApplicable(const ConvolutionContext& ctx, return false; 
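// Note the two distinct FP8 conditions being rejected in these MLIR solvers:
// problem.IsFp8()/problem.IsBfp8() hold when the tensors are natively declared
// as miopenFloat8/miopenBFloat8, whereas problem.IsTensorsCasted() holds when
// wider tensors request on-the-fly down-conversion via a cast type set on the
// descriptor. Backends without FP8 kernels must reject both, hence the combined
// guard used here (paraphrased from the hunks in this patch):
//
//   if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8())
//       return false; // no FP8 kernels in this backend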
if(!IsComposableKernelSupportedHardware(ctx)) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; // Note: ConvMlirIgemmWrW can run on a machine with xdlops support, however, it is // guaranteed to be slower than its xdlops alternative, therefore disabling it to // save compilation overhead diff --git a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp index 34c99d39cd..2f3bc63f50 100644 --- a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp @@ -51,6 +51,8 @@ bool ConvMlirIgemmWrWXdlops::IsApplicable(const ConvolutionContext& ctx, return false; if(!problem.direction.IsBackwardWrW()) return false; + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; if(!IsComposableKernelSupportedHardware(ctx)) return false; diff --git a/src/solver/conv_multipass_wino3x3WrW.cpp b/src/solver/conv_multipass_wino3x3WrW.cpp index e41a434253..233489c4fc 100644 --- a/src/solver/conv_multipass_wino3x3WrW.cpp +++ b/src/solver/conv_multipass_wino3x3WrW.cpp @@ -438,6 +438,9 @@ bool ConvWinograd3x3MultipassWrW return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2D11x11.cpp b/src/solver/conv_ocl_dir2D11x11.cpp index ec56fe9f56..145cf9ed13 100644 --- a/src/solver/conv_ocl_dir2D11x11.cpp +++ b/src/solver/conv_ocl_dir2D11x11.cpp @@ -51,6 +51,9 @@ bool ConvOclDirectFwd11x11::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp index 562e98c366..eded6fddf8 100644 --- a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp +++ b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp @@ -67,6 +67,9 @@ bool ConvOclBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx, return false; } + if(problem.IsTensorsCasted()) + return false; + bool result = (problem.GetWeightsWidth_() == 1 && problem.GetWeightsHeight_() == 1 && problem.GetDilationW() == 1 && problem.GetDilationH() == 1 && problem.GetGroupCount() == 1); diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp index d24eb17320..d4e79250e1 100644 --- a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp +++ b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp @@ -468,6 +468,9 @@ bool ConvOclBwdWrW2::IsApplicableBase(const ConvolutionContext& c return false; } + if(problem.IsTensorsCasted()) + return false; + return problem.GetDilationW() == 1 && problem.GetDilationH() == 1 && #if 0 // There is a stronger restriction than this one, which make this one unnecessary. 
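/* The MLIR solvers reject three related situations in one guard: tensors tagged
 * with a cast type, and tensors natively stored as FP8 or BFP8, since the MLIR
 * kernel generator has no 8-bit float support. IsFp8()/IsBfp8() are plain
 * storage-type tests, distinct from the cast-type test; plausibly of the shape:
 *
 *     bool IsFp8() const
 *     {
 *         return GetInDataType() == miopenFloat8 ||
 *                GetWeightsDataType() == miopenFloat8 ||
 *                GetOutDataType() == miopenFloat8;
 *     }
 *     // IsBfp8() would be the same test against miopenBFloat8
 */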
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp index e2ae607157..2db6109bf5 100644 --- a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp +++ b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp @@ -55,6 +55,9 @@ bool ConvOclBwdWrW53::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.direction.IsBackwardWrW()) return false; if(!problem.IsLayoutDefault()) diff --git a/src/solver/conv_ocl_dir2Dfwd.cpp b/src/solver/conv_ocl_dir2Dfwd.cpp index 18086410da..70005e606e 100644 --- a/src/solver/conv_ocl_dir2Dfwd.cpp +++ b/src/solver/conv_ocl_dir2Dfwd.cpp @@ -52,6 +52,9 @@ bool ConvOclDirectFwd::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2Dfwd1x1.cpp b/src/solver/conv_ocl_dir2Dfwd1x1.cpp index 9c6f392821..71dc41fc65 100644 --- a/src/solver/conv_ocl_dir2Dfwd1x1.cpp +++ b/src/solver/conv_ocl_dir2Dfwd1x1.cpp @@ -61,6 +61,9 @@ bool ConvOclDirectFwd1x1::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_ocl_dir2Dfwdgen.cpp b/src/solver/conv_ocl_dir2Dfwdgen.cpp index 06399252ea..80fb0708e1 100644 --- a/src/solver/conv_ocl_dir2Dfwdgen.cpp +++ b/src/solver/conv_ocl_dir2Dfwdgen.cpp @@ -49,6 +49,9 @@ bool ConvOclDirectFwdGen::IsApplicable(const ConvolutionContext& ctx, return false; if(!(problem.IsFp32() || problem.IsFp16() || problem.IsBfp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!problem.IsLayoutDefault()) { return false; diff --git a/src/solver/conv_winoRxS.cpp b/src/solver/conv_winoRxS.cpp index 5bce82163c..cdff22ac1d 100644 --- a/src/solver/conv_winoRxS.cpp +++ b/src/solver/conv_winoRxS.cpp @@ -624,6 +624,9 @@ static bool IsApplicableBase(const ConvolutionContext& ctx, const ProblemDescrip return false; if(!(problem.IsFp32() || problem.IsFp16())) return false; + + if(problem.IsTensorsCasted()) + return false; if(!ctx.use_asm_kernels) return false; if(!ctx.rmv.IsV3()) diff --git a/src/solver/conv_winoRxS_fused.cpp b/src/solver/conv_winoRxS_fused.cpp index e242d5d0d8..377656cfb7 100644 --- a/src/solver/conv_winoRxS_fused.cpp +++ b/src/solver/conv_winoRxS_fused.cpp @@ -177,6 +177,9 @@ bool ConvBinWinogradRxSf2x3g1Fused::IsApplicable(const FusionContext& context, return false; // clang-format on + if(conv_problem.IsTensorsCasted()) + return false; + const auto group_count = conv_problem.GetGroupCount(); if(group_count != 1) return false; diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp index 2634eb3282..7ad238e7fa 100644 --- a/src/solver/gemm.cpp +++ b/src/solver/gemm.cpp @@ -94,6 +94,42 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx, yDesc.GetType() != miopenInt8x4) return false; } + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); + if(problem.IsTensorsCasted()) + { + if(!rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not supported with casted tensors on this GPU architecture"); + return false; + } + if(xDesc.GetCastType() && wDesc.GetCastType()) + { + const auto x_cast_type = xDesc.GetCastType(); + const 
auto w_cast_type = wDesc.GetCastType(); + if(x_cast_type != miopenFloat8 && x_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + if(w_cast_type != miopenFloat8 && w_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + } + else + { + MIOPEN_LOG_I("Both the input and weights tensors need to be casted"); + return false; + } + } + if(problem.IsFp8() && !rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not applicable for F8 on this GPU architecture"); + return false; + } return problem.GetDirection() == conv::Direction::Forward && problem.IsLayoutDefault() && !(IsAnyBufferBF16(xDesc, yDesc, wDesc) && !IsBf16Supported) && !(IsAnyBufferFp16(xDesc, yDesc, wDesc) && !IsFp16Supported); @@ -274,11 +310,20 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, decltype(auto) wDesc = problem.GetWeights(); decltype(auto) yDesc = problem.GetOut(); - const GemmDescriptor gemm_desc = [&]() { + const GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = conv.group_count > 1 ? CreateGemmDescriptorGroupConvCNHWFwd(wDesc, xDesc, yDesc, conv.group_count) : CreateGemmDescriptorConvCNHWFwd(wDesc, xDesc, yDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -377,7 +422,11 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, } miopenStatus_t gemm_status; - + auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { if(group_count > 1) @@ -390,8 +439,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, 0, workSpace, x_t_size, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -404,8 +452,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, wksp_offset, workSpace, x_t_size, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } } else @@ -421,8 +468,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, x_t_size, time_precision, group_count > 1 ? 
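/* A refactor visible here and repeated in every GEMM solver below: the gfx90a
 * FP16-alt flag used to be the trailing argument of the CallGemm* entry points;
 * it now travels inside GemmDescriptor. The descriptor is built once, outside
 * the invoker, and each invocation stamps the per-invoke flag into a cheap copy:
 *
 *     const auto gemm_desc = [&]() {
 *         auto tmp = tmp_gemm_desc;                         // precomputed part
 *         tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt;  // per-invoke knob
 *         return tmp;
 *     }();
 *     CallGemm(handle, gemm_desc, w, 0, x, 0, y, 0, GemmBackend_t::rocblas);
 *
 * This keeps the call sites short and attaches the cast types and convolution
 * attributes in exactly one place.
 */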
callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -524,6 +570,8 @@ bool GemmFwd1x1_0_1_int8::IsApplicable(const ExecutionContext& context, const auto spatial_dim = conv.GetSpatialDimension(); const auto wei_spatial = boost::adaptors::slice(wDesc.GetLengths(), 2, 2 + spatial_dim); + if(problem.IsTensorsCasted() || problem.IsFp8() || problem.IsBfp8()) + return false; return miopen::all_of(wei_spatial, [](auto v) { return v == 1; }) && miopen::all_of(conv.GetConvPads(), [](auto v) { return v == 0; }) && @@ -562,9 +610,18 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, solution.workspace_sz = workspace_req; TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); - const GemmDescriptor gemm_desc = [&]() { + const GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = CreateGemmDescriptorConvFwd(wDesc, xDesc, yDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *xDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *wDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); const auto x_type = xDesc.GetType(); @@ -601,7 +658,11 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, miopenStatus_t gemm_status = miopenStatusNotInitialized; float time = 0; const auto runs = conv_params.type == InvokeType::Run ? in_n : 1; - + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); for(std::size_t i = 0; i < runs; i++) { std::size_t out_offset = i * wei_k * out_spatial_size; @@ -623,8 +684,7 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, 0, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -638,8 +698,7 @@ ConvSolution GemmFwd1x1_0_1_int8::GetSolution(const ExecutionContext& context, out_offset, time_precision, callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -727,9 +786,18 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, if(group_count > 1) { - GemmDescriptor gemm_desc = [&]() { + const GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = CreateGemmDescriptorGroupConvFwd(wDesc, xDesc, yDesc, group_count); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -765,6 +833,11 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, : conv_params.type == InvokeType::Run ? 
in_n : 1; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); for(std::size_t i = 0; i < runs; i++) { std::size_t out_offset = i * wei_k * out_spatial_size; @@ -780,8 +853,7 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, in_offset, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -795,8 +867,7 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, out_offset, time_precision, callGemmStridedBatched, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -829,9 +900,20 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, else { // tensors.y = tensors.w * tensors.x - GemmDescriptor gemm_desc = - CreateGemmStridedBatchedDescriptorConv1x1Fwd(wDesc, xDesc, yDesc); - gemm_desc.deterministic = problem.GetConv().attribute.deterministic; + const GemmDescriptor tmp_gemm_desc = [&]() { + auto tmp = CreateGemmStridedBatchedDescriptorConv1x1Fwd(wDesc, xDesc, yDesc); + tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; + return tmp; + }(); const auto in_spatial = std::vector(in_spatial_.begin(), in_spatial_.end()); const auto out_spatial = std::vector(out_spatial_.begin(), out_spatial_.end()); @@ -854,18 +936,15 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, // tensors.y = tensors.w * tensors.x miopenStatus_t gemm_status; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { - gemm_status = CallGemmStridedBatched(handle, - gemm_desc, - w, - 0, - x, - 0, - y, - 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + gemm_status = CallGemmStridedBatched( + handle, gemm_desc, w, 0, x, 0, y, 0, GemmBackend_t::rocblas); } else { @@ -879,8 +958,7 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, 0, time_precision, callGemmStridedBatched, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -1035,11 +1113,20 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, solution.workspace_sz = workspace_req; solution.invoker_factory = [=](const std::vector&) { - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = conv.group_count > 1 ? 
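/* On the a_cast_type / b_cast_type assignments in these forward solvers: the
 * GEMM computes y = w * x, so operand A is the weights tensor and operand B the
 * input tensor, each inheriting the cast type of the matching descriptor. Since
 * IsApplicable() has already guaranteed that both descriptors are casted, the
 * intent reduces to (redundant guards elided for clarity):
 *
 *     if(problem.IsTensorsCasted())
 *     {
 *         tmp.a_cast_type = *wDesc.GetCastType(); // A = weights in y = w * x
 *         tmp.b_cast_type = *xDesc.GetCastType(); // B = input
 *     }
 */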
CreateGemmDescriptorGroupConvFwd(wDesc, xDesc, yDesc, conv.group_count) : CreateGemmDescriptorConvFwd(wDesc, xDesc, yDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(xDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -1124,6 +1211,11 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, miopenStatus_t gemm_status = miopenStatusNotInitialized; // tensors.y = tensors.w * Im2Col(tensors.x) + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type != InvokeType::Run) { gemm_status = CallGemmTimeMeasure(handle, @@ -1137,8 +1229,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, time_precision, conv.group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -1151,8 +1242,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, 0, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); else gemm_status = CallGemm(handle, gemm_desc, @@ -1162,8 +1252,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, wksp_offset, y, out_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) diff --git a/src/solver/gemm_bwd.cpp b/src/solver/gemm_bwd.cpp index 697583742c..df7d08304b 100644 --- a/src/solver/gemm_bwd.cpp +++ b/src/solver/gemm_bwd.cpp @@ -100,9 +100,45 @@ bool GemmBwdBase::IsApplicable(const ExecutionContext& ctx, #if MIOPEN_USE_GEMM if(conv::solver::gemm::IsWorkaroundIssue1315(ctx)) return false; - const auto& dyDesc = problem.GetIn(); - const auto& wDesc = problem.GetWeights(); - const auto& dxDesc = problem.GetOut(); + const auto& dyDesc = problem.GetIn(); + const auto& wDesc = problem.GetWeights(); + const auto& dxDesc = problem.GetOut(); + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); + if(problem.IsTensorsCasted()) + { + if(!rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not supported with casted tensors on this GPU architecture"); + return false; + } + if(dyDesc.GetCastType() && wDesc.GetCastType()) + { + const auto a_cast_type = dyDesc.GetCastType(); + const auto b_cast_type = wDesc.GetCastType(); + if(a_cast_type != miopenFloat8 && a_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + if(b_cast_type != miopenFloat8 && b_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + } + else + { + MIOPEN_LOG_I("Both the output and weights tensors need to be casted"); + return false; + } + } + if(problem.IsFp8() && !rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not applicable for F8 on this GPU architecture"); + return false; + } return problem.GetDirection() == conv::Direction::BackwardData && problem.IsLayoutDefault() && !(IsAnyBufferBF16(dxDesc, dyDesc, wDesc) && !IsBf16Supported) && !(IsAnyBufferFp16(dxDesc, dyDesc, wDesc) && !IsFp16Supported); @@ -242,12 +278,21 @@ ConvSolution 
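/* GemmFwdRest above is the catch-all im2col path. Per batch element the input
 * is lowered into workspace and multiplied by the flattened filter; roughly,
 * for group_count == 1:
 *
 *     // w          : [K, C*Y*X]       (filter, flattened)
 *     // Im2Col(x_i): [C*Y*X, Ho*Wo]   (lowered input patch matrix)
 *     // y_i        : [K, Ho*Wo] = w * Im2Col(x_i)
 *
 * The FP8 cast types ride along on the GEMM descriptor exactly as in the 1x1
 * special cases, so rocBLAS sees the same casting request on every path.
 */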
GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, const auto group_count = conv.group_count; - GemmDescriptor gemm_desc = [&]() { + GemmDescriptor tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? CreateGemmDescriptorGroupConvCNHWBwdData(wDesc, dyDesc, dxDesc, group_count) : CreateGemmDescriptorConvCNHWBwdData(wDesc, dyDesc, dxDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *dyDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); std::size_t in_n, in_c; @@ -322,6 +367,11 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, miopenStatus_t gemm_status; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { if(group_count > 1) @@ -333,8 +383,7 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, 0, workspace, dyDesc_.GetElementSize(), - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); else // tensors.dx = CNHW2NCHW(transpose(tensors.w) * NCHW2CNHW(tensors.dy)) gemm_status = CallGemm(handle, @@ -345,8 +394,7 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, 0, workspace, dyDesc_.GetElementSize(), - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -361,8 +409,7 @@ ConvSolution GemmBwd1x1_stride2::GetSolution(const ExecutionContext& context, dyDesc_.GetElementSize(), time_precision, group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -469,12 +516,21 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, const auto in_n = dxDesc.GetLengths()[0]; // dx = transpose(w) * dy - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? 
CreateGemmDescriptorGroupConvBwdData(wDesc, dyDesc, dxDesc, group_count) : CreateGemmStridedBatchedDescriptorConv1x1BwdData(wDesc, dyDesc, dxDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *dyDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -506,6 +562,11 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, miopenStatus_t gemm_status = miopenStatusUnknownError; + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { if(group_count > 1) @@ -525,8 +586,7 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, out_offset, dx, in_offset, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(handle.IsProfilingEnabled()) { @@ -538,16 +598,8 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, } else { - gemm_status = CallGemmStridedBatched(handle, - gemm_desc, - w, - 0, - dy, - 0, - dx, - 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + gemm_status = CallGemmStridedBatched( + handle, gemm_desc, w, 0, dy, 0, dx, 0, GemmBackend_t::rocblas); } } else @@ -562,8 +614,7 @@ ConvSolution GemmBwd1x1_stride1::GetSolution(const ExecutionContext&, 0, time_precision, callGemmStridedBatched, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(gemm_status != miopenStatusSuccess) @@ -665,11 +716,20 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, const auto out_spatial = std::vector(out_spatial_.begin(), out_spatial_.end()); // dx = transpose(w) * dy - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? 
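/* The backward-data solvers mirror the forward operand mapping with the weights
 * consumed transposed: dx = transpose(w) * dy, hence a_cast_type comes from the
 * weights descriptor and b_cast_type from the dy descriptor. Shape-wise, per
 * batch element:
 *
 *     // w^T  : [C*Y*X, K]
 *     // dy_i : [K, Ho*Wo]
 *     // dx_i : [C*Y*X, Ho*Wo] = w^T * dy_i
 *     // (Col2Im scatters the result back on the non-1x1 path)
 */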
CreateGemmDescriptorGroupConvBwdData(wDesc, dyDesc, dxDesc, group_count) : CreateGemmDescriptorConvBwdData(wDesc, dyDesc, dxDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *wDesc.GetCastType(); + if(wDesc.GetCastType()) + tmp.b_cast_type = *dyDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); const auto spatial_dims = conv.GetSpatialDimension(); @@ -718,6 +778,11 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, std::to_string(workspace_size) + " < " + std::to_string(workspace_req) + ")"); + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { float time_gemm = 0; @@ -739,8 +804,7 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, out_offset, workspace, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); else gemm_status = CallGemm(handle, gemm_desc, @@ -750,8 +814,7 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, out_offset, workspace, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) MIOPEN_THROW("GemmBwdRest execution failure."); @@ -795,8 +858,7 @@ ConvSolution GemmBwdRest::GetSolution(const ExecutionContext& context, 0, time_precision, group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(gemm_status != miopenStatusSuccess) MIOPEN_THROW("GemmBwdRest execution failure."); diff --git a/src/solver/gemm_wrw.cpp b/src/solver/gemm_wrw.cpp index 26a5f582c0..1bda06eca6 100644 --- a/src/solver/gemm_wrw.cpp +++ b/src/solver/gemm_wrw.cpp @@ -67,9 +67,45 @@ bool GemmWrwBase::IsApplicable(const ExecutionContext& ctx, #if MIOPEN_USE_GEMM if(conv::solver::gemm::IsWorkaroundIssue1315(ctx)) return false; - const auto& dyDesc = problem.GetIn(); - const auto& dwDesc = problem.GetWeights(); - const auto& xDesc = problem.GetOut(); + const auto& dyDesc = problem.GetIn(); + const auto& dwDesc = problem.GetWeights(); + const auto& xDesc = problem.GetOut(); + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); + if(problem.IsTensorsCasted()) + { + if(!rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not supported with casted tensors on this GPU architecture"); + return false; + } + if(xDesc.GetCastType() && dyDesc.GetCastType()) + { + const auto a_cast_type = xDesc.GetCastType(); + const auto b_cast_type = dyDesc.GetCastType(); + if(a_cast_type != miopenFloat8 && a_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + if(b_cast_type != miopenFloat8 && b_cast_type != miopenBFloat8) + { + MIOPEN_LOG_W( + "Casting is only supported for the miopenFloat8 and miopenBFloat8 data types"); + return false; + } + } + else + { + MIOPEN_LOG_I("Both the input and output tensors need to be casted"); + return false; + } + } + if(problem.IsFp8() && !rblas_fp8_supported) + { + MIOPEN_LOG_I2("GEMM not applicable for F8 on this GPU architecture"); + return false; + } return problem.GetDirection() == conv::Direction::BackwardWeights && problem.IsLayoutDefault() && !(IsAnyBufferBF16(xDesc, dyDesc, dwDesc) &&
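/* For weight gradients the roles flip once more: as the comments in this file
 * put it, dw = sum_over_batch(dy[i] * transpose(x[i])), so operand A is dy and
 * operand B is x, and the cast types are taken from dyDesc and xDesc below.
 * Shape-wise:
 *
 *     // dy_i  : [K, Ho*Wo]
 *     // x_i^T : [Ho*Wo, C*Y*X]   (transpose(Im2Col(x_i)) on the general path)
 *     // dw   += dy_i * x_i^T -> [K, C*Y*X]
 */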
!IsBF16PathValid) && @@ -171,11 +207,20 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, } // dw = sum_over_batch(dy[i] * transpose(x[i])), i is batch id - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? CreateGemmDescriptorGroupConvBwdWeight(dyDesc, xDesc, dwDesc, group_count) : CreateGemmStridedBatchedDescriptorConv1x1BwdWeight(dyDesc, xDesc, dwDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *dyDesc.GetCastType(); + if(xDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -216,6 +261,11 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, MIOPEN_LOG_FUNCTION("conv, 1x1"); } + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type != InvokeType::Run) { const auto status = CallGemmTimeMeasure( @@ -229,8 +279,7 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, 0, time_precision, group_count > 1 ? callGemmStridedBatched : callGemmStridedBatchedSequential, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); @@ -266,8 +315,7 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, in_offset, dw, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); @@ -285,16 +333,8 @@ ConvSolution GemmWrw1x1_stride1::GetSolution(const ExecutionContext&, else { // dw = sum_over_batch(dy[i] * transpose(x[i])), i is batch id - const auto status = CallGemmStridedBatchedSequential(handle, - gemm_desc, - dy, - 0, - x, - 0, - dw, - 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + const auto status = CallGemmStridedBatchedSequential( + handle, gemm_desc, dy, 0, x, 0, dw, 0, GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); @@ -373,11 +413,20 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, const auto group_count = conv.group_count; // dw = dy * transpose(Im2Col(x)) - const auto gemm_desc = [&]() { + const auto tmp_gemm_desc = [&]() { auto tmp = group_count > 1 ? 
CreateGemmDescriptorGroupConvBwdWeight(dyDesc, xDesc, dwDesc, group_count) : CreateGemmDescriptorConvBwdWeight(dyDesc, xDesc, dwDesc); tmp.deterministic = problem.GetConv().attribute.deterministic; + if(problem.IsTensorsCasted()) + { + // IsApplicable ensures that both are casted + if(dyDesc.GetCastType()) + tmp.a_cast_type = *dyDesc.GetCastType(); + if(xDesc.GetCastType()) + tmp.b_cast_type = *xDesc.GetCastType(); + } + tmp.conv_attributes = problem.GetConv().attribute; return tmp; }(); @@ -439,6 +488,11 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, std::to_string(workspace_req) + ")"); } + const auto gemm_desc = [&]() { + auto tmp = tmp_gemm_desc; + tmp.gfx90a_alt_impl = conv_params.gfx90aFp16alt; + return tmp; + }(); if(conv_params.type == InvokeType::Run) { // Zeroing out the output buffer @@ -478,8 +532,7 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, 0, dw, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } else { @@ -492,8 +545,7 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, 0, dw, 0, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); } if(status != miopenStatusSuccess) @@ -539,8 +591,7 @@ ConvSolution GemmWrwUniversal::GetSolution(const ExecutionContext& context, 0, time_precision, group_count > 1 ? callGemmStridedBatched : callGemm, - GemmBackend_t::rocblas, - conv_params.gfx90aFp16alt); + GemmBackend_t::rocblas); if(status != miopenStatusSuccess) MIOPEN_THROW("GemmWrw1x1_stride1 execution failure."); diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp index c9250fe26f..eaaa0e42c3 100644 --- a/src/solver/mlir_common.cpp +++ b/src/solver/mlir_common.cpp @@ -58,6 +58,8 @@ static const char* DTypeName(miopenDataType_t ty) case miopenInt32: return "i32"; case miopenInt8: return "i8"; case miopenInt8x4: return "i8x4"; + case miopenFloat8: return "fp8"; + case miopenBFloat8: return "bfp8"; } MIOPEN_THROW(miopenStatusInternalError, "Value outside of datatype enum"); } diff --git a/src/tensor.cpp b/src/tensor.cpp index 877e319c33..ca4f1afc7a 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -47,6 +47,8 @@ bool IsDataTypeSupported(miopenDataType_t t) case miopenHalf: case miopenFloat: case miopenInt32: + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenInt8x4: case miopenBFloat16: @@ -338,6 +340,13 @@ std::size_t TensorDescriptor::GetElementSize() const miopenDataType_t TensorDescriptor::GetType() const { return this->type; } +std::optional TensorDescriptor::GetCastType() const { return this->cast_type; } + +void TensorDescriptor::SetCastType(const miopenDataType_t cast_type_) +{ + this->cast_type = cast_type_; +} + miopenTensorLayout_t TensorDescriptor::GetLayout_t() const { return this->tensorLayout; } std::string TensorDescriptor::GetLayout_str() const @@ -456,6 +465,18 @@ std::ostream& operator<<(std::ostream& stream, const TensorDescriptor& t) stream << "packed" << ", "; + if(t.cast_type) + { + stream << "cast_type: "; + const auto ct = *t.cast_type; + if(ct == miopenFloat8) + stream << "miopenFloat8"; + else if(ct == miopenBFloat8) + stream << "miopenBFloat8"; + else + stream << "Other"; + } + return stream; } diff --git a/src/tensor_api.cpp b/src/tensor_api.cpp index 76742a1084..307395f59d 100644 --- a/src/tensor_api.cpp +++ b/src/tensor_api.cpp @@ -199,6 +199,37 @@ extern "C" miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t ten }); } +extern "C" miopenStatus_t 
miopenSetTensorCastType(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t cast_type) +{ + if(miopen::IsLoggingFunctionCalls()) + { + MIOPEN_LOG_FUNCTION(tensorDesc, cast_type); + } + + return miopen::try_([&] { miopen::deref(tensorDesc).SetCastType(cast_type); }); +} + +extern "C" miopenStatus_t miopenGetTensorCastType(miopenTensorDescriptor_t tensorDesc, + miopenDataType_t& cast_type) +{ + if(miopen::IsLoggingFunctionCalls()) + { + MIOPEN_LOG_FUNCTION(tensorDesc); + } + return miopen::try_([&] { + const auto c_type = miopen::deref(tensorDesc).GetCastType(); + if(c_type) + { + cast_type = *c_type; + } + else + { + cast_type = miopen::deref(tensorDesc).GetType(); + } + }); +} + extern "C" miopenStatus_t miopenGetTensorNumBytes(miopenTensorDescriptor_t tensorDesc, size_t* numBytes) { diff --git a/test/conv_common.hpp b/test/conv_common.hpp index db3da76e3b..3d510bb21b 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -380,7 +380,7 @@ tensor ref_conv_fwd(const tensor& input, auto rout = out; if(filter.mode == miopenTranspose) { - std::fill(rout.begin(), rout.end(), 0); + std::fill(rout.begin(), rout.end(), static_cast(0)); bool gpu_ref_used = gpu_ref_convolution_bwd(rout, weights, input, filter); if(!gpu_ref_used) { @@ -440,27 +440,92 @@ tensor ref_conv_wrw(const tensor& input, } template -tensor ref_conv_bwd(const tensor& input, +tensor ref_conv_bwd(const tensor& input, const tensor& weights, - const tensor& out, + const tensor& out, const miopen::ConvolutionDescriptor& filter) { - auto rin = input; - std::fill(rin.begin(), rin.end(), 0); - bool gpu_ref_used = gpu_ref_convolution_bwd(rin, weights, out, filter); - if(!gpu_ref_used) + auto rinput = input; + + std::fill(rinput.begin(), rinput.end(), 0); + + if(filter.mode == miopenTranspose) { - MIOPEN_LOG_W("GPU reference skipped"); - cpu_convolution_backward_data(filter.GetSpatialDimension(), - rin, - weights, - out, - filter.GetConvPads(), - filter.GetConvStrides(), - filter.GetConvDilations(), - filter.GetGroupCount()); + bool gpu_ref_used = gpu_ref_convolution_fwd(out, weights, rinput, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + cpu_convolution_forward(filter.GetSpatialDimension(), + out, + weights, + rinput, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } + } + else + { + bool gpu_ref_used = gpu_ref_convolution_bwd(rinput, weights, out, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + cpu_convolution_backward_data(filter.GetSpatialDimension(), + rinput, + weights, + out, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } + } + return rinput; +} + +template +tensor ref_conv_wrw(const tensor& input, + const tensor& weights, + const tensor& out, + const miopen::ConvolutionDescriptor& filter) +{ + auto rweights = weights; + std::fill(rweights.begin(), rweights.end(), 0); + + if(filter.mode == miopenTranspose) + { + bool gpu_ref_used = gpu_ref_convolution_wrw(out, rweights, input, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + cpu_convolution_backward_weight(filter.GetSpatialDimension(), + out, + rweights, + input, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } + } + else + { + bool gpu_ref_used = gpu_ref_convolution_wrw(input, rweights, out, filter); + if(!gpu_ref_used) + { + MIOPEN_LOG_W("GPU reference not run"); + 
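/* Client-side view of the cast-type entry points added in tensor_api.cpp above:
 * a tensor keeps its storage type but is tagged to be consumed as FP8, and the
 * getter falls back to the storage type when no cast was ever set. A minimal
 * sketch against the signatures declared above (error handling omitted):
 *
 *     miopenTensorDescriptor_t xDesc;
 *     miopenCreateTensorDescriptor(&xDesc);
 *     miopenSet4dTensorDescriptor(xDesc, miopenFloat, 16, 64, 28, 28);
 *     miopenSetTensorCastType(xDesc, miopenFloat8); // read x as FP8 in GEMM
 *     miopenDataType_t ct;
 *     miopenGetTensorCastType(xDesc, ct);           // ct == miopenFloat8
 */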
cpu_convolution_backward_weight(filter.GetSpatialDimension(), + input, + rweights, + out, + filter.GetConvPads(), + filter.GetConvStrides(), + filter.GetConvDilations(), + filter.GetGroupCount()); + } } - return rin; + return rweights; } // Mainline convolution tests diff --git a/test/cpu_conv.hpp b/test/cpu_conv.hpp index 78c1176503..560c867c6b 100644 --- a/test/cpu_conv.hpp +++ b/test/cpu_conv.hpp @@ -39,6 +39,7 @@ #include "tensor_holder.hpp" #include #include +#include template static constexpr auto make_array(T x, Ts... xs) @@ -46,6 +47,12 @@ static constexpr auto make_array(T x, Ts... xs) return std::array{{x, xs...}}; } +template +struct PassThru +{ + T operator()(T t) { return t; } +}; + template struct cpu_convolution_acc_type { @@ -66,6 +73,8 @@ struct cpu_convolution_acc_type template & in, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi = {}, + FW fw = {}) { static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); assert(in.desc.GetSize() == ConvDim + 2 and wei.desc.GetSize() == ConvDim + 2 and @@ -162,22 +173,26 @@ void cpu_convolution_forward_impl(const tensor& in, in_id[0] = out_n_id; in_id[1] = in_c_id; std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); - acc += - Tacc(in(in_id)) * Tacc(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + Tacc tmp1 = static_cast(fi(in(in_id))); + Tacc tmp2 = + static_cast(fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...))); + acc += tmp1 * tmp2; } } }); }); if(vector_len > 1) out(out_k_id % vector_len, out_n_id, out_k_id / vector_len, out_spatial_id_pack...) = - acc; + static_cast(acc); else - out(out_n_id, out_k_id, out_spatial_id_pack...) = acc; + out(out_n_id, out_k_id, out_spatial_id_pack...) = static_cast(acc); }); } template & in, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FW fw = {}, + FO fo = {}) { static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); assert(in.desc.GetSize() == ConvDim + 2 and wei.desc.GetSize() == ConvDim + 2 and @@ -255,19 +272,21 @@ void cpu_convolution_backward_data_impl(tensor& in, out_id[0] = in_n_id; out_id[1] = out_k_id; std::copy_n(out_spatial_id.begin(), ConvDim, out_id.begin() + 2); - - acc += Tacc(out(out_id)) * - Tacc(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + Tacc tmp1 = fo(out(out_id)); + Tacc tmp2 = fw(wei(out_k_id, wei_c_id, wei_spatial_id_pack...)); + acc += tmp1 * tmp2; } }); }); - - in(in_n_id, in_c_id, in_spatial_id_pack...) = acc; + // TODO: Why do we need a no-lint here ? + in(in_n_id, in_c_id, in_spatial_id_pack...) = static_cast(acc); // NOLINT }); } template & in, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi, + FO fo) { static_assert(ConvDim > 0, "wrong! convolution dim should be larger than 0"); assert(in.desc.GetSize() == ConvDim + 2 and wei.desc.GetSize() == ConvDim + 2 and @@ -303,54 +324,60 @@ void cpu_convolution_backward_weight_impl(const tensor& in, auto par_ford_wei_kc_spatial = miopen::unpacker(miopen::prepender(par_ford, wei_k_len, wei_c_len))(wei_spatial_len); - par_ford_wei_kc_spatial([&](std::size_t wei_k_id, - std::size_t wei_c_id, - auto... wei_spatial_id_pack) { - auto wei_spatial_id = make_array(wei_spatial_id_pack...); + par_ford_wei_kc_spatial( + [&](std::size_t wei_k_id, std::size_t wei_c_id, auto... 
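/* The new functor hooks (fi/fw/fo, defaulting to the identity PassThru) let the
 * CPU reference emulate FP8 quantization without duplicating the convolution
 * loops: every element passes through the hook before it reaches the
 * accumulator. A hypothetical round-trip-cast hook (the exact spelling of the
 * call is an assumption):
 *
 *     struct CastToF8
 *     {
 *         float operator()(float x)
 *         {
 *             return static_cast<float>(static_cast<float8>(x)); // quantize
 *         }
 *     };
 *     cpu_convolution_forward(2, in, wei, out, pads, strides, dilations,
 *                             1, CastToF8{}, CastToF8{}); // 1 == group_count
 */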
wei_spatial_id_pack) { + auto wei_spatial_id = make_array(wei_spatial_id_pack...); - std::size_t group_id = wei_k_id / wei_k_len_per_group; - std::size_t in_c_id = group_id * wei_c_len + wei_c_id; + std::size_t group_id = wei_k_id / wei_k_len_per_group; + std::size_t in_c_id = group_id * wei_c_len + wei_c_id; - Tacc acc = 0; + Tacc acc = 0; - ford(out_n_len)([&](std::size_t out_n_id) { - auto ford_out_spatial = miopen::unpacker(ford)(out_spatial_len); + ford(out_n_len)([&](std::size_t out_n_id) { + auto ford_out_spatial = miopen::unpacker(ford)(out_spatial_len); - ford_out_spatial([&](auto... out_spatial_id_pack) { - auto out_spatial_id = make_array(out_spatial_id_pack...); + ford_out_spatial([&](auto... out_spatial_id_pack) { + auto out_spatial_id = make_array(out_spatial_id_pack...); - std::array in_spatial_id{}; + std::array in_spatial_id{}; - for(std::size_t i = 0; i < ConvDim; ++i) - { - in_spatial_id[i] = - out_spatial_id[i] * strides[i] + wei_spatial_id[i] * dilations[i] - pads[i]; - } + for(std::size_t i = 0; i < ConvDim; ++i) + { + in_spatial_id[i] = out_spatial_id[i] * strides[i] + + wei_spatial_id[i] * dilations[i] - pads[i]; + } - bool out_of_bound = false; - for(std::size_t i = 0; i < ConvDim; ++i) - { - out_of_bound = out_of_bound or - (in_spatial_id[i] < 0 or in_spatial_id[i] >= in_spatial_len[i]); - } + bool out_of_bound = false; + for(std::size_t i = 0; i < ConvDim; ++i) + { + out_of_bound = out_of_bound or (in_spatial_id[i] < 0 or + in_spatial_id[i] >= in_spatial_len[i]); + } - if(!out_of_bound) - { - std::array in_id{}; - in_id[0] = out_n_id; - in_id[1] = in_c_id; - std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + if(!out_of_bound) + { + std::array in_id{}; + in_id[0] = out_n_id; + in_id[1] = in_c_id; + std::copy_n(in_spatial_id.begin(), ConvDim, in_id.begin() + 2); + Tacc tmp1 = fi(in(in_id)); + Tacc tmp2 = fo(out(out_n_id, wei_k_id, out_spatial_id_pack...)); + acc += tmp1 * tmp2; + } + }); - acc += Tacc(in(in_id)) * Tacc(out(out_n_id, wei_k_id, out_spatial_id_pack...)); - } + wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) = static_cast(acc); }); - - wei(wei_k_id, wei_c_id, wei_spatial_id_pack...) 
= acc; }); - }); } -template +template , + typename FW = PassThru> void cpu_convolution_forward(std::size_t spatial_dim, const tensor& in, const tensor& wei, @@ -358,30 +385,30 @@ void cpu_convolution_forward(std::size_t spatial_dim, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi = {}, + FW fw = {}) { - using acc_type = typename cpu_convolution_acc_type::type; - switch(spatial_dim) { case 1: { - cpu_convolution_forward_impl<1, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } case 2: { - cpu_convolution_forward_impl<2, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } case 3: { - cpu_convolution_forward_impl<3, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } case 4: { - cpu_convolution_forward_impl<4, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_forward_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fw); break; } default: { @@ -390,7 +417,13 @@ void cpu_convolution_forward(std::size_t spatial_dim, } } -template +template , + typename FO = PassThru> void cpu_convolution_backward_data(std::size_t spatial_dim, tensor& in, const tensor& wei, @@ -398,30 +431,30 @@ void cpu_convolution_backward_data(std::size_t spatial_dim, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FW fw = {}, + FO fo = {}) { - using acc_type = typename cpu_convolution_acc_type::type; - switch(spatial_dim) { case 1: { - cpu_convolution_backward_data_impl<1, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } case 2: { - cpu_convolution_backward_data_impl<2, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } case 3: { - cpu_convolution_backward_data_impl<3, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } case 4: { - cpu_convolution_backward_data_impl<4, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_data_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fw, fo); break; } default: { @@ -430,7 +463,13 @@ void cpu_convolution_backward_data(std::size_t spatial_dim, } } -template +template , + typename FO = PassThru> void cpu_convolution_backward_weight(std::size_t spatial_dim, const tensor& in, tensor& wei, @@ -438,30 +477,30 @@ void cpu_convolution_backward_weight(std::size_t spatial_dim, const Range& pads, const Range& strides, const Range& dilations, - std::size_t group_count) + std::size_t group_count, + FI fi = {}, + FO fo = {}) { - using acc_type = typename cpu_convolution_acc_type::type; - switch(spatial_dim) { case 1: { - cpu_convolution_backward_weight_impl<1, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + 
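/* Also note the accumulator change threaded through these dispatchers: the
 * per-dimension impls used to derive acc_type from cpu_convolution_acc_type<T>
 * internally, whereas Tacc is now an explicit template parameter, so an FP8
 * test can keep 8-bit storage while accumulating in a wider type, e.g. (the
 * template-argument order is an assumption):
 *
 *     cpu_convolution_forward<float8, float8, float8, float>(
 *         2, in, wei, out, pads, strides, dilations, 1);
 */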
cpu_convolution_backward_weight_impl<1, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } case 2: { - cpu_convolution_backward_weight_impl<2, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_weight_impl<2, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } case 3: { - cpu_convolution_backward_weight_impl<3, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_weight_impl<3, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } case 4: { - cpu_convolution_backward_weight_impl<4, acc_type>( - in, wei, out, pads, strides, dilations, group_count); + cpu_convolution_backward_weight_impl<4, Tacc>( + in, wei, out, pads, strides, dilations, group_count, fi, fo); break; } default: { diff --git a/test/driver.hpp b/test/driver.hpp index 8ff0df639d..0a8e2d3080 100644 --- a/test/driver.hpp +++ b/test/driver.hpp @@ -279,6 +279,8 @@ struct test_driver case miopenInt32: ss << "--int32 "; break; case miopenFloat: ss << "--float "; break; case miopenDouble: ss << "--double "; break; + case miopenFloat8: ss << "--float8"; break; + case miopenBFloat8: ss << "--bfloat8"; break; } for(auto&& arg : this->arguments) { @@ -306,6 +308,8 @@ struct test_driver case miopenInt32: ret.emplace_back("--int32"); break; case miopenFloat: ret.emplace_back("--float"); break; case miopenDouble: ret.emplace_back("--double"); break; + case miopenFloat8: ret.emplace_back("--float8"); break; + case miopenBFloat8: ret.emplace_back("--bfloat8"); break; } for(auto&& arg : this->arguments) @@ -876,6 +880,21 @@ struct test_driver } } + template + auto verify_eps(V&& v, Ts&&... xs) -> decltype(std::make_pair(v.cpu(xs...), v.gpu(xs...))) + { + return verify_impl( + [&](std::vector& error, auto&& cpu, auto&& gpu) { + CHECK(miopen::range_distance(cpu) == miopen::range_distance(gpu)); + + double threshold = v.epsilon() * tolerance; + error = {miopen::rms_range(cpu, gpu)}; + return error.front() <= threshold; + }, + v, + xs...); + } + template auto verify(V&& v, Ts&&... 
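/* verify_eps, added above, compares CPU and GPU results by RMS error against a
 * verifier-supplied threshold instead of the type-derived tolerance that the
 * plain verify() path uses; for FP8's coarse value grid this is the practical
 * criterion. The check reduces to:
 *
 *     error = {miopen::rms_range(cpu, gpu)};
 *     return error.front() <= v.epsilon() * tolerance;
 */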
xs) -> decltype(std::make_pair(v.cpu(xs...), v.gpu(xs...))) { diff --git a/test/gtest/api_convbiasactiv.cpp b/test/gtest/api_convbiasactiv.cpp index f065730eda..d59d3ae03d 100644 --- a/test/gtest/api_convbiasactiv.cpp +++ b/test/gtest/api_convbiasactiv.cpp @@ -25,6 +25,8 @@ *******************************************************************************/ #include +#define WORKAROUND_ISSUE_2212 1 + #if MIOPEN_BACKEND_HIP #include #include @@ -174,7 +176,10 @@ TEST_P(ConvBiasActivFwdTest, DriveAPI) void GatherCBATestCases(std::vector& cba_test_cases) { - if(!miopen::StartsWith(get_handle().GetDeviceName(), "gfx11")) + const auto dev_name = get_handle().GetDeviceName(); +#if WORKAROUND_ISSUE_2212 + if(!miopen::StartsWith(dev_name, "gfx11") && !miopen::StartsWith(dev_name, "gfx94")) +#endif { cba_test_cases.push_back(CBATestCase{ 16, 128, 16, 16, 128, 3, 3, 0, 0, 1, 1, 1, 1, miopenActivationRELU, miopenConvolution}); diff --git a/test/gtest/cba.hpp b/test/gtest/cba.hpp index 4529f714a4..70091e107c 100644 --- a/test/gtest/cba.hpp +++ b/test/gtest/cba.hpp @@ -38,6 +38,7 @@ #include "conv_common.hpp" #include "conv_test_base.hpp" +#include "conv_tensor_gen.hpp" template struct ConvBiasActivInferTest diff --git a/test/gtest/conv_embed_db.cpp b/test/gtest/conv_embed_db.cpp index ba9889dfd7..b69fde1b5e 100644 --- a/test/gtest/conv_embed_db.cpp +++ b/test/gtest/conv_embed_db.cpp @@ -75,9 +75,11 @@ void Run2dDriver(miopenDataType_t prec) case miopenBFloat16: params = ConfigWithBFloat16::GetParam(); break; case miopenInt8x4: case miopenInt32: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: - FAIL() << "miopenInt8x4, miopenInt32, miopenDouble data type not supported by " - "conv_embed_db test"; + FAIL() << "miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type " + "not supported by conv_embed_db test"; default: params = ConfigWithFloat::GetParam(); } diff --git a/test/gtest/conv_hip_igemm_xdlops.cpp b/test/gtest/conv_hip_igemm_xdlops.cpp index 5fdb842fe0..508624c847 100644 --- a/test/gtest/conv_hip_igemm_xdlops.cpp +++ b/test/gtest/conv_hip_igemm_xdlops.cpp @@ -60,6 +60,8 @@ void Run2dDriver(miopenDataType_t prec) switch(prec) { case miopenInt8: params = ConfigWithInt8::GetParam(); break; + case miopenFloat8: + case miopenBFloat8: case miopenHalf: case miopenBFloat16: case miopenFloat: diff --git a/test/gtest/conv_igemm_mlir.cpp b/test/gtest/conv_igemm_mlir.cpp index cf3d93f07a..d5fc0d426c 100644 --- a/test/gtest/conv_igemm_mlir.cpp +++ b/test/gtest/conv_igemm_mlir.cpp @@ -83,11 +83,12 @@ void Run2dDriver(miopenDataType_t prec) case miopenBFloat16: case miopenInt8x4: case miopenInt32: + case miopenFloat8: + case miopenBFloat8: case miopenDouble: MIOPEN_THROW(miopenStatusBadParm, - "miopenBFloat16, miopenInt8x4, miopenInt32, miopenDouble data " - "type not supported by " - "conv_igemm_mlir test"); + "miopenBFloat16, miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, " + "miopenDouble data type not supported by conv_igemm_mlir test"); default: params = ConfigWithFloat::GetParam(); } diff --git a/test/gtest/conv_igemm_mlir_xdlops.cpp b/test/gtest/conv_igemm_mlir_xdlops.cpp index e490b15219..19913093c0 100644 --- a/test/gtest/conv_igemm_mlir_xdlops.cpp +++ b/test/gtest/conv_igemm_mlir_xdlops.cpp @@ -56,6 +56,8 @@ void Run2dDriver(miopenDataType_t prec) case miopenInt8x4: case miopenInt32: case miopenDouble: + case miopenFloat8: + case miopenBFloat8: MIOPEN_THROW(miopenStatusBadParm, "miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, miopenDouble data " "type 
not supported by " diff --git a/test/gtest/conv_tensor_gen.hpp b/test/gtest/conv_tensor_gen.hpp new file mode 100644 index 0000000000..f578e91973 --- /dev/null +++ b/test/gtest/conv_tensor_gen.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include + +// Copied from conv_driver.hpp + +template +inline T FRAND() +{ + double d = static_cast(rand() / (static_cast(RAND_MAX))); + return static_cast(d); +} + +template +inline T RAN_GEN(T A, T B) +{ + T r = (FRAND() * (B - A)) + A; + return r; +} +template +T RanGenData() +{ + return RAN_GEN(static_cast(0.0f), static_cast(1.0f)); +} + +template <> +float8 RanGenData() +{ + return RAN_GEN(static_cast(-1.0f), static_cast(1.0f)); +} + +template <> +bfloat8 RanGenData() +{ + const auto tmp = RAN_GEN(static_cast(-1.0f), static_cast(1.0f)); + return static_cast(tmp); +} + +template +struct GenData +{ + template + T operator()(Ts...) const + { + return RanGenData(); + } +}; + +template +T RanGenWeights() +{ + return RAN_GEN(static_cast(-0.5), static_cast(0.5)); +} + +// Shift FP16 distribution towards positive numbers, +// otherwise Winograd FP16 validation fails. +template <> +half_float::half RanGenWeights() +{ + return RAN_GEN(static_cast(-1.0 / 3.0), + static_cast(0.5)); +} + +template <> +float8 RanGenWeights() +{ + const auto tmp = + RAN_GEN(0.0, 1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (RAN_GEN(0.0, 1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + +template <> +bfloat8 RanGenWeights() +{ + const auto tmp = + RAN_GEN(0.0, 1.0) > 0.5 ? static_cast(0.0) : static_cast(1.0); + // 1 in 2 chance of number being positive + const float sign = + (RAN_GEN(0.0, 1.0) > 0.5) ? static_cast(-1) : static_cast(1); + const auto tmp2 = static_cast(std::numeric_limits::epsilon()) * + static_cast(2) * sign * static_cast(tmp); + return static_cast(tmp2); +} + +template +struct GenWeights +{ + template + T operator()(Ts...) 
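/* The FP8/BFP8 generators above are deliberately conservative: weights are
 * drawn from the tiny exact set { 0, +2*eps, -2*eps } so that long
 * accumulations stay representable in a 2-3 bit mantissa, and data is drawn
 * from [-1, 1]. Restated compactly with explicit template syntax (a sketch,
 * assuming std::numeric_limits is specialized for float8):
 *
 *     template <>
 *     float8 RanGenWeights()
 *     {
 *         const float mag  = RAN_GEN<float>(0.0f, 1.0f) > 0.5f ? 0.0f : 1.0f;
 *         const float sign = RAN_GEN<float>(0.0f, 1.0f) > 0.5f ? -1.0f : 1.0f;
 *         return static_cast<float8>(
 *             static_cast<float>(std::numeric_limits<float8>::epsilon()) *
 *             2.0f * sign * mag);
 *     }
 */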
const + { + return RanGenWeights(); + } +}; diff --git a/test/gtest/conv_test_base.hpp b/test/gtest/conv_test_base.hpp index a8413797b0..2f91d784bf 100644 --- a/test/gtest/conv_test_base.hpp +++ b/test/gtest/conv_test_base.hpp @@ -29,6 +29,7 @@ #include #include "conv_common.hpp" +#include "conv_tensor_gen.hpp" template miopenDataType_t GetDataType(); @@ -45,6 +46,18 @@ miopenDataType_t GetDataType() return miopenHalf; } +template <> +miopenDataType_t GetDataType>() +{ + return miopenFloat8; +} + +template <> +miopenDataType_t GetDataType>() +{ + return miopenBFloat8; +} + struct ConvTestCase { size_t N; @@ -67,7 +80,7 @@ struct ConvTestCase << " k: " << tc.k << " y:" << tc.y << " x:" << tc.x << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " )"; + << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode << " )"; } const std::vector GetInput() { return {N, C, H, W}; } const std::vector GetWeights() { return {k, C, y, x}; } @@ -123,7 +136,7 @@ std::vector ConvTestConfigs() {64, 1024, 14, 14, 1024, 3, 3, 1, 1, 1, 1, 1, 1, miopenConvolution}}; } -template +template struct ConvFwdSolverTestBase { protected: @@ -131,8 +144,8 @@ struct ConvFwdSolverTestBase { input = tensor{miopen_type{}, tensor_layout, conv_config.GetInput()}; weights = tensor{miopen_type{}, tensor_layout, conv_config.GetWeights()}; - input.generate(tensor_elem_gen_integer{3}); - weights.generate(tensor_elem_gen_integer{3}); + input.generate(GenData{}); + weights.generate(GenWeights{}); conv_desc = conv_config.GetConv(); @@ -140,7 +153,7 @@ struct ConvFwdSolverTestBase conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType()); output = tensor{miopen_type{}, tensor_layout, output_desc.GetLengths()}; - std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); auto&& handle = get_handle(); in_dev = handle.Write(input.data); @@ -152,7 +165,22 @@ struct ConvFwdSolverTestBase { miopen::TensorDescriptor output_desc = conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType()); - ref_out = ref_conv_fwd(input, weights, output, conv_desc); + ref_out = tensor{miopen_type{}, output.desc.GetLayout_t(), output_desc.GetLengths()}; + if(use_cpu_ref) + { + cpu_convolution_forward(conv_desc.GetSpatialDimension(), + input, + weights, + ref_out, + conv_desc.GetConvPads(), + conv_desc.GetConvStrides(), + conv_desc.GetConvDilations(), + conv_desc.GetGroupCount()); + } + else + { + ref_out = ref_conv_fwd(input, weights, ref_out, conv_desc); + } } void ThresholdChecks() diff --git a/test/gtest/conv_trans.cpp b/test/gtest/conv_trans.cpp index 4537ce8cbb..20015336a0 100644 --- a/test/gtest/conv_trans.cpp +++ b/test/gtest/conv_trans.cpp @@ -51,6 +51,8 @@ void Run2dDriver(miopenDataType_t prec) { case miopenFloat: params = ConfigWithFloat::GetParam(); break; case miopenHalf: + case miopenFloat8: + case miopenBFloat8: case miopenInt8: case miopenBFloat16: case miopenInt8x4: diff --git a/test/gtest/solver.hpp b/test/gtest/get_solver.hpp similarity index 59% rename from test/gtest/solver.hpp rename to test/gtest/get_solver.hpp index 60147352cc..9a995c8916 100644 --- a/test/gtest/solver.hpp +++ b/test/gtest/get_solver.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2022 Advanced Micro Devices, Inc. + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,36 +26,31 @@ #pragma once #include -#include "cpu_conv.hpp" +#include "conv_common.hpp" #include "get_handle.hpp" #include "tensor_util.hpp" -#include #include -#include "conv_test_base.hpp" +#include +#include -template -struct ConvFwdSolverTest - : public ::testing::TestWithParam< - std::tuple>, - ConvFwdSolverTestBase +template +auto GetSolutionImpl(miopen::rank<1>, Solver s, const Context& ctx, const Problem& problem) + -> decltype(s.GetSolution(ctx, problem, s.GetDefaultPerformanceConfig(ctx, problem))) { -public: - void SetUp() override - { - test_skipped = false; - std::tie(algo, conv_config, tensor_layout) = GetParam(); - ConvFwdSolverTestBase::SetUpImpl(conv_config, tensor_layout); - } - void TearDown() override - { - if(test_skipped) - return; - ConvFwdSolverTestBase::TearDownConv(); - ConvFwdSolverTestBase::ThresholdChecks(); - } - ConvTestCase conv_config; - miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; - bool test_skipped = false; - miopenTensorLayout_t tensor_layout; -}; + return s.GetSolution(ctx, problem, s.GetDefaultPerformanceConfig(ctx, problem)); +} + +template +auto GetSolutionImpl(miopen::rank<0>, Solver s, const Context& ctx, const Problem& problem) + -> decltype(s.GetSolution(ctx, problem)) +{ + return s.GetSolution(ctx, problem); +} + +template +miopen::solver::ConvSolution GetSolution(Solver s, const Context& ctx, const Problem& problem) +{ + auto solution = GetSolutionImpl(miopen::rank<1>{}, s, ctx, problem); + return solution; +} diff --git a/test/gtest/solver_bwd.hpp b/test/gtest/solver_bwd.hpp new file mode 100644 index 0000000000..728c38fcda --- /dev/null +++ b/test/gtest/solver_bwd.hpp @@ -0,0 +1,185 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
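+// Illustrative sketch (not part of this header): how the rank-based dispatch in
+// get_solver.hpp above selects between tunable and non-tunable solvers. The
+// solver types and GetSolution signatures below are hypothetical stand-ins;
+// miopen::rank is assumed to be the usual inheritance-chain tag shown here.
+//
+//     template <int N> struct rank : rank<N - 1> {};
+//     template <>      struct rank<0> {};
+//
+//     struct Tunable    { int GetSolution(int ctx, int cfg) const { return ctx + cfg; }
+//                         int GetDefaultPerformanceConfig(int ctx) const { return 2 * ctx; } };
+//     struct NonTunable { int GetSolution(int ctx) const { return ctx; } };
+//
+//     // Preferred overload: viable only if s.GetSolution(ctx, cfg) compiles.
+//     template <class S>
+//     auto Get(rank<1>, S s, int ctx)
+//         -> decltype(s.GetSolution(ctx, s.GetDefaultPerformanceConfig(ctx)))
+//     { return s.GetSolution(ctx, s.GetDefaultPerformanceConfig(ctx)); }
+//
+//     // Fallback: chosen when the rank<1> overload SFINAEs out.
+//     template <class S>
+//     auto Get(rank<0>, S s, int ctx) -> decltype(s.GetSolution(ctx))
+//     { return s.GetSolution(ctx); }
+//
+// Get(rank<1>{}, Tunable{}, 3) resolves to the two-argument overload (returns 9);
+// Get(rank<1>{}, NonTunable{}, 3) drops out of the rank<1> overload and falls
+// back to rank<0> via the derived-to-base conversion (returns 3).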
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include "conv_common.hpp"
+#include "get_handle.hpp"
+#include "tensor_util.hpp"
+#include
+#include
+
+#include
+#include
+#include
+
+#include "conv_test_base.hpp"
+#include "conv_tensor_gen.hpp"
+
+#include "get_solver.hpp"
+
+template
+struct ConvBwdSolverTest
+    : public ::testing::TestWithParam>
+{
+
+    template
+    void SolverBwd(Solver solv)
+    {
+        auto&& handle = get_handle();
+
+        const auto tensors = miopen::ConvBwdTensors{
+            output.desc, out_dev.get(), weights.desc, wei_dev.get(), input.desc, in_dev.get()};
+        const auto conv_problem =
+            miopen::conv::ProblemDescription(input.desc,
+                                             weights.desc,
+                                             output.desc,
+                                             conv_desc,
+                                             miopen::conv::Direction::BackwardData);
+        const auto problem = miopen::ProblemDescription{conv_problem};
+        const miopen::ConvolutionContext ctx = [&] {
+            auto tmp = miopen::ConvolutionContext{&handle};
+            problem.conv_problem.SetupFloats(tmp);
+            return tmp;
+        }();
+
+        // const auto network_config = problem.BuildConfKey();
+
+        if(!solv.IsApplicable(ctx, problem))
+        {
+            test_skipped = true;
+            GTEST_SKIP() << solv.SolverDbId() << ": Not Applicable for this problem" << conv_config;
+        }
+
+        if(solv.MayNeedWorkspace())
+        {
+            const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem);
+            workspace_dev          = handle.Create(cur_sol_ws);
+            workspace_size         = cur_sol_ws;
+        }
+
+        const auto invoke_params =
+            miopen::conv::DataInvokeParams{tensors,
+                                           workspace_dev.get(),
+                                           workspace_size,
+                                           conv_desc.attribute.gfx90aFp16alt.GetBwd()};
+
+        auto sol = GetSolution(solv, ctx, problem);
+        ASSERT_TRUE(sol.Succeeded());
+        ASSERT_TRUE(sol.invoker_factory);
+        const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
+        (invoker)(handle, invoke_params);
+        handle.Finish();
+    }
+
+protected:
+    void SetUp() override
+    {
+        test_skipped = false;
+        std::tie(algo, conv_config) = GetParam();
+        input   = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W};
+        weights = tensor{conv_config.k, conv_config.C, conv_config.y, conv_config.x};
+        weights.generate(GenWeights{});
+
+        conv_desc = conv_config.GetConv();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+
+        output = tensor{output_desc.GetLengths()};
+        output.generate(GenData{});
+
+        std::fill(input.begin(), input.end(), std::numeric_limits::quiet_NaN());
+
+        auto&& handle = get_handle();
+        in_dev        = handle.Write(input.data);
+        wei_dev       = handle.Write(weights.data);
+        out_dev       = handle.Write(output.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        // ref_in holds dx, so it must have the shape of the input tensor:
+        // rms_range below compares it against the data read back into `input`.
+        ref_in = tensor{input.desc.GetLengths()};
+        if(use_cpu_ref)
+        {
+            cpu_convolution_backward_data(conv_desc.GetSpatialDimension(),
+                                          ref_in,
+                                          weights,
+                                          output,
+                                          conv_desc.GetConvPads(),
+                                          conv_desc.GetConvStrides(),
+                                          conv_desc.GetConvDilations(),
+                                          conv_desc.GetGroupCount());
+        }
+        else
+        {
+            ref_in = ref_conv_bwd(ref_in, weights, output, conv_desc);
+        }
+        input.data = handle.Read(in_dev, input.data.size());
+#if defined(__clang__) || defined(__GNUG__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        const auto zero_chk = [](T x) { return static_cast(x) == static_cast(0.0); };
+#if defined(__clang__) || defined(__GNUG__)
+#pragma GCC diagnostic pop
+#endif
+
+
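+        // The exact x == 0.0 comparison above is deliberate: zero_chk flags
+        // values that are bit-for-bit zero, so the all_of checks below detect
+        // the "kernel never wrote its output" failure mode rather than numeric
+        // drift. The push/pop pair keeps the -Wfloat-equal suppression scoped
+        // to this single intentional compare.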
EXPECT_FALSE(std::all_of(ref_in.begin(), ref_in.end(), [](float x) { return x == 0.0f; })) + << "Cpu data is all zeros"; + EXPECT_FALSE(std::all_of(input.begin(), input.end(), zero_chk)) << "Gpu data is all zeros"; + EXPECT_TRUE(miopen::range_distance(ref_in) == miopen::range_distance(input)); + + const double tolerance = 80; + double threshold = static_cast(std::numeric_limits::epsilon()) * tolerance; + auto error = miopen::rms_range(ref_in, input); + + EXPECT_FALSE(miopen::find_idx(ref_in, miopen::not_finite) >= 0) + << "Non finite number found in the CPU data"; + + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + } + ConvTestCase conv_config; + miopen::ConvolutionDescriptor conv_desc; + tensor input; + tensor weights; + tensor output; + tensor ref_in; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr wei_dev; + miopen::Allocator::ManageDataPtr out_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + size_t workspace_size; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; + bool test_skipped = false; +}; diff --git a/test/gtest/solver_bwd_f8.cpp b/test/gtest/solver_bwd_f8.cpp new file mode 100644 index 0000000000..7c286cbfc2 --- /dev/null +++ b/test/gtest/solver_bwd_f8.cpp @@ -0,0 +1,68 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "solver_bwd.hpp" + +struct ConvBwdFp8 : ConvBwdSolverTest +{ +}; + +struct ConvBwdFp8Naive : ConvBwdSolverTest +{ +}; + +TEST_P(ConvBwdFp8, DISABLED_GemmBwd1x1_stride2) +{ + miopen::solver::GemmBwd1x1_stride2 solv{}; + SolverBwd(solv); +} + +TEST_P(ConvBwdFp8, DISABLED_GemmBwd1x1_stride1) +{ + miopen::solver::GemmBwd1x1_stride1 solv{}; + SolverBwd(solv); +} + +TEST_P(ConvBwdFp8, DISABLED_GemmBwdRest) +{ + miopen::solver::GemmBwdRest solv{}; + SolverBwd(solv); +} + +TEST_P(ConvBwdFp8Naive, DISABLED_Bwd) +{ + miopen::solver::ConvDirectNaiveConvBwd solv{}; + SolverBwd(solv); +} +INSTANTIATE_TEST_SUITE_P(ConvBwdTest, + ConvBwdFp8, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(GetNetwork1()))); +// Since NaiveConv is verified against the CPU, we are conservative in the number and type +// of test cases we instantiate +INSTANTIATE_TEST_SUITE_P(ConvBwdTest, + ConvBwdFp8Naive, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()))); diff --git a/test/gtest/solver_convasm3x3u.cpp b/test/gtest/solver_convasm3x3u.cpp index 3f174039f2..91133c68ba 100644 --- a/test/gtest/solver_convasm3x3u.cpp +++ b/test/gtest/solver_convasm3x3u.cpp @@ -23,65 +23,16 @@ * SOFTWARE. * *******************************************************************************/ -#include "solver.hpp" +#include "solver_fwd.hpp" struct ConvFwdSolverTestFloat : ConvFwdSolverTest { }; -template -void SolverFwd(const miopen::TensorDescriptor& inputDesc, - ConstData_t input, - const miopen::TensorDescriptor& wDesc, - ConstData_t weight, - const miopen::TensorDescriptor& outputDesc, - Data_t output, - const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, - bool& test_skipped) -{ - auto&& handle = get_handle(); - - Solver solv{}; - - const auto tensors = - miopen::ConvFwdTensors{inputDesc, input, wDesc, weight, outputDesc, output}; - - const auto problem = miopen::conv::ProblemDescription{ - inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; - - ctx.SetStream(&handle); - - if(!solv.IsApplicable(ctx, problem)) - { - test_skipped = true; - GTEST_SKIP() << solv.SolverDbId() << "ConvAsm3x3U Not Applicable for this problem" - << conv_config; - } - const auto invoke_params = miopen::conv::DataInvokeParams{ - tensors, nullptr, 0, convDesc.attribute.gfx90aFp16alt.GetFwd()}; - - ASSERT_TRUE(solv.IsApplicable(ctx, problem)); - auto sol = solv.GetSolution(ctx, problem, solv.GetDefaultPerformanceConfig(ctx, problem)); - ASSERT_TRUE(sol.Succeeded()); - ASSERT_TRUE(sol.invoker_factory); - const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); - (invoker)(handle, invoke_params); - handle.Finish(); -} - TEST_P(ConvFwdSolverTestFloat, ConvASM3x3UFwd) { - SolverFwd(input.desc, - in_dev.get(), - weights.desc, - wei_dev.get(), - output.desc, - out_dev.get(), - conv_desc, - conv_config, - test_skipped); + miopen::solver::ConvAsm3x3U solv{}; + SolverFwd(solv); } INSTANTIATE_TEST_SUITE_P(ConvFwdTest, diff --git a/test/gtest/solver_f8.hpp b/test/gtest/solver_f8.hpp new file mode 100644 index 0000000000..34a10a4cfe --- /dev/null +++ b/test/gtest/solver_f8.hpp @@ -0,0 +1,263 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include "cpu_conv.hpp"
+#include "get_handle.hpp"
+#include "tensor_util.hpp"
+#include
+#include
+#include "conv_common.hpp"
+#include
+#include "verify.hpp"
+using float8  = miopen_f8::hip_f8;
+using bfloat8 = miopen_f8::hip_f8;
+
+template
+miopenDataType_t GetDataType();
+
+template <>
+miopenDataType_t GetDataType()
+{
+    return miopenFloat8;
+}
+
+template <>
+miopenDataType_t GetDataType()
+{
+    return miopenBFloat8;
+}
+
+template <>
+miopenDataType_t GetDataType()
+{
+    return miopenFloat;
+}
+
+struct ConvTestCase
+{
+    size_t N;
+    size_t C;
+    size_t H;
+    size_t W;
+    size_t k;
+    size_t y;
+    size_t x;
+    size_t pad_x;
+    size_t pad_y;
+    size_t stride_x;
+    size_t stride_y;
+    size_t dilation_x;
+    size_t dilation_y;
+    miopenConvolutionMode_t conv_mode;
+    friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc)
+    {
+        return os << "N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W
+                  << " k: " << tc.k << " y:" << tc.y << " x:" << tc.x << " pad_y:" << tc.pad_y
+                  << " pad_x:" << tc.pad_x << " stride_y:" << tc.stride_y
+                  << " stride_x:" << tc.stride_x << " dilation_y:" << tc.dilation_y
+                  << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode;
+    }
+
+    miopen::ConvolutionDescriptor GetConv()
+    {
+        return miopen::ConvolutionDescriptor{
+            {static_cast(pad_y), static_cast(pad_x)},
+            {static_cast(stride_y), static_cast(stride_x)},
+            {static_cast(dilation_y), static_cast(dilation_x)}};
+    }
+};
+
+std::vector ConvTestConfigs()
+{ // n  c   h   w   k   y  x pad_x pad_y stri_x stri_y dia_x dia_y
+    return {// New tests begin
+            {1, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {2, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {4, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {8, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {16, 32, 4, 4, 16, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {16, 128, 16, 16, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 28, 28, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 64, 64, 64, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 128, 64, 64, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 128, 128, 64, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 128, 128, 128, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 256, 128, 128, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution},
+            {64, 256, 256, 128, 128, 1, 1, 0, 0,
1, 1, 1, 1, miopenConvolution}, + {64, 256, 256, 256, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 256, 256, 256, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 256, 256, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 512, 256, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 512, 512, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 1024, 512, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 256, 1024, 1024, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 512, 1024, 1024, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 512, 1024, 1024, 512, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 1024, 1024, 1024, 512, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {128, 1024, 1024, 1024, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {256, 1024, 1024, 1024, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {1024, 1024, 1024, 1024, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {1024, 2048, 2048, 2048, 2048, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + // New tests end + {16, 128, 16, 16, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 128, 28, 28, 128, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 256, 14, 14, 256, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 512, 7, 7, 512, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}, + {64, 1024, 14, 14, 1024, 1, 1, 0, 0, 1, 1, 1, 1, miopenConvolution}}; +} + +template +struct Fp8Cast +{ + uint64_t seed = 1234; + bool is_stoch = true; + V operator()(U x) + { + if(is_stoch) + { + auto tmp = + float8(static_cast(x), miopen_f8::hip_f8_rounding_mode::stochastic, seed); + return static_cast(tmp); + } + else + { + auto tmp = float8(static_cast(x)); + return static_cast(tmp); + } + } +}; + +template +struct ConvFwdSolverTest + : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(algo, conv_config) = GetParam(); + input = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W}; + weights = tensor{conv_config.k, conv_config.C, conv_config.x, conv_config.y}; + + auto gen_fp8_value = [=](auto...) { + const auto tmp = float8(scalar_gen_random_float{-0.5, 0.5}()); + return tmp; + }; + + input.generate(gen_fp8_value); + weights.generate(gen_fp8_value); + + conv_desc = conv_config.GetConv(); + + miopen::TensorDescriptor output_desc = conv_desc.GetForwardOutputTensor( + input.desc, weights.desc, GetDataType()); // Tgpu Datatype? + + output = tensor{output_desc.GetLengths()}; // half_float::half instead? + + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + auto&& handle = get_handle(); + in_dev = handle.Write(input.data); + wei_dev = handle.Write(weights.data); + out_dev = handle.Write(output.data); + } + void TearDown() override + { + if(test_skipped) + return; + + auto&& handle = get_handle(); + + miopen::TensorDescriptor output_desc = conv_desc.GetForwardOutputTensor( + input.desc, weights.desc, GetDataType()); // miopenFloat or GetDataType() ? 
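+        // The CPU reference computed below mirrors the fp8 quantization of the
+        // data: Fp8Cast pushes every input and weight element through hip_f8
+        // before the multiply-accumulate, so the reference works on fp8-rounded
+        // operands comparable to what the GPU kernel consumes. is_stoch selects
+        // stochastic rounding over round-to-nearest; the fixed seed keeps the
+        // reference deterministic from run to run.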
+ ref_out = tensor{output_desc.GetLengths()}; + + using FI = Fp8Cast; + using FW = Fp8Cast; + FI in_func = {0, true}; + FW weight_func = {0, true}; + + cpu_convolution_forward( + conv_desc.GetSpatialDimension(), + input, + weights, + ref_out, + conv_desc.GetConvPads(), + conv_desc.GetConvStrides(), + conv_desc.GetConvDilations(), + conv_desc.GetGroupCount(), + in_func, + weight_func); + + output.data = handle.Read(out_dev, output.data.size()); + EXPECT_FALSE(miopen::f8_range_zero(ref_out)) << "Cpu data is all zeros"; + EXPECT_FALSE(miopen::f8_range_zero(output)) << "Gpu data is all zeros"; + EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); + + const float tolerance = 80.0; + auto threshold = (static_cast(std::numeric_limits::epsilon()) * + static_cast(tolerance)); + + auto error = miopen::rms_range(ref_out, output); + + bool refOutNan = false; + for(auto refOutElem : ref_out.data) + { + if(refOutElem.is_nan()) + { + refOutNan = true; + break; + } + } + + bool outputNan = false; + for(auto outputElem : output.data) + { + if(outputElem.is_nan()) + { + outputNan = true; + break; + } + } + + EXPECT_FALSE(refOutNan) << "NAN found in CPU data"; + EXPECT_FALSE(outputNan) << "NAN found in GPU data"; + + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + } + ConvTestCase conv_config; + miopen::ConvolutionDescriptor conv_desc; + tensor input; + tensor weights; + tensor output; // Or T? + tensor ref_out; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr wei_dev; + miopen::Allocator::ManageDataPtr out_dev; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoGEMM; + bool test_skipped = false; +}; diff --git a/test/gtest/solver_fwd.hpp b/test/gtest/solver_fwd.hpp new file mode 100644 index 0000000000..ac30ad15ca --- /dev/null +++ b/test/gtest/solver_fwd.hpp @@ -0,0 +1,125 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include "conv_common.hpp" +#include "get_handle.hpp" +#include "tensor_util.hpp" +#include +#include + +#include +#include +#include + +#include "conv_test_base.hpp" +#include "get_solver.hpp" + +template +struct ConvFwdSolverTest + : public ::testing::TestWithParam< + std::tuple>, + ConvFwdSolverTestBase +{ + template + void SolverFwd(Solver solv) + { + auto&& handle = get_handle(); + + const auto tensors = miopen::ConvFwdTensors{this->input.desc, + this->in_dev.get(), + this->weights.desc, + this->wei_dev.get(), + this->output.desc, + this->out_dev.get()}; + const auto problem = miopen::ProblemDescription( + miopen::conv::ProblemDescription{this->input.desc, + this->weights.desc, + this->output.desc, + this->conv_desc, + miopen::conv::Direction::Forward}); + const miopen::ConvolutionContext ctx = [&] { + auto tmp = miopen::ConvolutionContext{&handle}; + problem.conv_problem.SetupFloats(tmp); + return tmp; + }(); + + // const auto network_config = problem.BuildConfKey(); + + if(!solv.IsApplicable(ctx, problem)) + { + test_skipped = true; + GTEST_SKIP() << solv.SolverDbId() << ": Not Applicable for this problem" << conv_config; + } + if(solv.MayNeedWorkspace()) + { + const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem); + workspace_dev = handle.Create(cur_sol_ws); + workspace_size = cur_sol_ws; + } + + const auto invoke_params = + miopen::conv::DataInvokeParams{tensors, + workspace_dev.get(), + workspace_size, + this->conv_desc.attribute.gfx90aFp16alt.GetFwd()}; + + // auto sol = solv.GetSolution(ctx, problem); + // This is complicated due to the split between tunable and non-tunable solvers + // since the signature for solver.GetSolution needs a consutructed tuning params + // in the tunable case and not otherwise + const auto sol = GetSolution(solv, ctx, problem); + ASSERT_TRUE(sol.Succeeded()); + ASSERT_TRUE(sol.invoker_factory); + const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); + (invoker)(handle, invoke_params); + handle.Finish(); + } + +protected: + void SetUp() override + { + test_skipped = false; + std::tie(algo, conv_config, tensor_layout) = GetParam(); + this->SetUpImpl(conv_config, tensor_layout); + } + + void TearDown() override + { + if(test_skipped) + return; + this->TearDownConv(); + this->ThresholdChecks(); + } + + ConvTestCase conv_config; + miopen::Allocator::ManageDataPtr workspace_dev; + size_t workspace_size; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; + bool test_skipped = false; + miopenTensorLayout_t tensor_layout; +}; diff --git a/test/gtest/solver_fwd_f8.cpp b/test/gtest/solver_fwd_f8.cpp new file mode 100644 index 0000000000..36f0ec67cd --- /dev/null +++ b/test/gtest/solver_fwd_f8.cpp @@ -0,0 +1,70 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "solver_fwd.hpp" + +struct ConvFwdFp8 : ConvFwdSolverTest +{ +}; + +struct ConvFwdFp8Naive : ConvFwdSolverTest +{ +}; + +TEST_P(ConvFwdFp8, DISABLED_GemmFwdRest) +{ + miopen::solver::GemmFwdRest solv{}; + SolverFwd(solv); +} + +TEST_P(ConvFwdFp8, DISABLED_GemmFwd1x1_0_2) +{ + miopen::solver::GemmFwd1x1_0_2 solv{}; + SolverFwd(solv); +} + +TEST_P(ConvFwdFp8, DISABLED_Gemm1x1x0x1) +{ + miopen::solver::GemmFwd1x1_0_1 solv{}; + SolverFwd(solv); +} + +TEST_P(ConvFwdFp8Naive, DISABLED_Fwd) +{ + miopen::solver::ConvDirectNaiveConvFwd solv{}; + SolverFwd(solv); +} +INSTANTIATE_TEST_SUITE_P(ConvFwdTest, + ConvFwdFp8, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()), + testing::Values(miopenTensorNCHW))); +// Since NaiveConv is verified against the CPU, we are conservative in the number and type +// of test cases we instantiate +INSTANTIATE_TEST_SUITE_P(ConvFwdTest, + ConvFwdFp8Naive, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()), + testing::Values(miopenTensorNCHW))); diff --git a/test/gtest/solver_wrw.hpp b/test/gtest/solver_wrw.hpp new file mode 100644 index 0000000000..51f078fd21 --- /dev/null +++ b/test/gtest/solver_wrw.hpp @@ -0,0 +1,186 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include
+#include "conv_common.hpp"
+#include "get_handle.hpp"
+#include "tensor_util.hpp"
+#include
+#include
+
+#include
+#include
+#include
+
+#include "conv_test_base.hpp"
+#include "conv_tensor_gen.hpp"
+
+#include "get_solver.hpp"
+
+template
+struct ConvWrwSolverTest
+    : public ::testing::TestWithParam>
+{
+
+    template
+    void SolverWrw(Solver solv)
+    {
+        auto&& handle = get_handle();
+
+        const auto tensors = miopen::ConvWrwTensors{
+            output.desc, out_dev.get(), input.desc, in_dev.get(), weights.desc, wei_dev.get()};
+        const auto problem = miopen::ProblemDescription(
+            miopen::conv::ProblemDescription{output.desc,
+                                             weights.desc,
+                                             input.desc,
+                                             conv_desc,
+                                             miopen::conv::Direction::BackwardWeights});
+        const miopen::ConvolutionContext ctx = [&] {
+            auto tmp = miopen::ConvolutionContext{&handle};
+            problem.conv_problem.SetupFloats(tmp);
+            return tmp;
+        }();
+
+        // const auto network_config = problem.BuildConfKey();
+
+        if(!solv.IsApplicable(ctx, problem))
+        {
+            test_skipped = true;
+            GTEST_SKIP() << solv.SolverDbId() << ": Not Applicable for this problem" << conv_config;
+        }
+
+        if(solv.MayNeedWorkspace())
+        {
+            const auto cur_sol_ws = solv.GetWorkspaceSize(ctx, problem);
+            workspace_dev          = handle.Create(cur_sol_ws);
+            workspace_size         = cur_sol_ws;
+        }
+
+        const auto invoke_params =
+            miopen::conv::WrWInvokeParams{tensors,
+                                          workspace_dev.get(),
+                                          workspace_size,
+                                          conv_desc.attribute.gfx90aFp16alt.GetWrW()};
+
+        auto sol = GetSolution(solv, ctx, problem);
+        ASSERT_TRUE(sol.Succeeded());
+        ASSERT_TRUE(sol.invoker_factory);
+        const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
+        (invoker)(handle, invoke_params);
+        handle.Finish();
+    }
+
+protected:
+    void SetUp() override
+    {
+        test_skipped = false;
+        std::tie(algo, conv_config) = GetParam();
+        input   = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W};
+        weights = tensor{conv_config.k, conv_config.C, conv_config.y, conv_config.x};
+        input.generate(GenData{});
+
+        conv_desc = conv_config.GetConv();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+
+        output = tensor{output_desc.GetLengths()};
+        output.generate(GenData{});
+
+        std::fill(weights.begin(), weights.end(), std::numeric_limits::quiet_NaN());
+
+        auto&& handle = get_handle();
+        in_dev        = handle.Write(input.data);
+        wei_dev       = handle.Write(weights.data);
+        out_dev       = handle.Write(output.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        // ref_weights holds dw, so it must have the shape of the weights tensor:
+        // rms_range below compares it against the data read back into `weights`.
+        ref_weights = tensor{weights.desc.GetLengths()};
+        if(use_cpu_ref)
+        {
+            cpu_convolution_backward_weight(conv_desc.GetSpatialDimension(),
+                                            input,
+                                            ref_weights,
+                                            output,
+                                            conv_desc.GetConvPads(),
+                                            conv_desc.GetConvStrides(),
+                                            conv_desc.GetConvDilations(),
+                                            conv_desc.GetGroupCount());
+        }
+        else
+        {
+            ref_weights = ref_conv_wrw(input, ref_weights, output, conv_desc);
+        }
+        weights.data = handle.Read(wei_dev, weights.data.size());
+#if defined(__clang__) || defined(__GNUG__)
+#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + const auto zero_chk = [](T x) { return static_cast(x) == static_cast(0.0); }; +#if defined(__clang__) || defined(__GNUG__) +#pragma GCC diagnostic pop +#endif + + EXPECT_FALSE(std::all_of(ref_weights.begin(), ref_weights.end(), [](float x) { + return x == 0.0f; + })) << "Cpu data is all zeros"; + EXPECT_FALSE(std::all_of(weights.begin(), weights.end(), zero_chk)) + << "Gpu data is all zeros"; + EXPECT_TRUE(miopen::range_distance(ref_weights) == miopen::range_distance(weights)); + + const double tolerance = 80; + double threshold = static_cast(std::numeric_limits::epsilon()) * tolerance; + auto error = miopen::rms_range(ref_weights, weights); + + EXPECT_FALSE(miopen::find_idx(ref_weights, miopen::not_finite) >= 0) + << "Non finite number found in the CPU data"; + + EXPECT_TRUE(error < threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + } + ConvTestCase conv_config; + miopen::ConvolutionDescriptor conv_desc; + tensor input; + tensor weights; + tensor output; + tensor ref_weights; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr wei_dev; + miopen::Allocator::ManageDataPtr out_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + size_t workspace_size; + miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoDirect; + bool test_skipped = false; +}; diff --git a/test/gtest/solver_wrw_f8.cpp b/test/gtest/solver_wrw_f8.cpp new file mode 100644 index 0000000000..76c608b622 --- /dev/null +++ b/test/gtest/solver_wrw_f8.cpp @@ -0,0 +1,41 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "solver_wrw.hpp" +struct ConvWrwFp8Naive : ConvWrwSolverTest +{ +}; + +TEST_P(ConvWrwFp8Naive, DISABLED_Wrw) +{ + miopen::solver::ConvDirectNaiveConvWrw solv{}; + SolverWrw(solv); +} +// Since NaiveConv is verified against the CPU, we are conservative in the number and type +// of test cases we instantiate +INSTANTIATE_TEST_SUITE_P(ConvWrwTest, + ConvWrwFp8Naive, + testing::Combine(testing::Values(miopenConvolutionAlgoGEMM), + testing::ValuesIn(ConvTestConfigs()))); diff --git a/test/gtest/tensor_api.cpp b/test/gtest/tensor_api.cpp index dddd10d5d4..eb38a78fb1 100644 --- a/test/gtest/tensor_api.cpp +++ b/test/gtest/tensor_api.cpp @@ -285,7 +285,7 @@ void RunWrongTestConfigs(const TestConfig& valid_config, { #if USE_OUT_OF_RANGE_ENUM const auto wrong_datatypes = {static_cast(miopenHalf - 1), - static_cast(miopenDouble + 1)}; + static_cast(miopenBFloat8 + 1)}; const auto wrong_layouts = {static_cast(miopenTensorNCHW - 1), static_cast(miopenTensorNDHWC + 1)}; #endif diff --git a/test/perf_models/resnet50_v1.5.sh b/test/perf_models/resnet50_v1.5.sh old mode 100755 new mode 100644 index 7ea94de7db..9d3055ab0d --- a/test/perf_models/resnet50_v1.5.sh +++ b/test/perf_models/resnet50_v1.5.sh @@ -89,4 +89,4 @@ echo resnet50_v1.5.sh CMD:${cmd} BS:${bs} CAST:${incast},${weicast} ./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 1 --pad_w 1 --pad_d 0 --conv_stride_h 1 --conv_stride_w 1 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 28 --in_w 28 --fil_d 1 --fil_h 3 --fil_w 3 --in_channels 128 --out_channels 128 --forw 4 ${wrwcast} -V ${verif} -t 1 ./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 1 --conv_stride_w 1 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 28 --in_w 28 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 512 --out_channels 128 --forw 2 ${bwdcast} -V ${verif} -t 1 ./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 1 --conv_stride_w 1 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 28 --in_w 28 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 512 --out_channels 128 --forw 4 ${wrwcast} -V ${verif} -t 1 -./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 2 --conv_stride_w 2 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 56 --in_w 56 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 256 --out_channels 512 --forw 2 ${bwdcast} -V ${verif} -t 1 \ No newline at end of file +./bin/MIOpenDriver ${cmd} --batchsize ${bs} --spatial_dim 2 --pad_h 0 --pad_w 0 --pad_d 0 --conv_stride_h 2 --conv_stride_w 2 --conv_stride_d 1 --dilation_h 1 --dilation_w 1 --dilation_d 1 --group_count 1 --mode conv --pad_mode default --trans_output_pad_h 0 --trans_output_pad_w 0 --trans_output_pad_d 0 --in_d 1 --in_h 56 --in_w 56 --fil_d 1 --fil_h 1 --fil_w 1 --in_channels 256 
--out_channels 512 --forw 2 ${bwdcast} -V ${verif} -t 1 diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp index 70a844f17b..0b05a1e5e8 100644 --- a/test/tensor_holder.hpp +++ b/test/tensor_holder.hpp @@ -42,6 +42,12 @@ #else #include #endif +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +using float8 = miopen_f8::hip_f8; +using bfloat8 = miopen_f8::hip_f8; + #include #include @@ -110,6 +116,16 @@ struct miopen_type : std::integral_constant { }; +template <> +struct miopen_type : std::integral_constant +{ +}; + +template <> +struct miopen_type : std::integral_constant +{ +}; + template struct tensor { @@ -403,6 +419,14 @@ tensor make_tensor(std::initializer_list dims, G g) return tensor{miopen::TensorDescriptor{miopen_type{}, dims}}.generate(g); } +// This is needed since there is no TensorDescriptor(miopenDataType_t t, const size_t* plens, int +// size) constructor +template +tensor make_tensor(const std::vector& dims) +{ + return tensor{miopen::TensorDescriptor{miopen_type{}, dims}}; +}; + template tensor make_tensor(const std::vector& dims) { diff --git a/test/verify.hpp b/test/verify.hpp index 8bf8b80684..984237a48c 100644 --- a/test/verify.hpp +++ b/test/verify.hpp @@ -33,6 +33,11 @@ #include #include #include +#include +using half = half_float::half; +using hip_bfloat16 = bfloat16; +#include +#include "tensor_holder.hpp" namespace miopen { @@ -105,7 +110,7 @@ struct square_diff_fn template double operator()(T x, U y) const { - return (x - y) * (x - y); + return static_cast((x - y) * (x - y)); } }; static constexpr square_diff_fn square_diff{}; @@ -119,6 +124,27 @@ bool range_empty(R1&& r1) template auto range_distance(R1&& r1) MIOPEN_RETURNS(std::distance(r1.begin(), r1.end())); +template +bool f8_range_zero(R& r); + +template <> +inline bool f8_range_zero>(tensor& r1) +{ + return std::all_of(r1.data.begin(), r1.data.end(), [&](float8 x) { return x.is_zero(); }); +} + +template <> +inline bool f8_range_zero>(tensor& r1) +{ + return std::all_of(r1.data.begin(), r1.data.end(), [&](bfloat8 x) { return x.is_zero(); }); +} + +template <> +inline bool f8_range_zero>(tensor& r1) +{ + return std::all_of(r1.data.begin(), r1.data.end(), [](float x) { return x == 0.0; }); +} + template bool range_zero(R1&& r1) { @@ -172,14 +198,14 @@ double rms_range(R1&& r1, R2&& r2) if(n == range_distance(r2) && n != 0) { double square_difference = range_product(r1, r2, 0.0, sum_fn{}, square_diff); - double mag1 = *std::max_element(r1.begin(), r1.end(), compare_mag); - double mag2 = *std::max_element(r2.begin(), r2.end(), compare_mag); + double mag1 = static_cast(*std::max_element(r1.begin(), r1.end(), compare_mag)); + double mag2 = static_cast(*std::max_element(r2.begin(), r2.end(), compare_mag)); double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits::min()}); return std::sqrt(square_difference) / (std::sqrt(n) * mag); } else - return std::numeric_limits>::max(); + return double(std::numeric_limits>::max()); } } // namespace miopen #endif From 4aa64c1f07bbcfb1249902aeeffc83719114ed6b Mon Sep 17 00:00:00 2001 From: JD Date: Tue, 19 Sep 2023 12:57:59 -0500 Subject: [PATCH 02/36] [CI][Jenkins] remove reboot, Vega, and Navi21 stages (#2395) * remove reboot from MIOpen CI, remove Vega and Navi21 stages * Update Docker ROCm to official 5.7 --------- Co-authored-by: Jun Liu --- Dockerfile | 6 +++--- Jenkinsfile | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index ede050ddc2..31a0334eeb 100755 --- 
a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN wget https://repo.radeon.com/amdgpu-install/.5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate +RUN wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ ./amdgpu-install_5.7.50700-1_all.deb @@ -26,8 +26,8 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ # Add rocm repository RUN export ROCM_APT_VER=5.7;\ echo $ROCM_APT_VER &&\ -sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/.$ROCM_APT_VER/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list' &&\ -sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/.apt_$ROCM_APT_VER focal main > /etc/apt/sources.list.d/rocm.list' +sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCM_APT_VER/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list' &&\ +sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/$ROCM_APT_VER focal main > /etc/apt/sources.list.d/rocm.list' RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" RUN amdgpu-install -y --usecase=rocm --no-dkms diff --git a/Jenkinsfile b/Jenkinsfile index e7f4ed8400..9d10064b59 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -316,7 +316,7 @@ def buildHipClangJobAndReboot(Map conf=[:]){ throw e } finally{ - if (conf.get("needs_gpu", true)) { + if (conf.get("needs_reboot", false)) { reboot() } } @@ -453,11 +453,11 @@ pipeline { description: "") booleanParam( name: "TARGET_VEGA10", - defaultValue: true, + defaultValue: false, description: "") booleanParam( name: "TARGET_VEGA20", - defaultValue: true, + defaultValue: false, description: "") booleanParam( name: "TARGET_GFX908", @@ -469,7 +469,7 @@ pipeline { description: "") booleanParam( name: "TARGET_NAVI21", - defaultValue: true, + defaultValue: false, description: "") booleanParam( name: "DATATYPE_NA", From 7315546fc5a12b37184e278e70009dee45ad9386 Mon Sep 17 00:00:00 2001 From: Artem Tamazov Date: Tue, 19 Sep 2023 21:00:58 +0300 Subject: [PATCH 03/36] [pooling][backward][2D] Support wide pooling window. Fix FP16 correctness issues of Average pooling. 
(#2372) --- driver/pool_driver.hpp | 62 ++-- src/kernels/MIOpenPoolingBwd.cl | 13 +- src/kernels/MIOpenPoolingBwdND.cl | 14 +- src/kernels/float_types.h | 102 ++++-- src/ocl/pooling_ocl.cpp | 36 +- src/solver/batchnorm/forward_inference_ck.cpp | 2 +- src/solver/pooling/backwardNd.cpp | 292 +++++++++------- src/solver/pooling/forwardNaive.cpp | 27 +- src/solver/pooling/forwardNd.cpp | 23 +- test/CMakeLists.txt | 40 +-- test/pooling2d.cpp | 48 +-- test/pooling_common.hpp | 318 +++++++++++------- 12 files changed, 582 insertions(+), 395 deletions(-) diff --git a/driver/pool_driver.hpp b/driver/pool_driver.hpp index c82f6442dd..bb3fa161e1 100644 --- a/driver/pool_driver.hpp +++ b/driver/pool_driver.hpp @@ -392,7 +392,7 @@ int PoolDriver_impl::AllocateBuffersAndCopy() maskhost = std::vector(out_sz, static_cast(0)); outhost = std::vector(out_sz, static_cast(0)); - din = std::vector(in_sz, static_cast(0)); + din = std::vector(in_sz, static_cast(1.0)); dout = std::vector(out_sz, static_cast(0)); dinhost = std::vector(in_sz, static_cast(0)); @@ -456,25 +456,27 @@ int PoolDriver_impl::RunForwardGPU() Timer t; START_TIME + int rc = 0; for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenPoolingForward(GetHandle(), - poolDesc, - &alpha, - inputTensor, - in_dev->GetMem(), - &beta, - outputTensor, - out_dev->GetMem(), - do_backward, - mask_dev->GetMem(), - 0); + rc |= miopenPoolingForward(GetHandle(), + poolDesc, + &alpha, + inputTensor, + in_dev->GetMem(), + &beta, + outputTensor, + out_dev->GetMem(), + do_backward, + mask_dev->GetMem(), + 0); } if(inflags.GetValueInt("time") == 1) { float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); + if(rc == 0) + miopenGetKernelTime(GetHandle(), &time); STOP_TIME if(WALL_CLOCK) @@ -494,7 +496,7 @@ int PoolDriver_impl::RunForwardGPU() dumpBufferToFile((dump_root + "/dump_mask.bin").c_str(), mask.data(), out_sz); } - return miopenStatusSuccess; + return rc; } template @@ -524,27 +526,29 @@ int PoolDriver_impl::RunBackwardGPU() Timer t; START_TIME + int rc = 0; for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenPoolingBackward(GetHandle(), - poolDesc, - &alpha, - outputTensor, - out_dev->GetMem(), - dOutputTensor, - dout_dev->GetMem(), - inputTensor, - in_dev->GetMem(), - &beta, - dInputTensor, - din_dev->GetMem(), - mask_dev->GetMem()); + rc |= miopenPoolingBackward(GetHandle(), + poolDesc, + &alpha, + outputTensor, + out_dev->GetMem(), + dOutputTensor, + dout_dev->GetMem(), + inputTensor, + in_dev->GetMem(), + &beta, + dInputTensor, + din_dev->GetMem(), + mask_dev->GetMem()); } if(inflags.GetValueInt("time") == 1) { float time = 0.0; - miopenGetKernelTime(GetHandle(), &time); + if(rc == 0) + miopenGetKernelTime(GetHandle(), &time); STOP_TIME if(WALL_CLOCK) @@ -561,7 +565,7 @@ int PoolDriver_impl::RunBackwardGPU() dumpBufferToFile((dump_root + "/dump_din.bin").c_str(), din.data(), in_sz); } - return miopenStatusSuccess; + return rc; } template diff --git a/src/kernels/MIOpenPoolingBwd.cl b/src/kernels/MIOpenPoolingBwd.cl index 9e3afd8d82..6c88bebadf 100644 --- a/src/kernels/MIOpenPoolingBwd.cl +++ b/src/kernels/MIOpenPoolingBwd.cl @@ -95,12 +95,12 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff, : (y + mlo_pad1 - MLO_POOLING_KERNEL_SZ1) / MLO_POOLING_STRIDE1 + 1; int top_off = b * mlo_topdf_batch_str + o * mlo_topdf_channel_str; - _FLOAT res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; + _FLOAT_ACCUM res[MLO_POOLBWD_N_VERT_OUT_PIX][MLO_POOLBWD_N_HORIZ_OUT_PIX]; for(int k = 0; k < MLO_POOLBWD_N_VERT_OUT_PIX; k++) { 
for(int l = 0; l < MLO_POOLBWD_N_HORIZ_OUT_PIX; l++) { - res[k][l] = 0; + res[k][l] = (_FLOAT_ACCUM)0; } } @@ -183,9 +183,10 @@ mloPoolingAveBwd(const __global _FLOAT* top_diff, pool_size = (pool_size == 0) ? 1 : pool_size; int lcl_top_h = top_h - top_y; int lcl_top_w = top_w - top_x; - _FLOAT add_val = - (lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w] / - (_FLOAT)pool_size); + _FLOAT_ACCUM add_val = + CVT_FLOAT2ACCUM( + lcl_top_diff[lcl_top_h * MLO_POOLBWD_LCL_DATA_WIDTH + lcl_top_w]) / + CVT_INTEGRAL2ACCUM(pool_size); res[k][l] += add_val; #if 0 if (bot_x+l==6&&bot_y+k==0&&o==3&&b==0) @@ -206,7 +207,7 @@ { if(bot_y + k < mlo_bot_height && bot_x + l < mlo_bot_width) { - bot_diff[bot_off + k * mlo_botdf_str + l] = res[k][l]; + bot_diff[bot_off + k * mlo_botdf_str + l] = CVT_ACCUM2FLOAT(res[k][l]); #if 0 if (lcl_id0==0&&lcl_id1==0&&o==0&&b==0) { diff --git a/src/kernels/MIOpenPoolingBwdND.cl b/src/kernels/MIOpenPoolingBwdND.cl index bfa45a61b5..7daacd24ab 100644 --- a/src/kernels/MIOpenPoolingBwdND.cl +++ b/src/kernels/MIOpenPoolingBwdND.cl @@ -27,6 +27,8 @@ #include "float_types.h" #include "pooling_functions.h" +#if MLO_POOLING_OP_ID == MLO_POOLING_OP_MAX + #ifndef MLO_POOLING_INDEX_MAX #error "MLO_POOLING_INDEX_MAX not defined" #endif @@ -145,6 +147,8 @@ mloPoolingNDMaxBwd(const __global _FLOAT* top_df, } } +#elif MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE || MLO_POOLING_OP_ID == MLO_POOLING_OP_AVE_INCLUSIVE + __attribute__((reqd_work_group_size(MLO_POOLING_GROUP_SZ0, 1, 1))) __kernel void mloPoolingNDAveBwd(const __global _FLOAT* top_df, __global _FLOAT* bot_df, @@ -202,7 +206,7 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, top_h_end = min(top_h_end, (int)top_h); top_w_end = min(top_w_end, (int)top_w); - _FLOAT bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; + _FLOAT_ACCUM bot_data[PIX_D_PER_WORK][PIX_H_PER_WORK][PIX_W_PER_WORK] = {0}; for(int h = top_d_start; h < top_d_end; ++h) { @@ -232,8 +236,9 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, uint top_gbl_off = b_id * top_str_b + c_id * top_str_c + h * top_str_d + j * top_str_h + i; - _FLOAT add_val = b_id < batch ? top_df[top_gbl_off] : 0; - add_val /= (_FLOAT)pool_size; + _FLOAT_ACCUM add_val = + b_id < batch ? CVT_FLOAT2ACCUM(top_df[top_gbl_off]) : CVT_FP32_2ACCUM(0.0f); + add_val /= CVT_INTEGRAL2ACCUM(pool_size); for(int m = dstart; m < dend; ++m) { @@ -269,10 +274,11 @@ mloPoolingNDAveBwd(const __global _FLOAT* top_df, { uint bot_idx = bot_off + m * bot_str_d + k * bot_str_h + l; - bot_df[bot_idx] = bot_data[m][k][l]; + bot_df[bot_idx] = CVT_ACCUM2FLOAT(bot_data[m][k][l]); } } } } } } +#endif diff --git a/src/kernels/float_types.h b/src/kernels/float_types.h index 897e95ca67..5406ba85ec 100644 --- a/src/kernels/float_types.h +++ b/src/kernels/float_types.h @@ -85,14 +85,46 @@ #define _FLOAT8 PPCAT(_FLOAT, EIGHT) #endif -#if MIOPEN_USE_FP16 == 1
+/// If MIOPEN_USE_DOUBLE_ACCUM is defined as 1 when "float_types.h" is included,
+/// then all the ACCUM macros (which represent operations and types) will use FP64
+/// instead of FP32. In other words, the computations will be
+/// performed in double precision wherever the ACCUM macros are used.
+/// This functionality is intended mostly for debugging.
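+///
+/// Usage sketch (illustrative only; `in`, `out`, and `n` are hypothetical): a
+/// reduction written against the ACCUM macros switches to FP64 accumulation
+/// when built with -DMIOPEN_USE_DOUBLE_ACCUM=1, with no other source changes:
+///
+///     _FLOAT_ACCUM sum = CVT_FP32_2ACCUM(0.0f);
+///     for(uint i = 0; i < n; ++i)
+///         sum += CVT_FLOAT2ACCUM(in[i]);
+///     out[0] = CVT_ACCUM2FLOAT(sum);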
+#ifdef MIOPEN_USE_DOUBLE_ACCUM +#if !(MIOPEN_USE_DOUBLE_ACCUM == 0 || MIOPEN_USE_DOUBLE_ACCUM == 1) +#error "Invalid value of MIOPEN_USE_DOUBLE_ACCUM" +#endif +#else +#define MIOPEN_USE_DOUBLE_ACCUM 0 +#endif + +#if MIOPEN_USE_DOUBLE_ACCUM +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT_ACCUM double +#else +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define _FLOAT_ACCUM double +#endif // __HIP_PLATFORM_HCC__ +#define MAX_VAL_ACCUM DBL_MAX +#else // MIOPEN_USE_DOUBLE_ACCUM #ifdef __HIP_PLATFORM_HCC__ -#define FLOAT _Float16 #define FLOAT_ACCUM float #else +#define _FLOAT_ACCUM float +#endif // __HIP_PLATFORM_HCC__ +#ifndef FLT_MAX +#define MAX_VAL_ACCUM 3.402823466e+38F +#else +#define MAX_VAL_ACCUM FLT_MAX +#endif +#endif // MIOPEN_USE_DOUBLE_ACCUM + +#if MIOPEN_USE_FP16 == 1 +#ifdef __HIP_PLATFORM_HCC__ +#define FLOAT _Float16 +#else // __HIP_PLATFORM_HCC__ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define _FLOAT half -#define _FLOAT_ACCUM float #endif // __HIP_PLATFORM_HCC__ #define SIZEOF_FLOAT 2 // Max value for the main datatype @@ -101,21 +133,13 @@ #else #define MAX_VAL HALF_MAX #endif -// Max value for accumulator -#ifndef FLT_MAX -#define MAX_VAL_ACCUM 3.402823466e+38F -#else -#define MAX_VAL_ACCUM FLT_MAX -#endif #endif // MIOPEN_USE_FP16 #if MIOPEN_USE_FP32 == 1 #ifdef __HIP_PLATFORM_HCC__ #define FLOAT float -#define FLOAT_ACCUM float #else #define _FLOAT float -#define _FLOAT_ACCUM float #endif // __HIP_PLATFORM_HCC__ #define SIZEOF_FLOAT 4 // Max value for the main datatype @@ -124,36 +148,28 @@ #else #define MAX_VAL FLT_MAX #endif -// Max value for accumulator -#define MAX_VAL_ACCUM MAX_VAL #endif // MIOPEN_USE_FP32 #if MIOPEN_USE_BFP16 == 1 #ifdef __HIP_PLATFORM_HCC__ #define FLOAT ushort -#define FLOAT_ACCUM float #else #define _FLOAT ushort -#define _FLOAT_ACCUM float #endif // #define SIZEOF_FLOAT 2 // Max value for the main datatype #define MAX_VAL 0x7F7F -// Max value for accumulator -#ifndef FLT_MAX -#define MAX_VAL_ACCUM 3.402823466e+38F -#else -#define MAX_VAL_ACCUM FLT_MAX -#endif #endif // MIOPEN_USE_BFP16 #if MIOPEN_USE_FP16 == 1 #ifdef __HIP_PLATFORM_HCC__ #define CVT_FLOAT2ACCUM(x) (static_cast(x)) #define CVT_ACCUM2FLOAT(x) (static_cast(x)) +#define CVT_INTEGRAL2ACCUM(x) (static_cast(x)) #else #define CVT_FLOAT2ACCUM(x) ((_FLOAT_ACCUM)(x)) #define CVT_ACCUM2FLOAT(x) ((_FLOAT)(x)) +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) #endif // These two are required to uniformly initialize // variables with non-zero literal constants of FP32 type @@ -165,29 +181,48 @@ #endif // MIOPEN_USE_FP16 #if MIOPEN_USE_FP32 == 1 +/// \todo Basically, conversions from float to accum and vice versa +/// should be removed because FLOAT_ACCUM and FLOAT are identical. +/// However this may lead to problems if these macros are used in +/// inappropriate contexts (e.g. with integral types), so this +/// refactoring should be considered as nontrivial and requires +/// a separate PR. Let's keep this historical stuff for now. 
+/// --atamazov 30.08.2023 #ifdef __HIP_PLATFORM_HCC__ #define CVT_FLOAT2ACCUM(x) (static_cast(x)) #define CVT_ACCUM2FLOAT(x) (static_cast(x)) +#define CVT_INTEGRAL2ACCUM(x) (static_cast(x)) #else #define CVT_FLOAT2ACCUM(x) ((_FLOAT_ACCUM)(x)) #define CVT_ACCUM2FLOAT(x) ((_FLOAT)(x)) +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) #endif #define CVT_FP32_2FLOAT(x) (CVT_ACCUM2FLOAT(x)) #define CVT_FP32_2ACCUM(x) (x) #endif // MIOPEN_USE_FP32 #if MIOPEN_USE_BFP16 == 1 -#define CVT_FLOAT2ACCUM(x) bfloat16_to_float(x) -#define CVT_ACCUM2FLOAT(x) float_to_bfloat16(x) +#ifdef __HIP_PLATFORM_HCC__ +#define CVT_FLOAT2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_ACCUM2FLOAT(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_INTEGRAL2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_FP32_2FLOAT(x) MIOPEN_ERROR_NOT_IMLEMENTED +#define CVT_FP32_2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#else +#define CVT_FLOAT2ACCUM(x) (bfloat16_to_float(x)) +#define CVT_ACCUM2FLOAT(x) (float_to_bfloat16(x)) +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT_ACCUM)(x)) #define CVT_FP32_2FLOAT(x) (CVT_ACCUM2FLOAT(x)) #define CVT_FP32_2ACCUM(x) (x) #endif +#endif /// If MIOPEN_USE_NATIVE_DATATYPE_ACCUM is defined as 1 when "float_types.h" is included, /// then all the ACCUM macros (the represent operations and types) will use the native /// datatype (BF16 or FP16) instead of FP32. In other words, the computations will be /// performed using the native datatype even if ACCUM macros are used. This allows for /// building both mixed-precision and "pure" kernels from the single source. +/// Note: This macro has higher priority than MIOPEN_USE_DOUBLE_ACCUM. #ifdef MIOPEN_USE_NATIVE_DATATYPE_ACCUM #if !(MIOPEN_USE_NATIVE_DATATYPE_ACCUM == 0 || MIOPEN_USE_NATIVE_DATATYPE_ACCUM == 1) #error "Invalid value of MIOPEN_USE_NATIVE_DATATYPE_ACCUM" @@ -197,8 +232,14 @@ #endif #if MIOPEN_USE_NATIVE_DATATYPE_ACCUM +#ifdef __HIP_PLATFORM_HCC__ +#undef FLOAT_ACCUM +#define FLOAT_ACCUM MIOPEN_ERROR_NOT_IMLEMENTED +#else #undef _FLOAT_ACCUM #define _FLOAT_ACCUM _FLOAT +#endif + #undef MAX_VAL_ACCUM #define MAX_VAL_ACCUM MAX_VAL #undef CVT_FLOAT2ACCUM @@ -207,6 +248,19 @@ #define CVT_ACCUM2FLOAT(x) (x) #undef CVT_FP32_2ACCUM #define CVT_FP32_2ACCUM(x) (CVT_FP32_2FLOAT(x)) -#endif // !(AVERAGE_OPS && MIOPEN_USE_FP16) + +#undef CVT_INTEGRAL2ACCUM +#ifdef __HIP_PLATFORM_HCC__ +#define CVT_INTEGRAL2ACCUM(x) MIOPEN_ERROR_NOT_IMLEMENTED +#else +#if MIOPEN_USE_BFP16 == 1 +// No direct conversion from integral types to BF16 is available. +// WARNING: Precision loss when integral type is wider than 16 bits. +#define CVT_INTEGRAL2ACCUM(x) (float_to_bfloat16(static_cast(x))) +#else +#define CVT_INTEGRAL2ACCUM(x) ((_FLOAT)(x)) +#endif +#endif +#endif // MIOPEN_USE_NATIVE_DATATYPE_ACCUM #endif // GUARD_FLOAT_TYPES_H diff --git a/src/ocl/pooling_ocl.cpp b/src/ocl/pooling_ocl.cpp index 94aac5d31f..86fca9004b 100644 --- a/src/ocl/pooling_ocl.cpp +++ b/src/ocl/pooling_ocl.cpp @@ -88,26 +88,23 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle, auto index_max = get_index_max(GetIndexType()); - // for kernel implementation max pooling backward pass, - // "index_max" means ghost, and thus should not be reached + /// \anchor max_pooling_index_max_restriction + /// For kernel implementation max pooling backward pass, + /// "index_max" means ghost, and thus should not be reached. 
    if(mode == miopenPoolingMax && save_index)
    {
-        if((workspaceIndexMode == miopenPoolingWorkspaceIndexMask &&
-            !(index_max >= std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<int>()))) ||
-           (workspaceIndexMode == miopenPoolingWorkspaceIndexImage &&
-            !(index_max >= std::accumulate(xDesc.GetLengths().begin() + 2,
-                                           xDesc.GetLengths().end(),
-                                           1,
-                                           std::multiplies<int>()))))
+        if((workspaceIndexMode == miopenPoolingWorkspaceIndexMask //
+            && index_max <= std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<int>())) //
+           ||                                                                                     //
+           (workspaceIndexMode == miopenPoolingWorkspaceIndexImage //
+            && index_max <= std::accumulate(xDesc.GetLengths().begin() + 2,
+                                            xDesc.GetLengths().end(),
+                                            1,
+                                            std::multiplies<int>())))
        {
            MIOPEN_THROW("Index range not enough for max pooling bwd");
        }
-        if(workspaceIndexMode == miopenPoolingWorkspaceIndexMask && pool_dim == 5)
-        {
-            MIOPEN_THROW("3D pooling doesn't support workspace index mask mode");
-        }
-
        if(workSpace == nullptr)
        {
            throw std::invalid_argument("workSpace cannot be NULL in Forward Pooling MAX mode when "
@@ -115,9 +112,9 @@ miopenStatus_t PoolingDescriptor::Forward(Handle& handle,
        }
    }
-    const auto algo_name =
-        AlgorithmName{pool_dim == 5 ? "miopenPoolingNdForward" : "miopenPooling2dForward"};
-    const auto problem = pooling::ProblemDescription{*this, xDesc, yDesc, save_index};
+    // So far, all pooling solvers implement the Direct (trivial) computation algorithm.
+    const auto algo_name = AlgorithmName{"miopenPoolingForwardDirect"};
+    const auto problem   = pooling::ProblemDescription{*this, xDesc, yDesc, save_index};
    const auto invoke_params = [&]() {
        auto tmp = pooling::FwdInvokeParams{};
@@ -180,9 +177,8 @@ miopenStatus_t PoolingDescriptor::Backward(Handle& handle,
        MIOPEN_THROW("Unsupported pooling dimension");
    }
-    const auto problem = pooling::ProblemDescription{*this, xDesc, yDesc, dxDesc, dyDesc};
-    const auto algo_name =
-        AlgorithmName{pool_dim == 5 ?
"miopenPoolingNdBackward" : "miopenPooling2dBackward"}; + const auto problem = pooling::ProblemDescription{*this, xDesc, yDesc, dxDesc, dyDesc}; + const auto algo_name = AlgorithmName{"miopenPoolingBackwardDirect"}; const auto invoke_params = [&]() { auto tmp = pooling::BwdInvokeParams{}; diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 186bc28ff2..5a7918cc64 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -180,7 +180,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context, { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL std::ignore = context; - std::ignore = fdesc_problem; + std::ignore = bn_problem; return false; #else if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_INFER{})) diff --git a/src/solver/pooling/backwardNd.cpp b/src/solver/pooling/backwardNd.cpp index 25c5df3297..77dc917d2a 100644 --- a/src/solver/pooling/backwardNd.cpp +++ b/src/solver/pooling/backwardNd.cpp @@ -31,6 +31,8 @@ #include #include +#define WORKAROUND_ISSUE_MIFIN_80 1 // https://github.com/ROCmSoftwarePlatform/MIFin/issues/80 + namespace miopen { namespace solver { @@ -40,12 +42,25 @@ namespace pooling { bool PoolingBackwardNd::IsApplicable(const ExecutionContext&, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Backward && - (problem.GetPooling().GetMode() == miopenPoolingMax || - problem.GetPooling().GetMode() == miopenPoolingAverage || - problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) && - problem.GetXDesc().GetSize() == 5 && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" && - problem.GetYDesc().GetLayout("NCDHW") == "NCDHW"; + return problem.GetDirection() == miopen::pooling::Direction::Backward // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && ( // + (problem.GetXDesc().GetSize() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // + || // + (problem.GetXDesc().GetSize() == 4 // + && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // + && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // + ) // + /// \todo This solver does not support workspace index mask mode yet. 
+ && !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask); } ConvSolution @@ -54,165 +69,186 @@ PoolingBackwardNd::GetSolution(const ExecutionContext&, { auto result = ConvSolution{miopenStatusSuccess}; + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenPoolingBwdND.cl"; + kernel.kernel_name = "mloPoolingND"; + + if(problem.GetPooling().GetMode() == miopenPoolingMax) + { + kernel.kernel_name += "MaxBwd"; + } + else if(problem.GetPooling().GetMode() == miopenPoolingAverage || + problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) { - auto kernel = KernelInfo{}; + kernel.kernel_name += "AveBwd"; + } - kernel.kernel_file = "MIOpenPoolingBwdND.cl"; - kernel.kernel_name = "mloPoolingND"; + const auto& bot = problem.GetXDesc(); + const auto& top = problem.GetYDesc(); - if(problem.GetPooling().GetMode() == miopenPoolingMax) - { - kernel.kernel_name += "MaxBwd"; - } - else if(problem.GetPooling().GetMode() == miopenPoolingAverage || - problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) - { - kernel.kernel_name += "AveBwd"; - } + std::size_t batch_sz, n_inputs, in_height, in_width; + std::tie(batch_sz, n_inputs, in_height, in_width) = miopen::tien<4>(bot.GetLengths(), 1); - std::size_t batch_sz, n_inputs, in_height, in_width; - std::tie(batch_sz, n_inputs, in_height, in_width) = - miopen::tien<4>(problem.GetXDesc().GetLengths(), 1); - - const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) - ? MLO_POOLING_OP_MAX - : ((problem.GetPooling().GetMode() == miopenPoolingAverage) - ? MLO_POOLING_OP_AVE - : MLO_POOLING_OP_AVE_INCLUSIVE); - - int pix_w_per_work = 1; - int pix_h_per_work = 4; - int pix_d_per_work = 2; - - int batch = problem.GetDYDesc().GetLengths()[0]; - int chal = problem.GetDYDesc().GetLengths()[1]; - - int bot_d = *(problem.GetDXDesc().GetLengths().rbegin() + 2); - int bot_h = *(problem.GetDXDesc().GetLengths().rbegin() + 1); - int bot_w = *(problem.GetDXDesc().GetLengths().rbegin()); - - int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1); - int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1); - int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1); - - int max_activ_workitem = 65536; - int total_work = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d; - int activ_work = std::min(total_work, max_activ_workitem); - - size_t lcl_work = 64; - size_t grp_num = (activ_work + lcl_work - 1) / lcl_work; - - bool territory_overlap = false; - for(std::size_t i = 0; i < problem.GetPooling().strides.size(); i++) - territory_overlap |= (problem.GetPooling().strides[i] < problem.GetPooling().lens[i]); - - const auto build_params = - KernelBuildParameters{ - {"MLO_POOLING_OP_ID", static_cast(pooling_method)}, - {"MAX_ACTIV_WORKITEM", static_cast(max_activ_workitem)}, - {"MLO_POOLING_GROUP_SZ0", static_cast(lcl_work)}, - {"MLO_POOLING_GROUP_SZ1", 1}, - {"MLO_POOLING_GROUP_SZ2", 1}, - {"PIX_W_PER_WORK", static_cast(pix_w_per_work)}, - {"PIX_H_PER_WORK", static_cast(pix_h_per_work)}, - {"PIX_D_PER_WORK", static_cast(pix_d_per_work)}, - {"KERNEL_SZ_D", static_cast(problem.GetPooling().lens[0])}, - {"KERNEL_SZ_H", static_cast(problem.GetPooling().lens[1])}, - {"KERNEL_SZ_W", static_cast(problem.GetPooling().lens[2])}, - {"STRIDE_D", static_cast(problem.GetPooling().strides[0])}, - {"STRIDE_H", static_cast(problem.GetPooling().strides[1])}, - {"STRIDE_W", 
static_cast(problem.GetPooling().strides[2])}, - {"TERRITORY_OVERLAP", static_cast(territory_overlap)}, - {"MLO_POOLING_INDEX_TYPE", - get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, - {"MLO_POOLING_INDEX_MAX", - get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, - } - << GetDataTypeKBP(problem.GetDYDesc().GetType()); + const int pooling_method = (problem.GetPooling().GetMode() == miopenPoolingMax) + ? MLO_POOLING_OP_MAX + : ((problem.GetPooling().GetMode() == miopenPoolingAverage) + ? MLO_POOLING_OP_AVE + : MLO_POOLING_OP_AVE_INCLUSIVE); + + int pix_w_per_work = 1; + int pix_h_per_work = 4; + int pix_d_per_work = 2; + + int batch = top.GetLengths()[0]; + int chal = top.GetLengths()[1]; + + const bool is2d = (bot.GetSize() == 4); + + int bot_d = is2d ? 1 : *(bot.GetLengths().rbegin() + 2); + int bot_h = *(bot.GetLengths().rbegin() + 1); + int bot_w = *(bot.GetLengths().rbegin()); - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1); + int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1); + int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1); - kernel.l_wk = {lcl_work, 1, 1}; - kernel.g_wk = {lcl_work * grp_num, 1, 1}; + int max_activ_workitem = 65536; + int total_work = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d; + int activ_work = std::min(total_work, max_activ_workitem); - result.construction_params.push_back(kernel); +#if WORKAROUND_ISSUE_MIFIN_80 + const std::size_t wavesize = 64; +#else + const std::size_t wavesize = context.GetStream().GetWavefrontWidth(); +#endif + size_t grp_num = (activ_work + wavesize - 1) / wavesize; + + auto strides = problem.GetPooling().strides; + auto lens = problem.GetPooling().lens; + auto pads = problem.GetPooling().pads; + + if(is2d) + { + strides.push_back(strides[1]); + strides[1] = strides[0]; + lens.push_back(lens[1]); + lens[1] = lens[0]; + lens[0] = 1; + pads.push_back(pads[1]); + pads[1] = pads[0]; + pads[0] = 0; } - result.invoker_factory = [](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); + bool territory_overlap = false; + for(std::size_t i = 0; i < strides.size(); i++) + territory_overlap |= (strides[i] < lens[i]); + + const auto build_params = + KernelBuildParameters{ + {"MLO_POOLING_OP_ID", pooling_method}, + {"MAX_ACTIV_WORKITEM", max_activ_workitem}, + {"MLO_POOLING_GROUP_SZ0", wavesize}, + {"MLO_POOLING_GROUP_SZ1", 1}, + {"MLO_POOLING_GROUP_SZ2", 1}, + {"PIX_W_PER_WORK", pix_w_per_work}, + {"PIX_H_PER_WORK", pix_h_per_work}, + {"PIX_D_PER_WORK", pix_d_per_work}, + {"KERNEL_SZ_D", lens[0]}, + {"KERNEL_SZ_H", lens[1]}, + {"KERNEL_SZ_W", lens[2]}, + {"STRIDE_D", strides[0]}, + {"STRIDE_H", strides[1]}, + {"STRIDE_W", strides[2]}, + {"TERRITORY_OVERLAP", static_cast(territory_overlap)}, + {"MLO_POOLING_INDEX_TYPE", + get_pooling_index_type_name(problem.GetPooling().GetIndexType())}, + {"MLO_POOLING_INDEX_MAX", + get_pooling_index_type_max_name(problem.GetPooling().GetIndexType())}, + } + << GetDataTypeKBP(problem.GetDYDesc().GetType()); + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - const auto top_d = *(params.dyDesc.GetLengths().rbegin() + 2); - const auto top_h = *(params.dyDesc.GetLengths().rbegin() + 1); - const auto top_w = *(params.dyDesc.GetLengths().rbegin()); + kernel.l_wk = 
{wavesize, 1, 1}; + kernel.g_wk = {wavesize * grp_num, 1, 1}; - int pix_w_per_work = 1; - int pix_h_per_work = 4; - int pix_d_per_work = 2; + result.construction_params.push_back(kernel); - int batch = params.dyDesc.GetLengths()[0]; - int chal = params.dyDesc.GetLengths()[1]; + const auto top_d = is2d ? 1 : *(top.GetLengths().rbegin() + 2); + const auto top_h = *(top.GetLengths().rbegin() + 1); + const auto top_w = *(top.GetLengths().rbegin()); - int bot_d = *(params.dxDesc.GetLengths().rbegin() + 2); - int bot_h = *(params.dxDesc.GetLengths().rbegin() + 1); - int bot_w = *(params.dxDesc.GetLengths().rbegin()); + auto unpackStrides = [is2d](const auto& strides) { + return std::make_tuple(strides[0], // N stride + strides[1], // C stride + strides[2], // D stride. Same as H_stride in 3D converted from 2D. + is2d // + ? strides[2] // 2D H stride + : strides[3] // 3D H stride + ); + }; - int pix_blk_w = std::max((bot_w + pix_w_per_work - 1) / pix_w_per_work, 1); - int pix_blk_h = std::max((bot_h + pix_h_per_work - 1) / pix_h_per_work, 1); - int pix_blk_d = std::max((bot_d + pix_d_per_work - 1) / pix_d_per_work, 1); + std::size_t bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride; + std::size_t top_n_stride, top_c_stride, top_d_stride, top_h_stride; + std::tie(bot_n_stride, bot_c_stride, bot_d_stride, bot_h_stride) = + unpackStrides(bot.GetStrides()); + std::tie(top_n_stride, top_c_stride, top_d_stride, top_h_stride) = + unpackStrides(top.GetStrides()); - int total_work = batch * chal * pix_blk_w * pix_blk_h * pix_blk_d; + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); if(params.pooling.GetMode() == miopenPoolingMax) { kernel(params.dy, params.dx, params.workspace, - static_cast(params.pooling.pads[0]), - static_cast(params.pooling.pads[1]), - static_cast(params.pooling.pads[2]), + static_cast(pads[0]), + static_cast(pads[1]), + static_cast(pads[2]), static_cast(batch), static_cast(chal), - static_cast(params.dxDesc.GetLengths()[2]), - static_cast(params.dxDesc.GetLengths()[3]), - static_cast(params.dxDesc.GetLengths()[4]), + static_cast(bot_d), + static_cast(bot_h), + static_cast(bot_w), static_cast(top_d), static_cast(top_h), static_cast(top_w), - static_cast(params.dxDesc.GetStrides()[0]), - static_cast(params.dxDesc.GetStrides()[1]), - static_cast(params.dxDesc.GetStrides()[2]), - static_cast(params.dxDesc.GetStrides()[3]), - static_cast(params.dyDesc.GetStrides()[0]), - static_cast(params.dyDesc.GetStrides()[1]), - static_cast(params.dyDesc.GetStrides()[2]), - static_cast(params.dyDesc.GetStrides()[3]), + static_cast(bot_n_stride), + static_cast(bot_c_stride), + static_cast(bot_d_stride), + static_cast(bot_h_stride), + static_cast(top_n_stride), + static_cast(top_c_stride), + static_cast(top_d_stride), + static_cast(top_h_stride), static_cast(total_work)); } else { kernel(params.dy, params.dx, - static_cast(params.pooling.pads[0]), - static_cast(params.pooling.pads[1]), - static_cast(params.pooling.pads[2]), + static_cast(pads[0]), + static_cast(pads[1]), + static_cast(pads[2]), static_cast(batch), static_cast(chal), - static_cast(params.dxDesc.GetLengths()[2]), - static_cast(params.dxDesc.GetLengths()[3]), - static_cast(params.dxDesc.GetLengths()[4]), + static_cast(bot_d), + static_cast(bot_h), + static_cast(bot_w), static_cast(top_d), static_cast(top_h), static_cast(top_w), - 
static_cast(params.dxDesc.GetStrides()[0]), - static_cast(params.dxDesc.GetStrides()[1]), - static_cast(params.dxDesc.GetStrides()[2]), - static_cast(params.dxDesc.GetStrides()[3]), - static_cast(params.dyDesc.GetStrides()[0]), - static_cast(params.dyDesc.GetStrides()[1]), - static_cast(params.dyDesc.GetStrides()[2]), - static_cast(params.dyDesc.GetStrides()[3]), + static_cast(bot_n_stride), + static_cast(bot_c_stride), + static_cast(bot_d_stride), + static_cast(bot_h_stride), + static_cast(top_n_stride), + static_cast(top_c_stride), + static_cast(top_d_stride), + static_cast(top_h_stride), static_cast(total_work)); } }; diff --git a/src/solver/pooling/forwardNaive.cpp b/src/solver/pooling/forwardNaive.cpp index d8a13a330f..c0d0ccb5b2 100644 --- a/src/solver/pooling/forwardNaive.cpp +++ b/src/solver/pooling/forwardNaive.cpp @@ -68,18 +68,21 @@ inline uint32_t RoundUpNearestPower2Positive(uint32_t v) bool PoolingForwardNaive::IsApplicable(const ExecutionContext&, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward // - && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // - && (problem.GetXDesc().GetType() == miopenFloat // - || problem.GetXDesc().GetType() == miopenHalf) // - && ( // - (problem.GetXDesc().GetSize() == 5 // - && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // - && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // - || // - (problem.GetXDesc().GetSize() == 4 // - && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // - && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // + return problem.GetDirection() == miopen::pooling::Direction::Forward // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) // + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && ( // + (problem.GetXDesc().GetSize() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW") // + || // + (problem.GetXDesc().GetSize() == 4 // + && problem.GetXDesc().GetLayout("NCHW") == "NCHW" // + && problem.GetYDesc().GetLayout("NCHW") == "NCHW") // ); } diff --git a/src/solver/pooling/forwardNd.cpp b/src/solver/pooling/forwardNd.cpp index 7663b1cdfe..183e8b3705 100644 --- a/src/solver/pooling/forwardNd.cpp +++ b/src/solver/pooling/forwardNd.cpp @@ -106,11 +106,24 @@ std::size_t sizeof_private_memory(const miopen::pooling::ProblemDescription& pro bool PoolingForwardNd::IsApplicable(const ExecutionContext& context, const miopen::pooling::ProblemDescription& problem) const { - return problem.GetDirection() == miopen::pooling::Direction::Forward && - problem.GetXDesc().GetSize() == 5 && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" && - problem.GetYDesc().GetLayout("NCDHW") == "NCDHW" && - sizeof_private_memory(problem) <= - TargetProperties::GetMaxWaveScratchSize() / context.GetStream().GetWavefrontWidth(); + + return problem.GetDirection() == miopen::pooling::Direction::Forward // + && problem.GetXDesc().GetSize() == 5 // + && problem.GetXDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetYDesc().GetLayout("NCDHW") == "NCDHW" // + && problem.GetXDesc().GetType() == problem.GetYDesc().GetType() // + && (problem.GetXDesc().GetType() == miopenFloat // + || problem.GetXDesc().GetType() == miopenHalf) 
// + && (problem.GetPooling().GetMode() == miopenPoolingMax // + || problem.GetPooling().GetMode() == miopenPoolingAverage // + || problem.GetPooling().GetMode() == miopenPoolingAverageInclusive) // + && sizeof_private_memory(problem) <= TargetProperties::GetMaxWaveScratchSize() // + / context.GetStream().GetWavefrontWidth() // + /// \todo This solver does not support workspace index mask mode yet. + && + !(problem.GetPooling().GetMode() == miopenPoolingMax // + && problem.GetPooling().GetWorkspaceIndexMode() == miopenPoolingWorkspaceIndexMask // + && problem.SaveIndex() == true); } ConvSolution PoolingForwardNd::GetSolution(const ExecutionContext&, diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ad0a7c8a1f..9f6432ec6d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -252,13 +252,16 @@ elseif(MIOPEN_TEST_BFLOAT16) test_deepbench_conv test_conv_igemm_dynamic_xdlops_nhwc_wrw_bf16 test_conv_igemm_dynamic_xdlops_nhwc_fwd_bf16 test_conv_igemm_dynamic_xdlops_nhwc_bwd_bf16) - endif() - if(${CODECOV_TEST}) - list(APPEND SKIP_TESTS test_conv3d test_conv3d_find2 test_immed_conv3d test_immed_conv2d test_pooling2d test_pooling2d_asymmetric) - # replaced by smaller tests with suffix _codecov - endif() +endif() + +if(${CODECOV_TEST}) + list(APPEND SKIP_TESTS + test_conv3d test_conv3d_find2 test_immed_conv3d test_immed_conv2d test_pooling2d test_pooling2d_asymmetric + test_pooling2d_wide) + # replaced by smaller tests with suffix _codecov +endif() -if (MIOPEN_NO_GPU) +if(MIOPEN_NO_GPU) set(SKIP_ALL_EXCEPT_TESTS test_include_inliner test_kernel_build_params test_test_errors test_type_name test_tensor_test test_sqlite_perfdb test_sequences test_pooling3d test_perfdb) @@ -714,27 +717,14 @@ if(${MIOPEN_TEST_WITH_MIOPENDRIVER}) add_custom_test(test_miopendriver_regression_half SKIP_UNLESS_ALL GFX94X_ENABLED GFX103X_ENABLED GFX110X_ENABLED FLOAT_DISABLED HALF_ENABLED # Regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/1576 COMMAND MIOPEN_FIND_MODE=1 MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvDirectNaiveConvBwd $ ${MIOPENDRIVER_MODE_CONV} --forw 2 --in_layout NCHW --out_layout NCHW --fil_layout NCHW -n 256 -c 1024 -H 14 -W 14 -k 256 -y 1 -x 1 -p 0 -q 0 -u 1 -v 1 -l 1 -j 1 -m conv -g 1 -t 1 - # WORKAROUND_ISSUE_2110_2: tests for 2109, 2110 and 2160 shall be added to "test_pooling2d/3d --all" but this is - # impossible until backward pooling limitation (issue #2110 (2)) is fixed. - # Regression tests for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2109 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x255x255,195075x65025x255x1 -y 255 -x 255 -p 0 -q 0 -v 1 -u 1 -m avg -F 1 -t 1 -i 1 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x227x227,154587x51529x227x1 -y 100 -x 100 -p 0 -q 0 -v 1 -u 1 -m avg -F 0 -t 1 -i 1 - # Regression tests for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2160 + # WORKAROUND_ISSUE_2110_2: tests for 2110 and 2160 shall be added to "test_pooling3d --all" but this is + # impossible until backward pooling limitation (issue #2110 (2)) is fully fixed. + # Partial (3D only) regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2160. 
COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x64x41x40x70 -y 41 -x 40 -Z 70 -m avg -F 1 -t 1 -i 1 - # Regression tests for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2110 (1) - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x64x410x400 -y 410 -x 400 -m avg -F 1 -t 1 -i 1 + # Partial (3D only) regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2110 (1). COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x64x41x40x100 -y 4 -x 4 -Z 100 -m max -F 1 -t 1 -i 1 ) - add_custom_test(test_miopendriver_regression_float SKIP_UNLESS_ALL GFX103X_ENABLED GFX110X_ENABLED - # WORKAROUND_ISSUE_2110_2: tests for 2109 shall be added to "test_pooling2d/3d --all" but this is - # impossible until backward pooling limitation (issue #2110 (2)) is fixed. - # Regression test for https://github.com/ROCmSoftwarePlatform/MIOpen/issues/2109 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x255x255,195075x65025x255x1 -y 255 -x 255 -p 0 -q 0 -v 1 -u 1 -m avg -F 1 -t 1 -i 1 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x227x227,154587x51529x227x1 -y 100 -x 100 -p 0 -q 0 -v 1 -u 1 -m avg -F 0 -t 1 -i 1 - COMMAND $ ${MIOPENDRIVER_MODE_POOL} -M 0 --input 1x3x227x63,42903x14301x63x1 -y 30 -x 30 -p 0 -q 0 -v 1 -u 1 -m avg -F 0 -t 1 -i 1 - ) - add_custom_test(test_miopendriver_regression_int8 SKIP_UNLESS_ALL GFX94X_ENABLED GFX103X_ENABLED GFX110X_ENABLED FLOAT_DISABLED INT8_ENABLED COMMAND MIOPEN_FIND_MODE=1 MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvDirectNaiveConvFwd $ ${MIOPENDRIVER_MODE_CONV} --forw 1 --in_layout NCHW --out_layout NCHW --fil_layout NCHW -n 256 -c 1024 -H 14 -W 14 -k 256 -y 1 -x 1 -p 0 -q 0 -u 1 -v 1 -l 1 -j 1 -m conv -g 1 -t 1 ) @@ -788,6 +778,10 @@ add_custom_test(test_pooling2d_asymmetric SKIP_UNLESS_ALL HALF_ENABLED GFX94X_EN COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --all --dataset 1 --limit 0 ${MIOPEN_TEST_FLAGS_ARGS} ) +add_custom_test(test_pooling2d_wide SKIP_UNLESS_ALL HALF_ENABLED GFX94X_ENABLED GFX103X_ENABLED GFX110X_ENABLED + COMMAND $ ${MIOPEN_TEST_FLOAT_ARG} --all --dataset 2 --limit 0 ${MIOPEN_TEST_FLAGS_ARGS} +) + set(IMPLICITGEMM_MLIR_ENV_F_XDLOPS ${IMPLICITGEMM_MLIR_ENV_BASE} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvMlirIgemmFwdXdlops) set(IMPLICITGEMM_MLIR_ENV_B_XDLOPS ${IMPLICITGEMM_MLIR_ENV_BASE} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvMlirIgemmBwdXdlops) set(IMPLICITGEMM_MLIR_ENV_W_XDLOPS ${IMPLICITGEMM_MLIR_ENV_BASE} MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvMlirIgemmWrWXdlops) diff --git a/test/pooling2d.cpp b/test/pooling2d.cpp index 571f44cd54..c0df88c481 100644 --- a/test/pooling2d.cpp +++ b/test/pooling2d.cpp @@ -59,6 +59,12 @@ struct pooling2d_driver : pooling_driver // Dataset 1 is intended for testing of asymmetric configs. std::vector get_2d_pooling_input_shapes_minimal() { return {{1, 4, 4, 4}}; } + // Dataset 2 is intended for testing of configs with wide window. 
+ std::vector get_2d_pooling_input_shapes_wide() + { + return {{1, 3, 255, 255}, {2, 3, 227, 227}, {1, 7, 127, 127}, {1, 1, 410, 400}}; + } + public: pooling2d_driver() : pooling_driver() { @@ -67,33 +73,35 @@ struct pooling2d_driver : pooling_driver std::vector in_dim_vec(in_dim_set.begin(), in_dim_set.end()); this->add(this->in_shape, "input", this->generate_data(in_dim_vec, {16, 32, 8, 8})); #else - this->add(this->in_shape, - "input", - this->template generate_multi_data_limited( - {get_2d_pooling_input_shapes(), get_2d_pooling_input_shapes_minimal()}, 9)); -#endif this->add( - this->lens, - "lens", - this->template generate_multi_data({{{2, 2}, {3, 3}}, {{2, 2}, {1, 2}, {2, 1}}})); + this->in_shape, + "input", + this->template generate_multi_data_limited({get_2d_pooling_input_shapes(), + get_2d_pooling_input_shapes_minimal(), + get_2d_pooling_input_shapes_wide()}, + 9)); +#endif + this->add(this->lens, + "lens", + this->template generate_multi_data( + {{{2, 2}, {3, 3}}, // + {{2, 2}, {1, 2}, {2, 1}}, // + {{35, 35}, {100, 100}, {255, 255}, {410, 400}}})); this->add(this->strides, "strides", - this->template generate_multi_data( - {{{2, 2}, {1, 1}}, {{1, 1}, {2, 1}, {1, 2}, {2, 2}}})); + this->template generate_multi_data({{{2, 2}, {1, 1}}, // + {{1, 1}, {2, 1}, {1, 2}, {2, 2}}, // + {{1, 1}}})); + // clang-format off this->add(this->pads, "pads", this->template generate_multi_data({ - {{0, 0}, {1, 1}}, + {{0, 0}, {1, 1}}, // #if WORKAROUND_ISSUE_1670 - { - { - 0, 0 - } - } + {{0, 0}}, // #else - { - {0, 0}, {0, 1}, {1, 0}, { 1, 1 } - } + {{0, 0}, {0, 1}, {1, 0}, {1, 1}}, // #endif - })); + {{0, 0}}})); + // clang-format on this->add(this->wsidx, "wsidx", this->generate_data({0, 1})); } }; diff --git a/test/pooling_common.hpp b/test/pooling_common.hpp index 873c203aaf..9058d34321 100644 --- a/test/pooling_common.hpp +++ b/test/pooling_common.hpp @@ -59,6 +59,24 @@ static int num_uint64_case = 0; // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) static int num_uint64_case_imgidx = 0; +static inline void print(const miopen::PoolingDescriptor& filter) +{ + std::cout << "Pooling: "; + if(filter.GetMode() == miopenPoolingAverage) + std::cout << "Average"; + else if(filter.GetMode() == miopenPoolingAverageInclusive) + std::cout << "AverageInclusive"; + else + std::cout << "Max"; + std::cout << std::endl; + std::cout << "Lengths: "; + miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; + std::cout << "Pads: "; + miopen::LogRange(std::cout, filter.GetPads(), ", ") << std::endl; + std::cout << "Strides: "; + miopen::LogRange(std::cout, filter.GetStrides(), ", ") << std::endl; +} + template tensor get_output_tensor(const miopen::PoolingDescriptor& filter, const tensor& input) { @@ -208,20 +226,8 @@ struct verify_forward_pooling const miopen::PoolingDescriptor& filter, const std::vector&) const { - std::cout << "Forward pooling: "; - if(filter.GetMode() == miopenPoolingAverage) - std::cout << "Average"; - else if(filter.GetMode() == miopenPoolingAverageInclusive) - std::cout << "AverageInclusive"; - else - std::cout << "Max"; - std::cout << std::endl; - std::cout << "Lengths: "; - miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; - std::cout << "Pads: "; - miopen::LogRange(std::cout, filter.GetPads(), ", ") << std::endl; - std::cout << "Strides: "; - miopen::LogRange(std::cout, filter.GetStrides(), ", ") << std::endl; + std::cout << "Forward "; + print(filter); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; std::cout 
<< "Output tensor: " << filter.GetForwardOutputTensor(input.desc).ToString() << std::endl; @@ -241,7 +247,7 @@ struct verify_backward_pooling bool verify_index) const { auto dinput = input; - std::vector din_vec(input.desc.GetElementSpace(), T(0)); + std::vector din_vec(input.desc.GetElementSpace(), 0.0); CHECK(dout.desc == out.desc); std::array in_dim{}; std::copy_n(input.desc.GetLengths().begin(), SptDim + 2, in_dim.begin()); @@ -360,7 +366,8 @@ struct verify_backward_pooling din_idx += in_idx[i] * in_str[i]; } - din_vec.at(din_idx) += dout(o, w, out_spatial_id_pack...) / pool_size; + din_vec.at(din_idx) += + static_cast(dout(o, w, out_spatial_id_pack...)) / pool_size; } }); }); @@ -432,22 +439,10 @@ struct verify_backward_pooling bool, bool) const { - std::cout << "Backward pooling: "; - if(filter.GetMode() == miopenPoolingAverage) - std::cout << "Average"; - else if(filter.GetMode() == miopenPoolingAverageInclusive) - std::cout << "AverageInclusive"; - else - std::cout << "Max"; - std::cout << std::endl; - std::cout << "Lengths: "; - miopen::LogRange(std::cout, filter.GetLengths(), ", ") << std::endl; - std::cout << "Pads: "; - miopen::LogRange(std::cout, filter.GetPads(), ", ") << std::endl; - std::cout << "Strides: "; - miopen::LogRange(std::cout, filter.GetStrides(), ", ") << std::endl; - std::cout << "Output tensor: " << out.desc.ToString() << std::endl; + std::cout << "Backward "; + print(filter); std::cout << "Input tensor: " << input.desc.ToString() << std::endl; + std::cout << "Output tensor: " << out.desc.ToString() << std::endl; } }; @@ -491,10 +486,14 @@ struct pooling_driver : test_driver { add(index_type, "index_type", - generate_data({"miopenIndexUint8", - "miopenIndexUint16", - "miopenIndexUint32", - "miopenIndexUint64"})); + generate_multi_data( // + {{"miopenIndexUint8", + "miopenIndexUint16", + "miopenIndexUint32", + "miopenIndexUint64"}, // + {"miopenIndexUint8", "miopenIndexUint32"}, // + {"miopenIndexUint32"}} // + )); add(mode, "mode", generate_data( @@ -530,29 +529,95 @@ struct pooling_driver : test_driver auto idx_sz = sizeof(uint8_t); int spt_dim = in_shape.size() - 2; const bool skip_many_configs_with_non_int8_index = - (dataset_id == 0); // Otherwise the default dataset takes too much time. + (dataset_id == 0) && full_set; // Otherwise the default dataset takes too much time. + const bool wide_dataset = (dataset_id == 2) && full_set; + + filter = miopen::PoolingDescriptor + { + mode_lookup.at(miopen::ToUpper(mode)), +#if TEST_PADDING_MODE == 1 + pmode_lookup.at(miopen::ToUpper(pmode)), +#else + miopenPaddingDefault, +#endif + lens, strides, pads + }; + + filter.SetIndexType(idx_typ); + filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx)); + + if(wsidx == 0 && spt_dim == 3 && filter.GetMode() == miopenPoolingMax && full_set) + { + show_command(); + std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 3D max pooling solvers." + << std::endl; + return; + } + + if(wsidx == 0 && spt_dim == 2 && filter.GetMode() == miopenPoolingMax && wide_dataset) + { + show_command(); + std::cout << "Warning: Config skipped. Workspace index mask mode is not implemented " + "yet in 2D max backward solvers that support wide pooling window." + << std::endl; + return; + } + + if(wsidx == 0 && + (filter.GetMode() == miopenPoolingAverage || + filter.GetMode() == miopenPoolingAverageInclusive) && + full_set) + { + show_command(); + std::cout << "Warning: Config skipped. 
Workspace index modes are irrelevant for "
+                         "Average pooling. "
+                         "In order to optimize performance of full tests, we "
+                         "skip average pooling configs when (wsidx == 0). "
+                         "Please make sure that dataset includes counterparts with (wsidx == 1)."
+                      << std::endl;
+            return;
+        }
+
        switch(idx_typ)
        {
+        /// The "index is too small" limitation is an approximation
+        /// of the real limitation, and therefore applied only when
+        /// the "full test" is ran. See:
+        /// \ref max_pooling_index_max_restriction
        case miopenIndexUint8: {
-            // index size too small for 3D image
-            if(spt_dim == 3 || (spt_dim == 2 && wsidx == 1))
+            if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set &&
+               filter.GetMode() == miopenPoolingMax)
            {
+                show_command();
+                std::cout << "Warning: Config skipped: uint8 index is too small "
+                             "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) "
+                             "&& filter.GetMode() == miopenPoolingMax"
+                          << std::endl;
                return;
            }
            break;
        }
        case miopenIndexUint16: {
-            // index size too small for 3D image
-            if(spt_dim == 3 || (spt_dim == 2 && wsidx == 1))
+            if((spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) && full_set &&
+               filter.GetMode() == miopenPoolingMax)
            {
+                show_command();
+                std::cout << "Warning: Config skipped: uint16 index is too small "
+                             "(spt_dim == 3 || (spt_dim == 2 && wsidx == 1)) "
+                             "&& filter.GetMode() == miopenPoolingMax"
+                          << std::endl;
                return;
            }
-
            if(skip_many_configs_with_non_int8_index)
            {
                // test_pooling_test --all only test 5 uint16 cases
                if(num_uint16_case > 5)
                {
+                    show_command();
+                    std::cout << "Warning: Config skipped for the default dataset to speed "
+                                 "up testing (num_uint16_case > 5)"
+                              << std::endl;
                    return;
                }
                ++num_uint16_case;
@@ -566,20 +631,29 @@
            // test_pooling_test --all only test 5 uint32 cases
            if(wsidx == 0)
            {
-                if(num_uint32_case > 5 || spt_dim == 3)
+                if(num_uint32_case > 5)
+                {
+                    show_command();
+                    std::cout << "Warning: Config skipped for the default dataset to speed up "
+                                 "testing (wsidx == 0 && num_uint32_case > 5)"
+                              << std::endl;
                    return;
-
+                }
                ++num_uint32_case;
            }
            else
            {
                if(num_uint32_case_imgidx > 5)
+                {
+                    show_command();
+                    std::cout << "Warning: Config skipped for the default dataset to speed up "
+                                 "testing (wsidx != 0 && num_uint32_case_imgidx > 5)"
+                              << std::endl;
                    return;
-
+                }
                ++num_uint32_case_imgidx;
            }
        }
-
        idx_sz = sizeof(uint32_t);
        break;
    }
@@ -588,20 +662,30 @@
    {
        if(wsidx == 0)
        {
-            if(num_uint64_case > 5 || spt_dim == 3)
+            if(num_uint64_case > 5)
+            {
+                show_command();
+                std::cout << "Warning: Config skipped for the default dataset to speed up "
+                             "testing (wsidx == 0) && (num_uint64_case > 5)"
+                          << std::endl;
                return;
-
+            }
            ++num_uint64_case;
        }
        else
        {
            if(num_uint64_case_imgidx > 5 && spt_dim == 2)
+            {
+                show_command();
+                std::cout << "Warning: Config skipped to speed up testing of the "
+                             "default dataset (wsidx != 0) && (num_uint64_case_imgidx > 5 "
+                             "&& spt_dim == 2)"
+                          << std::endl;
                return;
-
+            }
            ++num_uint64_case_imgidx;
        }
    }
-
    idx_sz = sizeof(uint64_t);
    break;
}
@@ -611,42 +695,39 @@
    if(spt_dim != 2 && spt_dim != 3)
    {
+        show_command();
+        std::cout << "Warning: Config skipped because it is not supported " //
+                     "(spt_dim != 2 && spt_dim != 3)"
+                  << std::endl;
        return;
    }
-    filter = miopen::PoolingDescriptor
-    {
-        mode_lookup.at(miopen::ToUpper(mode)),
-#if TEST_PADDING_MODE == 1
-        pmode_lookup.at(miopen::ToUpper(pmode))
-#else
-        miopenPaddingDefault
-#endif
-        ,
-        lens, strides, pads
-    };
-
-    filter.SetIndexType(idx_typ);
-    filter.SetWorkspaceIndexMode(miopenPoolingWorkspaceIndexMode_t(wsidx));
-
    for(int i = 0; i < spt_dim; i++)
-        if(lens[i] >= (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i]))
+        if(lens[i] > (input_desc.GetLengths()[i + 2] + static_cast(2) * pads[i]))
        {
+            show_command();
+            std::cout << "Warning: Config skipped because it is invalid "
+                         "(lens[i] > (input_desc.GetLengths()[i + 2] + 2 * pads[i]))"
+                      << std::endl;
            return;
        }
-
-    auto output_desc = filter.GetForwardOutputTensor(input_desc);
-    size_t total_mem = 3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() +
-                       idx_sz * output_desc.GetElementSize(); // estimate based on backward pass
-
-    size_t device_mem = get_handle().GetGlobalMemorySize();
-    if(total_mem >= device_mem)
+    if(full_set)
    {
-        show_command();
-        std::cout << "Config requires " << total_mem
-                  << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem
-                  << " Bytes of memory." << std::endl;
-        return;
+        auto output_desc = filter.GetForwardOutputTensor(input_desc);
+        size_t total_mem =
+            3 * input_desc.GetNumBytes() + output_desc.GetNumBytes() +
+            idx_sz * output_desc.GetElementSize(); // estimate based on backward pass
+
+        size_t device_mem = get_handle().GetGlobalMemorySize();
+        if(total_mem >= device_mem)
+        {
+            show_command();
+            std::cout << "Config skipped because it requires " << total_mem
+                      << " Bytes to write all necessary tensors to GPU. GPU has " << device_mem
+                      << " Bytes of memory." << std::endl;
+            return;
+        }
    }
    std::vector in_dim(input_desc.GetLengths().begin() + 2, input_desc.GetLengths().end());
@@ -691,61 +772,52 @@
            return;
        }
#endif
-        std::vector<int> check_dim(spt_dim);
-        for(int i = 0; i < spt_dim; i++)
+        switch(filter.GetIndexType())
        {
-            check_dim[i] = in_dim[i] + 2 * filter.GetPads()[i] - ker_dim[i];
+        case miopenIndexUint8: {
+            if(spt_dim == 3)
+            {
+                run_impl<uint8_t, 3>();
+            }
+            else
+            {
+                run_impl<uint8_t, 2>();
+            }
+            break;
        }
-
-        if(std::all_of(check_dim.begin(), check_dim.end(), [](int i) { return i > 0; }))
-        {
-            switch(filter.GetIndexType())
+        case miopenIndexUint16: {
+            if(spt_dim == 3)
            {
-            case miopenIndexUint8: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint8_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint8_t, 2>();
-                }
-                break;
+                run_impl<uint16_t, 3>();
            }
-            case miopenIndexUint16: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint16_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint16_t, 2>();
-                }
-                break;
+            else
+            {
+                run_impl<uint16_t, 2>();
            }
-            case miopenIndexUint32: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint32_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint32_t, 2>();
-                }
-                break;
+            break;
+        }
+        case miopenIndexUint32: {
+            if(spt_dim == 3)
+            {
+                run_impl<uint32_t, 3>();
            }
-            case miopenIndexUint64: {
-                if(spt_dim == 3)
-                {
-                    run_impl<uint64_t, 3>();
-                }
-                else
-                {
-                    run_impl<uint64_t, 2>();
-                }
-                break;
+            else
+            {
+                run_impl<uint32_t, 2>();
+            }
+            break;
+        }
+        case miopenIndexUint64: {
+            if(spt_dim == 3)
+            {
+                run_impl<uint64_t, 3>();
            }
+            else
+            {
+                run_impl<uint64_t, 2>();
            }
+            break;
+        }
        }
    }
};

From e9697654bf154cee714f3cb06aaa4f949f2bc19f Mon Sep 17 00:00:00 2001
From: Sam Wu
Date: Tue, 19 Sep 2023 12:06:36 -0600
Subject: [PATCH 04/36] [Doc] Update read the docs yaml configuration with
 build.os (#2398)

---
 .readthedocs.yaml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 43a0890c96..2c1c2064c5 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -9,6 +9,10 @@ sphinx:
 formats: [htmlzip]
 python:
-  version: "3.8"
   install:
     - requirements: docs/.sphinx/requirements.txt
+
+build:
+  os: ubuntu-20.04
+  tools:
+    python: "3.8"

From 91ccb03aec691eb5f915b2c381b77af09b31e2a5 Mon Sep 17 00:00:00 2001
From: Seungman Han <120356720+seungmanhan@users.noreply.github.com>
Date: Wed,
20 Sep 2023 03:13:51 +0900 Subject: [PATCH 05/36] Add support for layernorm primitive (#2320) --- driver/driver.hpp | 12 +- driver/layernorm_driver.hpp | 429 +++++++++++++++++++++++++++++++ driver/main.cpp | 17 ++ driver/mloLayerNormHost.hpp | 91 +++++++ driver/tensor_driver.hpp | 12 +- include/miopen/miopen.h | 63 ++++- src/CMakeLists.txt | 3 + src/include/miopen/layernorm.hpp | 57 ++++ src/kernels/MIOpenLayerNorm.cpp | 119 +++++++++ src/layer_norm.cpp | 135 ++++++++++ src/layernorm_api.cpp | 137 ++++++++++ test/cpu_layernorm.hpp | 83 ++++++ test/gtest/layernorm_test.cpp | 38 +++ test/gtest/layernorm_test.hpp | 247 ++++++++++++++++++ 14 files changed, 1430 insertions(+), 13 deletions(-) create mode 100644 driver/layernorm_driver.hpp create mode 100644 driver/mloLayerNormHost.hpp create mode 100644 src/include/miopen/layernorm.hpp create mode 100644 src/kernels/MIOpenLayerNorm.cpp create mode 100644 src/layer_norm.cpp create mode 100644 src/layernorm_api.cpp create mode 100644 test/cpu_layernorm.hpp create mode 100644 test/gtest/layernorm_test.cpp create mode 100644 test/gtest/layernorm_test.hpp diff --git a/driver/driver.hpp b/driver/driver.hpp index 8e15894705..0862652cd5 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -150,7 +150,11 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16,fp64]\n"); + "tensorop[fp16], reduce[fp16,fp64]" +#ifdef MIOPEN_BETA_API + ", layernorm[bf16, fp16, fp32]" +#endif + "\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -171,7 +175,11 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && - arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "--version") + arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && +#ifdef MIOPEN_BETA_API + arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && +#endif + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/layernorm_driver.hpp b/driver/layernorm_driver.hpp new file mode 100644 index 0000000000..8251472625 --- /dev/null +++ b/driver/layernorm_driver.hpp @@ -0,0 +1,429 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include
+#ifdef MIOPEN_BETA_API
+#ifndef GUARD_MIOPEN_LAYERNORM_DRIVER_HPP
+#define GUARD_MIOPEN_LAYERNORM_DRIVER_HPP
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloLayerNormHost.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include <../test/verify.hpp>
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <../test/tensor_holder.hpp>
+#include "random.hpp"
+
+template <typename Tgpu, typename Tref>
+class LayerNormDriver : public Driver
+{
+public:
+    LayerNormDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&weightDesc);
+        miopenCreateTensorDescriptor(&biasDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&meanDesc);
+        miopenCreateTensorDescriptor(&rstdDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+    std::vector<int> GetInputTensorLengthsFromCmdLine();
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~LayerNormDriver() override
+    {
+
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(weightDesc);
+        miopenDestroyTensorDescriptor(biasDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(meanDesc);
+        miopenDestroyTensorDescriptor(rstdDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+    int dim_size;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t weightDesc;
+    miopenTensorDescriptor_t biasDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t meanDesc;
+    miopenTensorDescriptor_t rstdDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> weight_dev;
+    std::unique_ptr<GPUMem> bias_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> mean_dev;
+    std::unique_ptr<GPUMem> rstd_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> weight;
+    std::vector<Tgpu> bias;
+    std::vector<Tgpu> out;
+    std::vector<Tgpu> mean;
+    std::vector<Tgpu> rstd;
+    std::vector<Tref> outhost;
+    std::vector<Tref> meanhost;
+    std::vector<Tref> rstdhost;
+
+    float eps;
+    int dim;
+    miopenLayerNormMode_t mode;
+};
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::GetandSetData()
+{
+    std::vector<int> in_len = GetInputTensorLengthsFromCmdLine();
+
+    dim = static_cast<int>(inflags.GetValueDouble("normalized_dim"));
+
+    std::vector<int> inner_len;
+    if(dim == in_len.size())
+        inner_len = {1};
+    else
+        inner_len = {in_len.begin() + dim, in_len.end()};
+
+    std::vector<int> outer_len;
+    if(dim == 0)
+        outer_len = {1};
+    else
+        outer_len = {in_len.begin(), in_len.end() - (in_len.size() - dim)};
+
+    SetTensorNd(inputDesc, in_len, data_type);
+    SetTensorNd(weightDesc, inner_len, data_type);
+    SetTensorNd(biasDesc, inner_len, data_type);
+    SetTensorNd(outputDesc, in_len, data_type);
+    SetTensorNd(meanDesc, outer_len, data_type);
+    SetTensorNd(rstdDesc, outer_len, data_type);
+
+    eps  = static_cast<float>(inflags.GetValueDouble("eps"));
+    mode = miopenLayerNormMode_t(inflags.GetValueInt("mode"));
+
+    return (0);
+}
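[To make the normalized_dim bookkeeping above concrete (illustrative values, not part of the patch): the flag splits the input lengths at position dim, the prefix becoming the per-slice statistics shape and the suffix the normalized shape:

    std::vector<int> in_len{100, 3, 32, 32}; // NCHW input, e.g. -n 100 -c 3 -H 32 -W 32
    int dim = 2;                             // normalized_dim
    std::vector<int> outer_len(in_len.begin(), in_len.begin() + dim); // {100, 3}: mean/rstd shape
    std::vector<int> inner_len(in_len.begin() + dim, in_len.end());   // {32, 32}: weight/bias shape
]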
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag("forw", 'F', "1", "Run only Forward LayerNorm (Default=1)", "int");
+    inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int");
+    inflags.AddInputFlag("in_channels", 'c', "3", "Number of Input Channels (Default=3)", "int");
+    inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int");
+    inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int");
+    inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int");
+
+    inflags.AddInputFlag("eps", 'e', "0.00001", "Epsilon (Default=0.00001)", "double");
+    inflags.AddInputFlag("normalized_dim", 'o', "3", "Normalized Dim (Default=3)", "int");
+    inflags.AddInputFlag(
+        "mode", 'm', "0", "elementwise affine mode (0), weight and bias mode (1) (Default=0)", "int");
+
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int> LayerNormDriver<Tgpu, Tref>::GetInputTensorLengthsFromCmdLine()
+{
+    int in_n = inflags.GetValueInt("batchsize");
+    int in_c = inflags.GetValueInt("in_channels");
+    int in_w = inflags.GetValueInt("in_w");
+    int in_h = inflags.GetValueInt("in_h");
+    int in_d = inflags.GetValueInt("in_d");
+
+    if(in_h != 0)
+    {
+        if(in_d != 0)
+        {
+            dim_size = 5;
+            return std::vector<int>({in_n, in_c, in_d, in_h, in_w});
+        }
+        else
+        {
+            dim_size = 4;
+            return std::vector<int>({in_n, in_c, in_h, in_w});
+        }
+    }
+    else
+    {
+        dim_size = 3;
+        return std::vector<int>({in_n, in_c, in_w});
+    }
+}
+
+template <typename Tgpu, typename Tref>
+int LayerNormDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t in_sz     = GetTensorSize(inputDesc);
+    size_t weight_sz = GetTensorSize(weightDesc);
+    size_t bias_sz   = GetTensorSize(biasDesc);
+    size_t out_sz    = GetTensorSize(outputDesc);
+    size_t mean_sz   = GetTensorSize(meanDesc);
+    size_t rstd_sz   = GetTensorSize(rstdDesc);
+
+    // MIOPEN_BACKEND_HIP
+    uint32_t ctx = 0;
+
+    in_dev     = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
+    weight_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, weight_sz, sizeof(Tgpu)));
+    bias_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, bias_sz, sizeof(Tgpu)));
+    out_dev    = std::unique_ptr<GPUMem>(new GPUMem(ctx, out_sz, sizeof(Tgpu)));
+    mean_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, mean_sz, sizeof(Tgpu)));
+    rstd_dev   = std::unique_ptr<GPUMem>(new GPUMem(ctx, rstd_sz, sizeof(Tgpu)));
+
+    in       = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
+    weight   = std::vector<Tgpu>(weight_sz, static_cast<Tgpu>(0));
+    bias     = std::vector<Tgpu>(bias_sz, static_cast<Tgpu>(0));
+    out      = std::vector<Tgpu>(out_sz, static_cast<Tgpu>(0));
+    mean     = std::vector<Tgpu>(mean_sz, static_cast<Tgpu>(0));
+    rstd     = std::vector<Tgpu>(rstd_sz, static_cast<Tgpu>(0));
+    outhost  = std::vector<Tref>(out_sz, static_cast<Tref>(0));
+    meanhost = std::vector<Tref>(mean_sz, static_cast<Tref>(0));
+    rstdhost = std::vector<Tref>(rstd_sz, static_cast<Tref>(0));
+
+    // MIOPEN_BACKEND_HIP
+    int status;
+
+    for(int i = 0; i < in_sz; i++)
+    {
+        in[i] = RAN_GEN<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    }
+    status = in_dev->ToGPU(q, in.data());
+
+    for(int i = 0; i < weight_sz; i++)
+    {
+        weight[i] = RAN_GEN<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    }
+
status = weight_dev->ToGPU(q, weight.data()); + + for(int i = 0; i < bias_sz; i++) + { + bias[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + } + status = bias_dev->ToGPU(q, bias.data()); + + status |= out_dev->ToGPU(q, out.data()); + status |= mean_dev->ToGPU(q, mean.data()); + status |= rstd_dev->ToGPU(q, rstd.data()); + + if(status != CL_SUCCESS) + printf("Error copying data to GPU\n"); + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenLayerNormForward(GetHandle(), + mode, + inputDesc, + in_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + biasDesc, + bias_dev->GetMem(), + eps, + dim, + outputDesc, + out_dev->GetMem(), + meanDesc, + mean_dev->GetMem(), + rstdDesc, + rstd_dev->GetMem()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward LayerNorm Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward LayerNorm Elapsed: %f ms\n", kernel_average_time); + } + + out_dev->FromGPU(GetStream(), out.data()); + mean_dev->FromGPU(GetStream(), mean.data()); + rstd_dev->FromGPU(GetStream(), rstd.data()); + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::RunForwardCPU() +{ + mloLayerNormForwardRunHost(inputDesc, + in.data(), + weight.data(), + bias.data(), + outhost.data(), + meanhost.data(), + rstdhost.data(), + eps, + dim, + mode); + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::RunBackwardGPU() +{ + return miopenStatusSuccess; +} + +template +Tref LayerNormDriver::GetTolerance() +{ + if(data_type == miopenHalf) + { + return 1e-3; + } + else if(data_type == miopenFloat) + { + return 5e-5; + } + else if(data_type == miopenDouble) + { + return 1e-10; + } + else if(data_type == miopenBFloat16) + { + return 5e-3; + } + return 0; +} + +template +int LayerNormDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(outhost, out); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward LayerNorm FAILED: " << error << std::endl; + } + else + { + printf("Forward LayerNorm Verifies on CPU and GPU (err=%f)\n", error); + } + + auto meanerror = miopen::rms_range(meanhost, mean); + if(!std::isfinite(meanerror) || meanerror > tolerance) + { + std::cout << "Forward LayerNorm mean FAILED: " << meanerror << std::endl; + } + else + { + printf("Forward LayerNorm mean Verifies on CPU and GPU (err=%f)\n", meanerror); + } + + auto rstderror = miopen::rms_range(rstdhost, rstd); + if(!std::isfinite(rstderror) || rstderror > tolerance) + { + std::cout << "Forward LayerNorm rstd FAILED: " << rstderror << std::endl; + } + else + { + printf("Forward LayerNorm rstd Verifies on CPU and GPU (err=%f)\n", rstderror); + } + + return miopenStatusSuccess; +} + +template +int LayerNormDriver::VerifyBackward() +{ + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_SOFTMAX_DRIVER_HPP +#endif diff --git a/driver/main.cpp b/driver/main.cpp index abdefc34a3..79e52e5e38 100644 --- a/driver/main.cpp +++ b/driver/main.cpp @@ -43,6 +43,9 
@@ #include "reduce_driver.hpp" #include #include +#ifdef MIOPEN_BETA_API +#include "layernorm_driver.hpp" +#endif int main(int argc, char* argv[]) { @@ -196,6 +199,20 @@ int main(int argc, char* argv[]) { drv = new ReduceDriver(); } +#ifdef MIOPEN_BETA_API + else if(base_arg == "layernorm") + { + drv = new LayerNormDriver(); + } + else if(base_arg == "layernormfp16") + { + drv = new LayerNormDriver(); + } + else if(base_arg == "layernormbfp16") + { + drv = new LayerNormDriver(); + } +#endif else { printf("Incorrect BaseArg\n"); diff --git a/driver/mloLayerNormHost.hpp b/driver/mloLayerNormHost.hpp new file mode 100644 index 0000000000..5c504f8068 --- /dev/null +++ b/driver/mloLayerNormHost.hpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifdef MIOPEN_BETA_API +#ifndef MLO_LAYERNORMHOST_H_ +#define MLO_LAYERNORMHOST_H_ + +//////////////////////////////////////////////////////////// +// +/////////////////////////////////////////////////////////// + +template +int32_t mloLayerNormForwardRunHost(miopenTensorDescriptor_t inputDesc, + Tgpu* input, + Tgpu* weight, + Tgpu* bias, + Tcheck* outputhost, + Tcheck* meanhost, + Tcheck* rstdhost, + float eps, + int32_t normalized_dim, + miopenLayerNormMode_t mode) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + size_t i = 0; + for(; i < normalized_dim; i++) + { + outer_size *= dims[i]; + } + + for(; i < dims.size(); i++) + { + inner_size *= dims[i]; + } + + int32_t ret = 0; + + for(int32_t o = 0; o < outer_size; o++) + { + Tcheck pmean = 0.0f; + Tcheck pvar = 0.0f; + for(i = 0; i < inner_size; i++) + { + Tcheck tmp = static_cast(input[o * inner_size + i]); + pmean += tmp; + pvar += tmp * tmp; + } + + pmean = pmean / inner_size; + pvar = pvar / inner_size - pmean * pmean; + Tcheck prstd = 1.0f / sqrt(pvar + eps); + + meanhost[o] = pmean; + rstdhost[o] = prstd; + + for(i = 0; i < inner_size; i++) + { + Tcheck pweight = mode ? 1 : static_cast(weight[i]); + Tcheck pbias = mode ? 
0 : static_cast(bias[i]); + outputhost[o * inner_size + i] = + (static_cast(input[o * inner_size + i]) - pmean) * prstd * pweight + pbias; + } + } + return ret; +} +#endif +#endif diff --git a/driver/tensor_driver.hpp b/driver/tensor_driver.hpp index 077cac4003..cb3139bf48 100644 --- a/driver/tensor_driver.hpp +++ b/driver/tensor_driver.hpp @@ -74,16 +74,8 @@ inline std::size_t GetTensorVectorLength(const miopenTensorDescriptor_t& tensor) int size = 0; miopenGetTensorDescriptorSize(tensor, &size); - if(size == 4 || size == 5) - { - miopenGetNdTensorDescriptorVectorLength(tensor, &vectorLength); - return vectorLength; - } - else - { - MIOPEN_THROW("We only support 4D layout in vector format"); - } - return 0; + miopenGetNdTensorDescriptorVectorLength(tensor, &vectorLength); + return vectorLength; } inline std::vector GetTensorLengths(const miopenTensorDescriptor_t& tensor) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index fcbc60a0b2..f0c0ce1aa6 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -53,6 +53,7 @@ * @defgroup convolutions * @defgroup pooling * @defgroup handle + * @defgroup layernorm * @defgroup LRN * @defgroup batchnorm * @defgroup activation @@ -455,7 +456,18 @@ typedef enum miopenLRNWithinChannel = 0, /*!< Channel independent */ miopenLRNCrossChannel = 1, /*!< Cross Channel */ } miopenLRNMode_t; - +#ifdef MIOPEN_BETA_API +/*! @ingroup layernorm + * @enum miopenLayerNormAlgorithm_t + * LayerNorm implementation algorithms + */ +typedef enum +{ + MIOPEN_ELEMENTWISE_AFFINE = 0, /*!< initialized to ones for weights and zeros for biases */ + MIOPEN_WEIGHT_BIAS = + 1, /*!< learnable weights and biases of the module of shape normalized_shape */ +} miopenLayerNormMode_t; +#endif /*! @ingroup batchnorm * @enum miopenBatchNormMode_t * Batch Normalization layer mode @@ -2453,6 +2465,55 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyLRNDescriptor(miopenLRNDescriptor_t lr /** @} */ // CLOSEOUT LRN DOXYGEN GROUP +#ifdef MIOPEN_BETA_API +// LayerNorm APIs +/** @addtogroup layernorm + * + * @{ + */ +/*! @brief Execute a layernorm forward layer + * + * This API only implements the LAYERNORM_MODE_CHANNEL in LAYERNORM_ACCURATE path. 
@@ -2453,6 +2465,55 @@ MIOPEN_EXPORT miopenStatus_t miopenDestroyLRNDescriptor(miopenLRNDescriptor_t lr
 /** @} */
 // CLOSEOUT LRN DOXYGEN GROUP
 
+#ifdef MIOPEN_BETA_API
+// LayerNorm APIs
+/** @addtogroup layernorm
+ *
+ *  @{
+ */
+/*! @brief Execute a layernorm forward layer
+ *
+ * This API only implements the LAYERNORM_MODE_CHANNEL in LAYERNORM_ACCURATE path.
+ *
+ * @param handle         MIOpen handle (input)
+ * @param mode           LayerNorm mode (input)
+ * @param xDesc          Tensor descriptor for data input tensor x (input)
+ * @param x              Data tensor x (input)
+ * @param weightDesc     Tensor descriptor for data input tensor weight (input)
+ * @param weight         Data tensor weight (input)
+ * @param biasDesc       Tensor descriptor for data input tensor bias (input)
+ * @param bias           Data tensor bias (input)
+ * @param epsilon        Value to stabilize inverse variance calculation (input)
+ * @param normalized_dim Normalized dimensions in the input array (input)
+ * @param yDesc          Tensor descriptor for output data tensor y (input)
+ * @param y              Data tensor y (output)
+ * @param meanDesc       Tensor descriptor for output data tensor mean (input)
+ * @param mean           Data tensor mean (output)
+ * @param rstdDesc       Tensor descriptor for output data tensor rstd (input)
+ * @param rstd           Data tensor rstd (output)
+ * @return               miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenLayerNormForward(miopenHandle_t handle,
+                                                    miopenLayerNormMode_t mode,
+                                                    const miopenTensorDescriptor_t xDesc,
+                                                    const void* x,
+                                                    const miopenTensorDescriptor_t weightDesc,
+                                                    const void* weight,
+                                                    const miopenTensorDescriptor_t biasDesc,
+                                                    const void* bias,
+                                                    const float epsilon,
+                                                    const int32_t normalized_dim,
+                                                    const miopenTensorDescriptor_t yDesc,
+                                                    void* y,
+                                                    const miopenTensorDescriptor_t meanDesc,
+                                                    void* mean,
+                                                    const miopenTensorDescriptor_t rstdDesc,
+                                                    void* rstd);
+
+/** @} */
+// CLOSEOUT LAYERNORM DOXYGEN GROUP
+#endif
+
 // Batch-Normalization APIs
 /** @addtogroup batchnorm
  *
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 58ff101c33..ef7ba9558a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -127,6 +127,7 @@ set( MIOpen_Source
     load_file.cpp
     lock_file.cpp
     logger.cpp
+    layernorm_api.cpp
     lrn_api.cpp
     op_args.cpp
     operator.cpp
@@ -413,6 +414,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         kernels/MIOpenConvDirUni.cl
         kernels/MIOpenConvDirBatchNormActiv.cl
         kernels/MIOpenConvDirGenFwd.cl
+        kernels/MIOpenLayerNorm.cpp
         kernels/MIOpenLRNBwd.cl
         kernels/MIOpenLRNFwd.cl
         kernels/MIOpenNeuron.cl
@@ -552,6 +554,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         hip/hip_build_utils.cpp
         hip/batched_transpose_sol.cpp
         hip/general_tensor_reorder_sol.cpp
+        layer_norm.cpp
         pooling.cpp
         ocl/fusionopconvocl.cpp
         ocl/fusionopbiasbnactivocl.cpp
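
With the declaration and build wiring above in place, a caller can drive the new entry
point as follows. This is a hedged usage sketch: the wrapper name, epsilon, and
normalized_dim values are illustrative, descriptor creation and device allocation are
left to the caller, and the build must define MIOPEN_BETA_API.

    #include <miopen/miopen.h>

    // Hedged usage sketch for miopenLayerNormForward. All descriptors and
    // device buffers are assumed to be created/filled by the caller.
    miopenStatus_t RunLayerNormFwd(miopenHandle_t handle,
                                   miopenTensorDescriptor_t xDesc, const void* x,
                                   miopenTensorDescriptor_t wDesc, const void* w,
                                   miopenTensorDescriptor_t bDesc, const void* b,
                                   miopenTensorDescriptor_t yDesc, void* y,
                                   miopenTensorDescriptor_t meanDesc, void* mean,
                                   miopenTensorDescriptor_t rstdDesc, void* rstd)
    {
        return miopenLayerNormForward(handle,
                                      MIOPEN_ELEMENTWISE_AFFINE,
                                      xDesc, x,
                                      wDesc, w,
                                      bDesc, b,
                                      /*epsilon=*/1e-5f,
                                      /*normalized_dim=*/1,
                                      yDesc, y,
                                      meanDesc, mean,
                                      rstdDesc, rstd);
    }
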
diff --git a/src/include/miopen/layernorm.hpp b/src/include/miopen/layernorm.hpp
new file mode 100644
index 0000000000..8ec2d96055
--- /dev/null
+++ b/src/include/miopen/layernorm.hpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include
+#ifdef MIOPEN_BETA_API
+#ifndef MIOPEN_LAYERNORM_HPP_
+#define MIOPEN_LAYERNORM_HPP_
+
+#include
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+miopenStatus_t LayerNormForward(const Handle& handle,
+                                const TensorDescriptor& xDesc,
+                                ConstData_t x,
+                                const TensorDescriptor& weightDesc,
+                                ConstData_t weight,
+                                const TensorDescriptor& biasDesc,
+                                ConstData_t bias,
+                                const TensorDescriptor& yDesc,
+                                Data_t y,
+                                const TensorDescriptor& meanDesc,
+                                Data_t mean,
+                                const TensorDescriptor& rstdDesc,
+                                Data_t rstd,
+                                miopenLayerNormMode_t mode,
+                                const float epsilon,
+                                const int32_t normalized_dim);
+
+} // namespace miopen
+#endif // MIOPEN_LAYERNORM_HPP_
+#endif
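
Both the host reference and the kernel partition the tensor lengths at normalized_dim
into an outer (batch-like) extent and an inner (normalized) extent. A small standalone
helper showing the same split (the helper itself is not part of the patch):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Split the tensor lengths at normalized_dim: everything before it is
    // folded into outer_size, everything from it onward into inner_size,
    // matching the loops in mloLayerNormForwardRunHost and LayerNormForward.
    inline std::pair<std::size_t, std::size_t>
    SplitAtNormalizedDim(const std::vector<std::size_t>& dims, std::size_t normalized_dim)
    {
        std::size_t outer_size = 1;
        std::size_t inner_size = 1;
        for(std::size_t i = 0; i < dims.size(); ++i)
            (i < normalized_dim ? outer_size : inner_size) *= dims[i];
        return {outer_size, inner_size};
    }
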
diff --git a/src/kernels/MIOpenLayerNorm.cpp b/src/kernels/MIOpenLayerNorm.cpp
new file mode 100644
index 0000000000..58891d6538
--- /dev/null
+++ b/src/kernels/MIOpenLayerNorm.cpp
@@ -0,0 +1,119 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifdef MIOPEN_BETA_API
+
+#include "float_types.h"
+
+//#if MIOPEN_USE_BFP16 == 1
+//#undef FLOAT
+//#define FLOAT hip_bfloat16
+//#endif
+
+extern "C" __global__ void LayernormFwdContiguous(const FLOAT* __restrict__ x,
+                                                  FLOAT* __restrict__ y,
+                                                  const FLOAT* __restrict__ weight,
+                                                  const FLOAT* __restrict__ bias,
+                                                  FLOAT* __restrict__ mean,
+                                                  FLOAT* __restrict__ rstd,
+                                                  float eps,
+                                                  uint64_t inner_size,
+                                                  bool mode)
+{
+    /*
+     * Each group works on a single channel.
+     * Example)
+     *  x dim = {N, C, L}, normalized shape = {C, L}
+     *  outer_size = N, inner_size = C * L
+     *
+     * Example2)
+     *  x dim = {N, C, L}, normalized shape = {L}
+     *  outer_size = N * C, inner_size = L
+     *
+     * => gws = {outer_size * LOCAL_SIZE}, lws = {LOCAL_SIZE}
+     */
+
+    /*
+     * Reduction to calculate mean and rstd
+     */
+
+    const uint64_t gid = blockIdx.x;
+    const uint64_t lid = threadIdx.x;
+
+    FLOAT_ACCUM pmean = CVT_FLOAT2ACCUM(0);
+    FLOAT_ACCUM pvar  = CVT_FLOAT2ACCUM(0);
+    __shared__ FLOAT_ACCUM ltmp1[LOCAL_SIZE];
+    __shared__ FLOAT_ACCUM ltmp2[LOCAL_SIZE];
+
+    // reduce sum for mean and var
+    for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE)
+    {
+        uint64_t x_idx = gid * inner_size + i;
+
+        FLOAT_ACCUM tmp = CVT_FLOAT2ACCUM(x[x_idx]);
+        pmean += tmp;
+        pvar += tmp * tmp;
+    }
+
+    ltmp1[lid] = pmean;
+    ltmp2[lid] = pvar;
+    __syncthreads();
+    for(uint64_t i = LOCAL_SIZE >> 1; i > 0; i >>= 1)
+    {
+        if(lid < i)
+        {
+            ltmp1[lid] += ltmp1[lid + i];
+            ltmp2[lid] += ltmp2[lid + i];
+        }
+        __syncthreads();
+    }
+    pmean             = ltmp1[0] / inner_size;
+    pvar              = ltmp2[0] / inner_size - pmean * pmean;
+    FLOAT_ACCUM prstd = rsqrt(pvar + FLOAT_ACCUM(eps));
+
+    if(lid == 0)
+    {
+        if(mean)
+            mean[gid] = CVT_ACCUM2FLOAT(pmean);
+        if(rstd)
+            rstd[gid] = CVT_ACCUM2FLOAT(prstd);
+    }
+
+    // forward calculation
+    for(uint64_t i = lid; i < inner_size; i += LOCAL_SIZE)
+    {
+        uint64_t idx = gid * inner_size + i;
+
+        FLOAT_ACCUM pweight;
+        FLOAT_ACCUM pbias;
+
+        pweight = mode ? CVT_FLOAT2ACCUM(1) : CVT_FLOAT2ACCUM(weight[i]);
+        pbias   = mode ? CVT_FLOAT2ACCUM(0) : CVT_FLOAT2ACCUM(bias[i]);
+
+        FLOAT_ACCUM val = (CVT_FLOAT2ACCUM(x[idx]) - pmean) * prstd * pweight + pbias;
+        y[idx]          = CVT_ACCUM2FLOAT(val);
+    }
+}
+#endif
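
The kernel above obtains the variance in a single pass via Var[x] = E[x^2] - E[x]^2 and
then combines per-thread partial sums with a power-of-two tree reduction in shared
memory. The same arithmetic on the host, reduced to one outer slice (a sketch for
reference, not part of the patch):

    #include <cmath>
    #include <cstddef>

    // Single-pass mean/rstd over one outer slice, mirroring the kernel's math.
    void mean_rstd(const float* x, std::size_t inner_size, float eps,
                   float& mean, float& rstd)
    {
        float sum = 0.f, sumsq = 0.f;
        for(std::size_t i = 0; i < inner_size; ++i)
        {
            sum += x[i];
            sumsq += x[i] * x[i];
        }
        mean            = sum / inner_size;
        const float var = sumsq / inner_size - mean * mean; // E[x^2] - E[x]^2
        rstd            = 1.f / std::sqrt(var + eps);
    }

The single-pass form trades a second read of x for the E[x^2] - E[x]^2 identity, which
can lose precision when the mean is large relative to the spread; accumulating in
FLOAT_ACCUM, as the kernel does, limits that loss.
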
diff --git a/src/layer_norm.cpp b/src/layer_norm.cpp
new file mode 100644
index 0000000000..3d52bc771f
--- /dev/null
+++ b/src/layer_norm.cpp
@@ -0,0 +1,135 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include
+#ifdef MIOPEN_BETA_API
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+miopenStatus_t LayerNormForward(const Handle& handle,
+                                const TensorDescriptor& xDesc,
+                                ConstData_t x,
+                                const TensorDescriptor& weightDesc,
+                                ConstData_t weight,
+                                const TensorDescriptor& biasDesc,
+                                ConstData_t bias,
+                                const TensorDescriptor& yDesc,
+                                Data_t y,
+                                const TensorDescriptor& meanDesc,
+                                Data_t mean,
+                                const TensorDescriptor& rstdDesc,
+                                Data_t rstd,
+                                miopenLayerNormMode_t mode,
+                                const float epsilon,
+                                const int32_t normalized_dim)
+{
+    if(x == nullptr || y == nullptr)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "Null pointer for tensor.");
+    }
+
+    if(xDesc.GetType() != yDesc.GetType())
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "Tensor types do not match.");
+    }
+
+    if(xDesc.GetLengths() != yDesc.GetLengths())
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match.");
+    }
+
+    bool is_all_packed = xDesc.IsPacked() && weightDesc.IsPacked() && biasDesc.IsPacked() &&
+                         yDesc.IsPacked() && meanDesc.IsPacked() && rstdDesc.IsPacked();
+
+    if(!is_all_packed)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "All tensors must be packed.");
+    }
+
+    auto dims         = xDesc.GetLengths();
+    size_t grid_size  = 1;
+    size_t outer_size = 1;
+    size_t inner_size = 1;
+    size_t i          = 0;
+    for(; i < normalized_dim; i++)
+    {
+        outer_size *= dims[i];
+        grid_size *= dims[i];
+    }
+
+    for(; i < dims.size(); i++)
+    {
+        inner_size *= dims[i];
+        grid_size *= dims[i];
+    }
+
+    auto dtype = xDesc.GetType();
+
+    const std::vector<size_t> vld{LOCAL_SIZE, 1, 1};
+    const std::vector<size_t> vgd{outer_size * vld[0], 1, 1};
+
+    std::string algo_name = "LayerNormForward";
+    std::string network_config =
+        "lnfwd-dtype" + std::to_string(static_cast<int32_t>(dtype)) + "g" + std::to_string(vgd[0]) +
+        "l" + std::to_string(vld[0]) + "normalized_dim" + std::to_string(normalized_dim) + "grid" +
+        std::to_string(grid_size) + "outer_size" + std::to_string(outer_size) + "inner_size" +
+        std::to_string(inner_size) + "mode" + std::to_string(static_cast<int32_t>(mode)) + "eps" +
+        std::to_string(static_cast<float>(epsilon));
+
+    std::string program_name = "MIOpenLayerNorm.cpp";
+    std::string kernel_name  = "LayernormFwdContiguous";
+
+    // compile parameters
+    std::string parms =
+        " -DMIOPEN_USE_FP16=" + std::to_string(static_cast<int>(dtype == miopenHalf)) +
+        " -DMIOPEN_USE_FP32=" + std::to_string(static_cast<int>(dtype == miopenFloat)) +
+        " -DMIOPEN_USE_FP64=" + std::to_string(static_cast<int>(dtype == miopenDouble)) +
+        " -DMIOPEN_USE_BFP16=" + std::to_string(static_cast<int>(dtype == miopenBFloat16));
+
+    parms += " -DMIOPEN_BETA_API=1";
+    parms += " -DLOCAL_SIZE=" + std::to_string(LOCAL_SIZE);
+
+    auto&& kernels = handle.GetKernels(algo_name, network_config);
+    if(!kernels.empty())
+    {
+        kernels.front()(x, y, weight, bias, mean, rstd, epsilon, inner_size, mode);
+    }
+    else
+    {
+        handle.AddKernel(algo_name, network_config, program_name, kernel_name, vld, vgd, parms)(
+            x, y, weight, bias, mean, rstd, epsilon, inner_size, mode);
+    }
+
+    return miopenStatusSuccess;
+}
+
+} // namespace miopen
+#endif
diff --git a/src/layernorm_api.cpp b/src/layernorm_api.cpp
new file mode 100644
index 0000000000..1c8f8d0cca
--- /dev/null
+++ b/src/layernorm_api.cpp
@@ -0,0 +1,137 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#ifdef MIOPEN_BETA_API +#include +#include +#include +#include + +static void +LogCmdLayerNorm(const miopenTensorDescriptor_t xDesc, const miopenLayerNormMode_t mode, bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "layernormfp16"; + } + else if(dtype == miopenFloat) + { + ss << "layernormfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "layernormbf16"; + } + else if(dtype == miopenDouble) + { + ss << "layernormfp64"; + } + + int32_t size = {0}; + miopenGetTensorDescriptorSize(xDesc, &size); + ss << " -n " << miopen::deref(xDesc).GetLengths()[0] << " -c " + << miopen::deref(xDesc).GetLengths()[1]; + if(size == 5) + { + ss << " -D " << miopen::deref(xDesc).GetLengths()[2] << " -H " + << miopen::deref(xDesc).GetLengths()[3] << " -W " + << miopen::deref(xDesc).GetLengths()[4]; + } + else if(size == 4) + { + ss << " -H " << miopen::deref(xDesc).GetLengths()[2] << " -W " + << miopen::deref(xDesc).GetLengths()[3]; + } + else if(size == 3) + { + ss << " -W " << miopen::deref(xDesc).GetLengths()[2]; + } + + ss << " -F " << ((is_fwd) ? 
"1" : "2") << " -m " << mode; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenLayerNormForward(miopenHandle_t handle, + miopenLayerNormMode_t mode, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t biasDesc, + const void* bias, + const float epsilon, + const int32_t normalized_dim, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t meanDesc, + void* mean, + const miopenTensorDescriptor_t rstdDesc, + void* rstd) +{ + MIOPEN_LOG_FUNCTION(handle, + mode, + xDesc, + x, + weightDesc, + weight, + biasDesc, + bias, + epsilon, + normalized_dim, + yDesc, + y, + meanDesc, + mean, + rstdDesc, + rstd); + + LogCmdLayerNorm(xDesc, mode, true); + return miopen::try_([&] { + miopen::LayerNormForward(miopen::deref(handle), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(biasDesc), + DataCast(bias), + miopen::deref(yDesc), + DataCast(y), + miopen::deref(meanDesc), + DataCast(mean), + miopen::deref(rstdDesc), + DataCast(rstd), + mode, + epsilon, + normalized_dim); + }); +} +#endif diff --git a/test/cpu_layernorm.hpp b/test/cpu_layernorm.hpp new file mode 100644 index 0000000000..08cf44368e --- /dev/null +++ b/test/cpu_layernorm.hpp @@ -0,0 +1,83 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifdef MIOPEN_BETA_API +#ifndef GUARD_CPU_LAYERNORM_HPP +#define GUARD_CPU_LAYERNORM_HPP + +#include "tensor_holder.hpp" + +template +void cpu_layernorm_forward(tensor input, + tensor weight, + tensor bias, + tensor& ref_output, + tensor& ref_mean, + tensor& ref_rstd, + float eps, + int32_t dim, + miopenLayerNormMode_t mode) +{ + auto dims = input.desc.GetLengths(); + size_t outer_size = 1; + size_t inner_size = 1; + size_t i = 0; + for(; i < dim; i++) + { + outer_size *= dims[i]; + } + + for(; i < dims.size(); i++) + { + inner_size *= dims[i]; + } + + par_ford(outer_size)([&](int32_t o) { + T mean_v = 0.0f; + T var_v = 0.0f; + + ford(inner_size)([&](int32_t i) { + T tmp = input[o * inner_size + i]; + mean_v += tmp; + var_v += tmp * tmp; + }); + + mean_v = mean_v / inner_size; + var_v = var_v / inner_size - mean_v * mean_v; + T rstd_v = 1.0f / sqrt(var_v + eps); + + ref_mean[o] = mean_v; + ref_rstd[o] = rstd_v; + + ford(inner_size)([&](int32_t i) { + T weight_v = mode ? 1 : weight[i]; + T bias_v = mode ? 0 : bias[i]; + ref_output[o * inner_size + i] = + (input[o * inner_size + i] - mean_v) * rstd_v * weight_v + bias_v; + }); + }); +} +#endif +#endif diff --git a/test/gtest/layernorm_test.cpp b/test/gtest/layernorm_test.cpp new file mode 100644 index 0000000000..d60bfe963c --- /dev/null +++ b/test/gtest/layernorm_test.cpp @@ -0,0 +1,38 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "layernorm_test.hpp" +#ifdef MIOPEN_BETA_API + +struct LayerNormSolverTestFloat : LayerNormSolverTest +{ +}; + +TEST_P(LayerNormSolverTestFloat, LayerNormTestFw){}; + +INSTANTIATE_TEST_SUITE_P(LayerNormTestSet, + LayerNormSolverTestFloat, + testing::ValuesIn(LayerNormTestConfigs())); +#endif diff --git a/test/gtest/layernorm_test.hpp b/test/gtest/layernorm_test.hpp new file mode 100644 index 0000000000..740108a887 --- /dev/null +++ b/test/gtest/layernorm_test.hpp @@ -0,0 +1,247 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#ifdef MIOPEN_BETA_API +#include +#include + +#include "tensor_holder.hpp" +#include "cpu_layernorm.hpp" +#include "get_handle.hpp" +#include "../driver/tensor_driver.hpp" +#include "verify.hpp" +#include + +struct LayerNormTestCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + size_t nomalized_dim; + float eps; + miopenLayerNormMode_t ln_mode; + friend std::ostream& operator<<(std::ostream& os, const LayerNormTestCase& tc) + { + return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H + << " W:" << tc.W << " dim:" << tc.nomalized_dim << " eps:" << tc.eps + << " LayerNorm_mode:" << tc.ln_mode; + } + + std::vector GetInput() { return {N, C, D, H, W}; } +}; + +std::vector LayerNormTestConfigs() +{ // n c h d w nomalized_dim eps ln_mode + // clang-format off + return { + { 32, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 32, 1, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 256, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 256, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 1, 32, 32, 32 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // 32x32x32 based on VoxNet arch + { 512, 32, 14, 14, 14 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 32, 12, 12, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 512, 32, 6, 6, 6 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 2, 32, 57, 125 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper High Res Net Path + { 32, 32, 14, 25, 59 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 10, 27 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 4, 6, 11 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 2, 2, 3 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 32, 28, 62 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, // Hand-gesture recognition CVPR 2015 paper Low Res Net Path + { 32, 32, 14, 12, 29 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 6, 4, 12 ,4 , 1e-5, MIOPEN_ELEMENTWISE_AFFINE}, + { 32, 32, 4, 2, 2 ,4 , 1e-5, 
MIOPEN_ELEMENTWISE_AFFINE},
+        { 16,  32,   6,  50,  50, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // Multi-view 3D convnet
+        {  1,   3,   8, 240, 320, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,  16, 240, 320, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,   8, 128, 171, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,  16, 128, 171, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,   8, 112, 112, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE},   // 3D convnet on video
+        {  1,   3,  16, 112, 112, 4, 1e-5, MIOPEN_ELEMENTWISE_AFFINE}    // 3D convnet on video
+    };
+    // clang-format on
+}
+
+inline int32_t SetTensorLayout(miopen::TensorDescriptor& desc)
+{
+    std::vector<std::size_t> lens = desc.GetLengths();
+    std::vector<int> int32_t_lens(lens.begin(), lens.end());
+
+    // set the strides for the tensor
+    return SetTensorNd(&desc, int32_t_lens, desc.GetType());
+}
+
+template <class T>
+struct LayerNormSolverTest : public ::testing::TestWithParam<LayerNormTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle    = get_handle();
+        test_skipped     = false;
+        layernorm_config = GetParam();
+        std::mt19937 gen(0);
+        std::uniform_real_distribution<> d{-3, 3};
+        auto gen_value = [&](auto...) { return d(gen); };
+
+        nomalized_dim = layernorm_config.nomalized_dim;
+        eps           = layernorm_config.eps;
+        ln_mode       = layernorm_config.ln_mode;
+
+        auto in_dim = layernorm_config.GetInput();
+
+        input = tensor<T>{in_dim}.generate(gen_value);
+
+        if(ln_mode == MIOPEN_ELEMENTWISE_AFFINE)
+        {
+            std::vector<size_t> inner_dim;
+            if(nomalized_dim == in_dim.size())
+                inner_dim = {1};
+            else
+                inner_dim = {in_dim.begin() + nomalized_dim, in_dim.end()};
+            weight = tensor<T>{inner_dim}.generate(gen_value);
+            bias   = tensor<T>{inner_dim}.generate(gen_value);
+            SetTensorLayout(weight.desc);
+            SetTensorLayout(bias.desc);
+        }
+
+        std::vector<size_t> outer_dim;
+        if(nomalized_dim == 0)
+            outer_dim = {1};
+        else
+            outer_dim = {in_dim.begin(), in_dim.end() - (in_dim.size() - nomalized_dim)};
+
+        SetTensorLayout(input.desc);
+
+        output = tensor<T>{in_dim};
+        mean   = tensor<T>{outer_dim};
+        rstd   = tensor<T>{outer_dim};
+        SetTensorLayout(output.desc);
+        SetTensorLayout(mean.desc);
+        SetTensorLayout(rstd.desc);
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(mean.begin(), mean.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(rstd.begin(), rstd.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{in_dim};
+        ref_mean   = tensor<T>{outer_dim};
+        ref_rstd   = tensor<T>{outer_dim};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(ref_mean.begin(), ref_mean.end(), std::numeric_limits<T>::quiet_NaN());
+        std::fill(ref_rstd.begin(), ref_rstd.end(), std::numeric_limits<T>::quiet_NaN());
+
+        input_dev  = handle.Write(input.data);
+        weight_dev = handle.Write(weight.data);
+        bias_dev   = handle.Write(bias.data);
+        output_dev = handle.Write(output.data);
+        mean_dev   = handle.Write(mean.data);
+        rstd_dev   = handle.Write(rstd.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        cpu_layernorm_forward(
+            input, weight, bias, ref_output, ref_mean, ref_rstd, eps, nomalized_dim, ln_mode);
+        miopenStatus_t status;
+
+        status = miopen::LayerNormForward(handle,
+                                          input.desc,
+                                          input_dev.get(),
+                                          weight.desc,
+                                          weight_dev.get(),
+                                          bias.desc,
+                                          bias_dev.get(),
+                                          output.desc,
+                                          output_dev.get(),
+                                          mean.desc,
+                                          mean_dev.get(),
+                                          rstd.desc,
+                                          rstd_dev.get(),
+                                          ln_mode,
+                                          eps,
+                                          nomalized_dim);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
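
The tolerances in the test above scale machine epsilon by empirically chosen factors
and compare ranges through miopen::rms_range. A simplified stand-in for that metric,
shown only to make the thresholds concrete (an assumption about the metric's intent,
not its exact implementation):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Simplified RMS relative error, analogous in spirit to miopen::rms_range.
    double rms_error(const std::vector<float>& ref, const std::vector<float>& out)
    {
        double num = 0.0, den = 0.0;
        for(std::size_t i = 0; i < ref.size(); ++i)
        {
            const double d = static_cast<double>(ref[i]) - out[i];
            num += d * d;
            den += static_cast<double>(ref[i]) * ref[i];
        }
        return std::sqrt(num / (den + 1e-38)); // guard against an all-zero reference
    }
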
+        output.data = handle.Read<T>(output_dev, output.data.size());
+        mean.data   = handle.Read<T>(mean_dev, mean.data.size());
+        rstd.data   = handle.Read<T>(rstd_dev, rstd.data.size());
+
+        double threshold = std::numeric_limits<T>::epsilon();
+        auto error       = miopen::rms_range(ref_output, output);
+
+        EXPECT_TRUE(miopen::range_distance(ref_output) == miopen::range_distance(output));
+        EXPECT_TRUE(error < threshold * 1000) << "Error output beyond tolerance Error:" << error
+                                              << ", Thresholdx1000: " << threshold * 1000;
+
+        error = miopen::rms_range(ref_mean, mean);
+        EXPECT_TRUE(miopen::range_distance(ref_mean) == miopen::range_distance(mean));
+        EXPECT_TRUE(error < threshold)
+            << "Error mean beyond tolerance Error:" << error << ", Threshold: " << threshold;
+
+        error = miopen::rms_range(ref_rstd, rstd);
+        EXPECT_TRUE(miopen::range_distance(ref_rstd) == miopen::range_distance(rstd));
+        EXPECT_TRUE(error < threshold * 2000) << "Error rstd beyond tolerance Error:" << error
+                                              << ", Thresholdx2000: " << threshold * 2000;
+    }
+    LayerNormTestCase layernorm_config;
+
+    tensor<T> input;
+    tensor<T> weight;
+    tensor<T> bias;
+    tensor<T> output;
+    tensor<T> mean;
+    tensor<T> rstd;
+
+    tensor<T> ref_output;
+    tensor<T> ref_mean;
+    tensor<T> ref_rstd;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr weight_dev;
+    miopen::Allocator::ManageDataPtr bias_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr mean_dev;
+    miopen::Allocator::ManageDataPtr rstd_dev;
+
+    size_t nomalized_dim;
+    float eps;
+    miopenLayerNormMode_t ln_mode;
+
+    bool test_skipped = false;
+};
+#endif

From 71f159cbcf300ed88b3c28566c3d0b76d54a1eda Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Tue, 19 Sep 2023 19:06:03 -0500
Subject: [PATCH 06/36] [SWDEV-414487] Enable 2d conv + bias + activ CK kernel
 for MI300 (#2399)

* SWDEV-414487 : enable 2d conv + bias + activ CK kernel for MI300

* SWDEV-414487: fix compilation error
---
 src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
index ed975285ee..9520a79a90 100644
--- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
+++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
@@ -421,7 +421,8 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx,
     if(!conv_problem.Is2d())
         return false;
     const std::string arch = ctx.GetStream().GetDeviceName();
-    if(arch != "gfx908" && arch != "gfx90a")
+    if(arch != "gfx908" && arch != "gfx90a" && arch != "gfx940" && arch != "gfx941" &&
+       arch != "gfx942")
         return false;
     if(!conv_problem.IsLayoutNHWC())
         return false;

From c09dac85714da92fbb2672c3db6e8faa7b70b878 Mon Sep 17 00:00:00 2001
From: Artur Wojcik
Date: Thu, 21 Sep 2023 01:23:17 +0200
Subject: [PATCH 07/36] [Windows] roctracer: disable on Windows (not supported)
 (#2404)

Co-authored-by: Artur Wojcik
---
 src/CMakeLists.txt            | 6 ++++--
 src/include/miopen/logger.hpp | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ef7ba9558a..4ffed2b4c8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -850,8 +850,10 @@ if(NOT WIN32 AND NOT APPLE)
         target_internal_library(MIOpen ${LIBRT})
     endif()
 endif()
-############################################################
-target_link_libraries(MIOpen PRIVATE "-lroctx64")
+
+if(NOT WIN32)
+    target_link_libraries(MIOpen PRIVATE roctx64)
+endif()
############################################################ # Installation diff --git a/src/include/miopen/logger.hpp b/src/include/miopen/logger.hpp index 659906ddb6..dc18eae5c0 100644 --- a/src/include/miopen/logger.hpp +++ b/src/include/miopen/logger.hpp @@ -38,7 +38,9 @@ #include #include +#ifndef _WIN32 #include +#endif // See https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms #define MIOPEN_PP_CAT(x, y) MIOPEN_PP_PRIMITIVE_CAT(x, y) @@ -410,6 +412,7 @@ class LogScopeTime #define MIOPEN_LOG_SCOPE_TIME #endif +#ifndef _WIN32 class LogScopeRoctx { public: @@ -434,6 +437,7 @@ class LogScopeRoctx private: bool m_active{false}; }; +#endif } // namespace miopen From 3825849cd5f63cdb6378c6e3c9cf559a5d2917a8 Mon Sep 17 00:00:00 2001 From: Chris Erb Date: Mon, 25 Sep 2023 09:52:06 -0500 Subject: [PATCH 08/36] [MI200] Refresh kdb using db_sync (#2411) --- src/kernels/gfx90a.kdb.bz2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/gfx90a.kdb.bz2 b/src/kernels/gfx90a.kdb.bz2 index d2c8722d2e..613df18f7a 100644 --- a/src/kernels/gfx90a.kdb.bz2 +++ b/src/kernels/gfx90a.kdb.bz2 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91eaa7412acf3e9a6f23cd70b386037434318a63e1d7be7212979a9ee50fe617 -size 592987974 +oid sha256:3d76d7c53648f4864a5cfe9267e8cb9171abab81de9d1732a9f94bafb0816b61 +size 250548882 From 115686c1f75c115308d0099d79bfa7cc87a965c4 Mon Sep 17 00:00:00 2001 From: Vasilii Filippov Date: Mon, 25 Sep 2023 20:14:06 +0200 Subject: [PATCH 09/36] Removal of convolution context (#2402) --- src/conv/heuristics/ai_heuristics.cpp | 8 +- src/conv/invokers/impl_gemm_dynamic.cpp | 4 +- src/conv/solver_finders.cpp | 22 +- src/convolution.cpp | 43 +- src/fusion.cpp | 2 +- src/include/miopen/any_solver.hpp | 81 +- .../miopen/conv/compiled_in_parameters.hpp | 3 +- src/include/miopen/conv/context.hpp | 24 +- .../miopen/conv/heuristics/ai_heuristics.hpp | 3 +- .../miopen/conv/invokers/impl_gemm.hpp | 2 +- .../conv/invokers/impl_gemm_dynamic.hpp | 5 +- .../miopen/conv/invokers/mlir_impl_gemm.hpp | 2 +- src/include/miopen/conv/solver_finders.hpp | 11 +- src/include/miopen/convolution.hpp | 14 +- src/include/miopen/execution_context.hpp | 5 + src/include/miopen/fusion/context.hpp | 13 +- src/include/miopen/fusion/solvers.hpp | 7 +- src/include/miopen/generic_search.hpp | 4 +- src/include/miopen/mlo_internal.hpp | 36 +- src/include/miopen/solver.hpp | 1147 +++++++---------- .../miopen/solver/conv_direct_naive_conv.hpp | 7 +- .../miopen/solver/implicitgemm_util.hpp | 12 +- src/include/miopen/solver/mlir_common.hpp | 7 +- src/mlo_dir_conv.cpp | 33 +- src/ocl/convolutionocl.cpp | 33 +- src/problem.cpp | 2 +- src/solution.cpp | 2 +- src/solver.cpp | 2 +- src/solver/conv_MP_bidirectional_winograd.cpp | 24 +- src/solver/conv_asm_1x1u.cpp | 19 +- src/solver/conv_asm_1x1u_stride2.cpp | 10 +- src/solver/conv_asm_3x3u.cpp | 11 +- src/solver/conv_asm_dir_BwdWrW1x1.cpp | 16 +- src/solver/conv_asm_dir_BwdWrW3x3.cpp | 14 +- .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp | 14 +- .../conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp | 12 +- .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp | 14 +- ...conv_asm_implicit_gemm_gtc_perf_config.cpp | 4 +- .../conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp | 14 +- .../conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp | 12 +- src/solver/conv_direct_naive_conv.cpp | 4 +- src/solver/conv_direct_naive_conv_bwd.cpp | 4 +- src/solver/conv_direct_naive_conv_fwd.cpp | 4 +- src/solver/conv_direct_naive_conv_wrw.cpp | 4 +- 
...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 10 +- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 10 +- ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 10 +- ...conv_hip_implicit_gemm_bwd_data_xdlops.cpp | 10 +- .../conv_hip_implicit_gemm_bwd_v1r1.cpp | 26 +- ...conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp | 18 +- .../conv_hip_implicit_gemm_bwd_v4r1.cpp | 12 +- ...conv_hip_implicit_gemm_bwd_v4r1_xdlops.cpp | 16 +- .../conv_hip_implicit_gemm_fwd_v4r1.cpp | 20 +- .../conv_hip_implicit_gemm_fwd_v4r4.cpp | 12 +- ...conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp | 16 +- ...licit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp | 16 +- ...conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp | 16 +- .../conv_hip_implicit_gemm_fwd_xdlops.cpp | 10 +- ...v_hip_implicit_gemm_grouped_fwd_xdlops.cpp | 10 +- ...onv_hip_implicit_gemm_nonxdlops_common.cpp | 6 +- .../conv_hip_implicit_gemm_wrw_v4r4.cpp | 12 +- ...conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp | 24 +- ...licit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp | 24 +- src/solver/conv_mlir_igemm_bwd.cpp | 10 +- src/solver/conv_mlir_igemm_bwd_xdlops.cpp | 10 +- src/solver/conv_mlir_igemm_fwd.cpp | 12 +- src/solver/conv_mlir_igemm_fwd_xdlops.cpp | 12 +- src/solver/conv_mlir_igemm_wrw.cpp | 10 +- src/solver/conv_mlir_igemm_wrw_xdlops.cpp | 12 +- src/solver/conv_ocl_dir2D11x11.cpp | 4 +- src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp | 6 +- src/solver/conv_ocl_dir2D_bwdWrW_2.cpp | 20 +- src/solver/conv_ocl_dir2D_bwdWrW_53.cpp | 6 +- src/solver/conv_ocl_dir2Dfwd.cpp | 8 +- src/solver/conv_ocl_dir2Dfwd1x1.cpp | 4 +- .../conv_ocl_dir2Dfwd_exhaustive_search.cpp | 8 +- src/solver/conv_ocl_dir2Dfwdgen.cpp | 4 +- src/solver/conv_winoRxS.cpp | 28 +- src/solver/conv_wino_fury_RxS.cpp | 6 +- src/solver/mlir_common.cpp | 2 +- test/conv_common.hpp | 6 +- test/embed_sqlite.cpp | 4 +- test/gpu_conv.hpp | 6 +- test/gtest/db_sync.cpp | 6 +- test/gtest/group_conv3d_bwd.cpp | 2 +- test/gtest/group_conv3d_fwd.cpp | 2 +- test/gtest/group_conv3d_wrw.cpp | 2 +- test/gtest/group_conv_fwd.cpp | 2 +- test/gtest/kernel_tuning_net.cpp | 2 +- test/gtest/solver_bwd.hpp | 6 +- test/gtest/solver_fwd.hpp | 4 +- test/gtest/solver_wrw.hpp | 4 +- test/gtest/tuna_net.cpp | 2 +- test/solver.cpp | 30 +- 94 files changed, 1007 insertions(+), 1225 deletions(-) diff --git a/src/conv/heuristics/ai_heuristics.cpp b/src/conv/heuristics/ai_heuristics.cpp index 50cd495de6..ffe4b0c591 100644 --- a/src/conv/heuristics/ai_heuristics.cpp +++ b/src/conv/heuristics/ai_heuristics.cpp @@ -119,9 +119,9 @@ class Model offset(metadata.num_outputs - metadata.num_solvers) { } - virtual ~Model() = default; + virtual ~Model() = default; virtual bool IsProblemSupported(const ProblemDescription& problem, - const ConvolutionContext& ctx) const = 0; + const ExecutionContext& ctx) const = 0; std::vector Forward(const ProblemDescription& problem) const { std::vector features = ToFeatures(problem); @@ -150,7 +150,7 @@ class Gfx908Model : public Model public: Gfx908Model() : Model("gfx908") {} bool IsProblemSupported(const ProblemDescription& problem, - const ConvolutionContext& ctx) const override + const ExecutionContext& ctx) const override { // check if problem is of the kind TunaNet was trained to handle if(!problem.Is2d()) @@ -258,7 +258,7 @@ class Gfx908Model : public Model std::unique_ptr GetModel(const std::string&) { return std::make_unique(); } std::vector PredictSolver(const ProblemDescription& problem, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const std::string& device) { const static std::unique_ptr model = GetModel(device); 
diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp index 01e931dd69..2416217ea2 100644 --- a/src/conv/invokers/impl_gemm_dynamic.cpp +++ b/src/conv/invokers/impl_gemm_dynamic.cpp @@ -438,7 +438,7 @@ MakeImplGemmDynamicBackwardDataInvokerFactory FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -79,14 +79,14 @@ class ImplicitGemmSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& /*problem*/, bool use_winograd_only) const override { return !use_winograd_only && !IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -107,7 +107,7 @@ class FftSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& problem, bool use_winograd_only) const override { @@ -115,7 +115,7 @@ class FftSolverFinder : public SolversFinder !IsDisabled(MIOPEN_DEBUG_CONV_FFT{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -134,14 +134,14 @@ class GemmSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& /*problem*/, bool use_winograd_only) const override { return !use_winograd_only && !IsDisabled(MIOPEN_DEBUG_CONV_GEMM{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool /*use_winograd_only*/) const override @@ -160,14 +160,14 @@ class WinogradSolverFinder : public SolversFinder } protected: - bool IsEnabled(const ConvolutionContext& /*ctx*/, + bool IsEnabled(const ExecutionContext& /*ctx*/, const conv::ProblemDescription& /*problem*/, bool /*use_winograd_only*/) const override { return !IsDisabled(MIOPEN_DEBUG_CONV_WINOGRAD{}); } - std::vector FindImpl(const ConvolutionContext& ctx, + std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool use_winograd_only) const override @@ -277,7 +277,7 @@ static void EvaluateInvokers(Handle& handle, void ConvFindCore(const AnyInvokeParams& invoke_ctx, DbRecord& record, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, bool use_winograd_only, const std::vector>& finders) diff --git a/src/convolution.cpp b/src/convolution.cpp index 5f7539f70d..403ff777cd 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -75,7 +75,7 @@ std::size_t GetMaxWorkSpaceSize(const std::vector= 1); + assert(!deref(fusePlanDesc).op_map.empty()); std::string str; if(deref(fusePlanDesc).data_type == miopenBFloat16) diff --git a/src/include/miopen/any_solver.hpp b/src/include/miopen/any_solver.hpp index 8de0e8a759..b2f177b6ea 100644 --- a/src/include/miopen/any_solver.hpp +++ 
b/src/include/miopen/any_solver.hpp @@ -46,7 +46,7 @@ struct AnySolver AnySolver() : ptr_value(nullptr){}; template AnySolver(U src) : ptr_value(new AnySolver_tmpl(std::forward(src))){}; - bool IsApplicable(const ConvolutionContext& ctx, const ProblemDescription& problem) const + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); return ptr_value->IsApplicable(ctx, problem); @@ -56,14 +56,14 @@ struct AnySolver assert(ptr_value != nullptr); return ptr_value->IsTunable(); }; - bool TestPerfCfgParams(const ConvolutionContext& ctx, + bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params) const { assert(ptr_value != nullptr); return ptr_value->TestPerfCfgParams(ctx, problem, params); }; - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); @@ -74,7 +74,7 @@ struct AnySolver assert(ptr_value != nullptr); return ptr_value->IsDynamic(); }; - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); return ptr_value->GetWti(ctx, problem); @@ -85,7 +85,7 @@ struct AnySolver return ptr_value->Type(); }; bool IsEmpty() const { return ptr_value == nullptr; }; - ConvSolution FindSolution(const ConvolutionContext& ctx, + ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, const miopen::AnyInvokeParams& invoke_ctx, @@ -94,7 +94,7 @@ struct AnySolver assert(ptr_value != nullptr); return ptr_value->FindSolution(ctx, problem, db, invoke_ctx, perf_cfg); }; - std::string GetPerfCfgParams(const ConvolutionContext& ctx, + std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db) const { @@ -107,7 +107,7 @@ struct AnySolver return ptr_value->GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, const ProblemDescription& problem) const + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const { assert(ptr_value != nullptr); return ptr_value->GetWorkspaceSize(ctx, problem); @@ -125,30 +125,30 @@ struct AnySolver using ptr = std::shared_ptr; virtual ~AnySolver_base(){}; - virtual bool IsApplicable(const ConvolutionContext& ctx, + virtual bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const = 0; virtual bool IsTunable() const = 0; - virtual bool TestPerfCfgParams(const ConvolutionContext& ctx, + virtual bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params) const = 0; virtual std::vector - GetAllSolutions(const ConvolutionContext& ctx, const ProblemDescription& problem) const = 0; - virtual bool IsDynamic() const = 0; - virtual float GetWti(const ConvolutionContext& ctx, - const ProblemDescription& problem) const = 0; - virtual const std::type_info& Type() const = 0; - virtual std::string GetSolverDbId() const = 0; - virtual ConvSolution FindSolution(const ConvolutionContext& ctx, + GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem) const = 0; + virtual bool IsDynamic() const = 0; + virtual float GetWti(const ExecutionContext& ctx, + const ProblemDescription& problem) const = 0; + virtual const std::type_info& Type() const = 
0; + virtual std::string GetSolverDbId() const = 0; + virtual ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, const miopen::AnyInvokeParams& invoke_ctx, - const std::string& perf_cfg) const = 0; - virtual std::string GetPerfCfgParams(const ConvolutionContext& ctx, + const std::string& perf_cfg) const = 0; + virtual std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, - PerformanceDb& db) const = 0; - virtual size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const = 0; - virtual bool MayNeedWorkspace() const = 0; + PerformanceDb& db) const = 0; + virtual size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const = 0; + virtual bool MayNeedWorkspace() const = 0; }; // templated derived class @@ -160,7 +160,7 @@ struct AnySolver template static constexpr auto Test(U*) -> typename std::is_class().GetDefaultPerformanceConfig( - std::declval(), + std::declval(), std::declval()))>::type; template @@ -176,7 +176,7 @@ struct AnySolver static constexpr auto Test(U*) -> typename std::is_same().GetDefaultPerformanceConfig( - std::declval(), + std::declval(), std::declval()))>::type; template @@ -186,13 +186,13 @@ struct AnySolver static constexpr bool Is = type::value; }; - bool TestPerfCfgParams(const ConvolutionContext& ctx, + bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params, std::true_type) const { using PerformanceConfig = decltype(value.GetDefaultPerformanceConfig( - std::declval(), + std::declval(), std::declval())); PerformanceConfig config{}; @@ -208,7 +208,7 @@ struct AnySolver return success; } - bool TestPerfCfgParams(const ConvolutionContext&, + bool TestPerfCfgParams(const ExecutionContext&, const ProblemDescription&, const std::string&, std::false_type) const @@ -216,7 +216,7 @@ struct AnySolver return false; } - bool TestPerfCfgParams(const ConvolutionContext& ctx, + bool TestPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& params) const override { @@ -225,7 +225,7 @@ struct AnySolver } // tunable legacy solver - std::vector GetAllSolutions(const ConvolutionContext&, + std::vector GetAllSolutions(const ExecutionContext&, const ProblemDescription&, std::true_type, std::true_type) const @@ -234,7 +234,7 @@ struct AnySolver } // tunable solver, not legacy - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, std::true_type, std::false_type) const @@ -243,7 +243,7 @@ struct AnySolver } // non tunable solver - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, std::false_type, std::true_type) const @@ -252,7 +252,7 @@ struct AnySolver solutions.push_back(value.GetSolution(ctx, problem)); return solutions; } - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, std::false_type, std::false_type) const @@ -262,7 +262,7 @@ struct AnySolver return solutions; } - std::vector GetAllSolutions(const ConvolutionContext& ctx, + std::vector GetAllSolutions(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return GetAllSolutions(ctx, @@ -273,20 +273,19 @@ struct AnySolver AnySolver_tmpl(T 
obj) : value(std::move(obj)){}; - bool IsApplicable(const ConvolutionContext& ctx, + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return value.IsApplicable(ctx, problem); } bool IsTunable() const override { return TunableSolver::Is; } bool IsDynamic() const override { return value.IsDynamic(); } - float GetWti(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return value.GetWti(ctx, problem); } - ConvSolution FindSolution(const ConvolutionContext& ctx, + ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, const miopen::AnyInvokeParams& invoke_ctx, @@ -295,7 +294,7 @@ struct AnySolver return miopen::solver::FindSolution(value, ctx, problem, db, invoke_ctx, perf_cfg); }; - std::string GetPerfCfgParams(const ConvolutionContext& ctx, + std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db, std::true_type) const @@ -327,7 +326,7 @@ struct AnySolver config = value.GetDefaultPerformanceConfig(ctx, problem); return config.ToString(); } - std::string GetPerfCfgParams(const ConvolutionContext&, + std::string GetPerfCfgParams(const ExecutionContext&, const ProblemDescription&, const PerformanceDb&, std::false_type) const @@ -336,7 +335,7 @@ struct AnySolver return ""; } - std::string GetPerfCfgParams(const ConvolutionContext& ctx, + std::string GetPerfCfgParams(const ExecutionContext& ctx, const ProblemDescription& problem, PerformanceDb& db) const override { @@ -344,7 +343,7 @@ struct AnySolver ctx, problem, db, std::integral_constant()); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { return value.GetWorkspaceSize(ctx, problem); diff --git a/src/include/miopen/conv/compiled_in_parameters.hpp b/src/include/miopen/conv/compiled_in_parameters.hpp index 28def48761..a72ccf7d47 100644 --- a/src/include/miopen/conv/compiled_in_parameters.hpp +++ b/src/include/miopen/conv/compiled_in_parameters.hpp @@ -26,7 +26,8 @@ #pragma once -#include +#include +#include #include #include diff --git a/src/include/miopen/conv/context.hpp b/src/include/miopen/conv/context.hpp index 63a1469f5d..8974b684a5 100644 --- a/src/include/miopen/conv/context.hpp +++ b/src/include/miopen/conv/context.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2019 Advanced Micro Devices, Inc. + * Copyright (c) 2023 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -24,25 +24,9 @@ * *******************************************************************************/ +// Todo: this is a temporary header for fin compatibility +// It would be removed in a separate PR after changes to fin would be merged + #pragma once #include -#include - -#include - -namespace miopen { -/// A leftover of the legacy design, houses -/// environmental context (e.g. HW/SW platform) and solver-specific state. -/// -/// TODO: These two entities should be made separate. 
-struct ConvolutionContext : ExecutionContext -{ - ConvolutionContext() = default; - explicit ConvolutionContext(const ExecutionContext& ctx) : ExecutionContext(ctx) {} - -public: - bool is_for_generic_search = false; -}; - -} // namespace miopen diff --git a/src/include/miopen/conv/heuristics/ai_heuristics.hpp b/src/include/miopen/conv/heuristics/ai_heuristics.hpp index dca891515d..7da9497070 100644 --- a/src/include/miopen/conv/heuristics/ai_heuristics.hpp +++ b/src/include/miopen/conv/heuristics/ai_heuristics.hpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -72,7 +71,7 @@ struct Metadata }; class Model; std::vector PredictSolver(const ProblemDescription& problem, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const std::string& device); } // namespace immed_mode diff --git a/src/include/miopen/conv/invokers/impl_gemm.hpp b/src/include/miopen/conv/invokers/impl_gemm.hpp index fcc4666fd1..23c5afddc5 100644 --- a/src/include/miopen/conv/invokers/impl_gemm.hpp +++ b/src/include/miopen/conv/invokers/impl_gemm.hpp @@ -28,7 +28,7 @@ #include #include -#include +#include #include diff --git a/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp b/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp index e2d329b0a9..b1a0e426a0 100644 --- a/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp +++ b/src/include/miopen/conv/invokers/impl_gemm_dynamic.hpp @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -223,11 +222,11 @@ MakeImplGemmDynamicBackwardDataInvokerFactory -#include +#include namespace miopen { namespace conv { diff --git a/src/include/miopen/conv/solver_finders.hpp b/src/include/miopen/conv/solver_finders.hpp index 69425f09a7..4be112b0fb 100644 --- a/src/include/miopen/conv/solver_finders.hpp +++ b/src/include/miopen/conv/solver_finders.hpp @@ -27,8 +27,9 @@ #pragma once #include -#include +#include #include +#include #include #include @@ -44,7 +45,7 @@ class SolversFinder virtual AlgorithmName GetAlgorithmName(const conv::ProblemDescription& ptroblem) const = 0; - inline std::vector Find(const ConvolutionContext& ctx, + inline std::vector Find(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool use_winograd_only) const @@ -68,10 +69,10 @@ class SolversFinder } protected: - virtual bool IsEnabled(const ConvolutionContext& ctx, + virtual bool IsEnabled(const ExecutionContext& ctx, const conv::ProblemDescription& problem, bool use_winograd_only) const = 0; - virtual std::vector FindImpl(const ConvolutionContext& ctx, + virtual std::vector FindImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, bool use_winograd_only) const = 0; @@ -81,7 +82,7 @@ const std::vector>& GetConvSolverFinders(); void ConvFindCore(const AnyInvokeParams& invoke_ctx, DbRecord& record, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, bool use_winograd_only, const std::vector>& finders); diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp index 5e0507ddb6..bac0133106 100644 --- a/src/include/miopen/convolution.hpp +++ b/src/include/miopen/convolution.hpp @@ -64,7 +64,7 @@ struct ConvSolution; struct AnyInvokeParams; struct ExecutionContext; -struct ConvolutionContext; +struct ExecutionContext; struct Handle; struct TensorDescriptor; struct ProblemDescription; @@ -208,7 +208,7 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor 
const TensorDescriptor& wDesc, miopenDataType_t yType = miopenFloat) const; - bool IsWinograd3x3SupportedAndFast(const miopen::ConvolutionContext& ctx, + bool IsWinograd3x3SupportedAndFast(const miopen::ExecutionContext& ctx, const ProblemDescription& problem) const; std::size_t GetWorkSpaceSize(ExecutionContext ctx, @@ -229,15 +229,15 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor bool exhaustiveSearch) const; std::vector - FindWinogradSolutions(const ConvolutionContext& ctx, + FindWinogradSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const; std::vector - FindWinogradSolutions(const ConvolutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; + FindWinogradSolutions(const ExecutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; std::vector - FindDataGemmSolutions(const ConvolutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; + FindDataGemmSolutions(const ExecutionContext& ctx, const AnyInvokeParams& invoke_ctx) const; std::vector FindDataImplicitGemmSolutions(Handle& handle, @@ -249,7 +249,7 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor const AnyInvokeParams& invoke_ctx) const; std::vector - FindFftSolutions(const ConvolutionContext& ctx, + FindFftSolutions(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const; @@ -395,7 +395,7 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor FindMode findMode; ConvolutionAttribute attribute; - std::vector GetSolutionsFallback(const ExecutionContext& exec_ctx, + std::vector GetSolutionsFallback(const ExecutionContext& ctx, const conv::ProblemDescription& problem, size_t maxSolutionCount) const; diff --git a/src/include/miopen/execution_context.hpp b/src/include/miopen/execution_context.hpp index 83b9abdc1a..d2195d6061 100644 --- a/src/include/miopen/execution_context.hpp +++ b/src/include/miopen/execution_context.hpp @@ -90,6 +90,7 @@ struct ExecutionContext // performance config. 
bool disable_perfdb_access = false; bool use_dynamic_solutions_only = false; + bool is_for_generic_search = false; inline Handle& GetStream() const { return *stream; } inline void SetStream(Handle* stream_) { stream = stream_; } @@ -283,6 +284,10 @@ struct ExecutionContext void DetectRocm(); }; +struct [[deprecated]] ConvolutionContext : ExecutionContext +{ +}; + bool IsHipKernelsEnabled(); } // namespace miopen diff --git a/src/include/miopen/fusion/context.hpp b/src/include/miopen/fusion/context.hpp index cfa38f36f5..43190e6807 100644 --- a/src/include/miopen/fusion/context.hpp +++ b/src/include/miopen/fusion/context.hpp @@ -26,20 +26,23 @@ #pragma once +#include +#include + namespace miopen { -struct FusionContext : miopen::ExecutionContext +struct Handle; + +struct FusionContext : ExecutionContext { explicit FusionContext(Handle& handle) : ExecutionContext(&handle) {} - ConvolutionContext GetConvContext(const miopen::ProblemDescription& conv_problem) const + ExecutionContext GetConvContext(const ProblemDescription& conv_problem) const { - auto ctx = ConvolutionContext{*this}; + auto ctx = ExecutionContext{*this}; conv_problem.SetupFloats(ctx); return ctx; } - - bool is_for_generic_search = false; }; } // namespace miopen diff --git a/src/include/miopen/fusion/solvers.hpp b/src/include/miopen/fusion/solvers.hpp index f5621e72dc..dd8f2df494 100644 --- a/src/include/miopen/fusion/solvers.hpp +++ b/src/include/miopen/fusion/solvers.hpp @@ -152,9 +152,10 @@ struct ConvBiasActivAsm1x1U : FusionTunableSolver #include -#include #include #include +#include #include #include #include @@ -255,7 +255,7 @@ using RunAndMeasure_t = std::declval(), std::declval(), std::declval(), - std::declval(), + std::declval(), std::declval(), std::declval())); diff --git a/src/include/miopen/mlo_internal.hpp b/src/include/miopen/mlo_internal.hpp index c3a00fc3ee..f5e7d2fb83 100644 --- a/src/include/miopen/mlo_internal.hpp +++ b/src/include/miopen/mlo_internal.hpp @@ -68,7 +68,7 @@ POSSIBILITY OF SUCH DAMAGE. 
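[Editor's note: an illustrative sketch, not part of the patch. The [[deprecated]] ConvolutionContext shim introduced in execution_context.hpp above keeps out-of-tree code that still spells ConvolutionContext compiling while the compiler steers it toward ExecutionContext; the function names below are hypothetical.]

#include <miopen/execution_context.hpp>

void legacy_caller(miopen::Handle& handle)
{
    miopen::ConvolutionContext ctx; // still compiles, but warns that 'ConvolutionContext' is deprecated
    ctx.SetStream(&handle);         // the inherited ExecutionContext interface is unchanged
}

void updated_caller(miopen::Handle& handle)
{
    miopen::ExecutionContext ctx{&handle}; // the spelling this patch migrates callers to
}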
#else #include #endif -#include +#include #include #include #include @@ -179,74 +179,74 @@ auto mloConstruct(T& x) -> decltype(x.mloConstruct(), void()) } std::vector -FindAllGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector> -AllGemmWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllGemmWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -AllDirectForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindAllImplicitGemmWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindAllWinogradWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllWinogradWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindWinogradWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindWinogradWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -FindImplicitGemmWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -AllDirectBwdWrW2DWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectBwdWrW2DWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector> -AllFFTForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllFFTForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem); std::vector -FindAllDirectSolutions(const miopen::ConvolutionContext& ctx, +FindAllDirectSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllImplicitGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllWinogradSolutions(const miopen::ConvolutionContext& ctx, +FindAllWinogradSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindWinogradWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindWinogradWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindImplicitGemmWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllBwdWrW2DSolutions(const miopen::ConvolutionContext& ctx, +FindAllBwdWrW2DSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx); std::vector -FindAllFFTSolutions(const miopen::ConvolutionContext& ctx, +FindAllFFTSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const 
miopen::AnyInvokeParams& invoke_ctx); @@ -275,7 +275,7 @@ struct mlo_construct_base protected: miopen::ProblemDescriptionCompatTemporary _problem; - miopen::ConvolutionContext _ctx; + miopen::ExecutionContext _ctx; }; #define MLO_POOLING_OP_AVE 0 diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 1ed699bf32..ce40d6f081 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -114,7 +114,7 @@ struct SolverBase /// * @see https://github.com/ROCmSoftwarePlatform/MIOpen/issues/410 virtual float GetWti(const ExecutionContext& ctx, const boost::any& problem) const = 0; - // Returns the workspace size required by the solver for a given ConvolutionContext + // Returns the workspace size required by the solver for a given ExecutionContext virtual size_t GetWorkspaceSize(const ExecutionContext& ctx, const boost::any& problem) const = 0; @@ -181,10 +181,10 @@ struct NonTunableSolverBase : SolverMixin }; /// Typedef for convolution solvers -using ConvSolver = NonTunableSolverBase; +using ConvSolver = NonTunableSolverBase; /// Base class for tunable solvers -struct ConvTunableSolverBase : SolverMixin +struct ConvTunableSolverBase : SolverMixin { /// Initializes performance config to the default values. /// The function may involve some heuristic to guess the best solution @@ -195,13 +195,13 @@ struct ConvTunableSolverBase : SolverMixin{}, "PerformanceConfig must be derived of PerfConfig"); - virtual PerformanceConfig GetDefaultPerformanceConfig(const ConvolutionContext&, + virtual PerformanceConfig GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const = 0; - virtual bool IsValidPerformanceConfig(const ConvolutionContext&, + virtual bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfig&) const = 0; virtual PerformanceConfig - Search(const ConvolutionContext&, const ProblemDescription&, const AnyInvokeParams&) const = 0; - virtual ConvSolution GetSolution(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams&) const = 0; + virtual ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, - const PerformanceConfig&) const = 0; + const PerformanceConfig&) const = 0; - boost::any GetDefaultPerformanceConfig(const ConvolutionContext& ctx, + boost::any GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem, int) const final { return GetDefaultPerformanceConfig(ctx, problem); } - bool IsValidPerformanceConfig(const ConvolutionContext& ctx, + bool IsValidPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem, const PerfConfig& config) const final { @@ -253,7 +253,7 @@ struct ConvTunableSolver : ConvTunableSolverBase ctx, problem, dynamic_cast(config)); } - boost::any Search(const ConvolutionContext& ctx, + boost::any Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx, int) const final @@ -261,7 +261,7 @@ struct ConvTunableSolver : ConvTunableSolverBase return Search(ctx, problem, invoke_ctx); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerfConfig& config) const final { @@ -290,7 +290,7 @@ struct PerformanceConfigConvAsm3x3U : PerfConfigBase { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const 
ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConfigConvAsm3x3U - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm3x3U&) const override; - PerformanceConfigConvAsm3x3U Search(const ConvolutionContext&, + PerformanceConfigConvAsm3x3U Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm3x3U&) const override; }; @@ -364,15 +363,15 @@ struct PerformanceConfigConvAsm1x1U : PerfConfigBase const std::string& SolverDbId() const override { return GetSolverDbId(); } PerformanceConfigConvAsm1x1U - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1U&) const override; - PerformanceConfigConvAsm1x1U Search(const ConvolutionContext&, + PerformanceConfigConvAsm1x1U Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1U&) const override; }; @@ -470,7 +468,7 @@ struct PerformanceConfigConvAsm1x1UV2 : PerfConfigBase const std::string& SolverDbId() const override { return GetSolverDbId(); } PerformanceConfigConvAsm1x1UV2 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1UV2&) const override; - PerformanceConfigConvAsm1x1UV2 Search(const ConvolutionContext&, + PerformanceConfigConvAsm1x1UV2 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsm1x1UV2&) const override; }; @@ -504,20 +501,10 @@ struct ConvAsm5x10u2v2f1 final : 
ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsm5x10u2v2b1 final : ConvSolver @@ -527,20 +514,10 @@ struct ConvAsm5x10u2v2b1 final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsm7x7c3h224w224k64u2v2p3q3f1 final : ConvSolver @@ -553,20 +530,10 @@ struct ConvAsm7x7c3h224w224k64u2v2p3q3f1 final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvOclDirectFwd11x11 final : ConvSolver @@ -576,16 +543,16 @@ struct ConvOclDirectFwd11x11 final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct ConvOclDirectFwdGen final : ConvSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const 
ExecutionContext&, const ProblemDescription&) const override; }; struct PerformanceImplicitGemm : PerfConfigBase @@ -646,10 +613,10 @@ struct PerformanceImplicitGemm : PerfConfigBase f(self.WeiBlockCopyClusterLengths_K, "WeiBlockCopyClusterLengths_K"); } - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool IsValidValue() const; bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool operator==(const PerformanceImplicitGemm& other) const; }; @@ -684,7 +651,7 @@ struct PerformanceImplicitGemmV4R1 : public PerformanceImplicitGemm PerformanceImplicitGemmV4R1(bool spare) : PerformanceImplicitGemm(spare) {} - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; }; struct PerformanceImplicitGemmV4R4Fwd : PerfConfigBase @@ -735,12 +702,12 @@ struct PerformanceImplicitGemmV4R4Fwd : PerfConfigBase CalculateLdsNumberOfByte(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } bool IsValid(const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -792,12 +759,12 @@ struct PerformanceImplicitGemmV4R4WrW : PerfConfigBase CalculateLdsNumberOfByte(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } bool IsValid(const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -841,22 +808,22 @@ struct PerformanceImplicitGemmBwdDataV1R1 : PerfConfigBase CalculateGridSize(const ConvolutionContext&, + std::tuple CalculateGridSize(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockGemmPerformanceParameters() const; std::tuple - CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext&, + CalculateGemmABlockCopyPerformanceParameters(const ExecutionContext&, const ProblemDescription&) const; std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext&, + CalculateGemmBBlockCopyPerformanceParameters(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateGemmCThreadCopyPerformanceParameters(const ProblemDescription&) const; - std::tuple CalculateLdsNumberOfByte(const ConvolutionContext&, + std::tuple CalculateLdsNumberOfByte(const ExecutionContext&, const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; 
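[Editor's note: an illustrative sketch, not part of the patch. Several perf-configs above, e.g. PerformanceImplicitGemmV4R4Fwd and PerformanceImplicitGemmV4R4WrW, keep a two-argument IsValid that ignores the ExecutionContext and forwards to the one-argument overload, so the generic tuning machinery can pass a context uniformly. PerformanceConfigExample and its tile_size field below are hypothetical.]

#include <miopen/execution_context.hpp>
#include <miopen/problem_description.hpp>

struct PerformanceConfigExample
{
    int tile_size = 64; // hypothetical tunable parameter

    // A real config would validate tile_size against the problem geometry.
    bool IsValid(const miopen::ProblemDescription& problem) const
    {
        (void)problem;
        return tile_size > 0;
    }

    // Context-taking overload required by the shared tuning interface; the
    // context is intentionally unused, matching the forwarding style above.
    bool IsValid(const miopen::ExecutionContext&, const miopen::ProblemDescription& problem) const
    {
        return IsValid(problem);
    }
};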
@@ -910,12 +877,12 @@ struct PerformanceImplicitGemmBwdDataV4R1 : PerfConfigBase CalculateLdsNumberOfByte(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } bool IsValid(const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -968,10 +935,10 @@ struct PerformanceImplicitGemmBwdDataV4R1Xdlops std::tuple CalculateGemmBBlockCopyPerformanceParameters(const ProblemDescription&) const; bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); }; @@ -983,16 +950,15 @@ struct ConvHipImplicitGemmV4R1Fwd final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmV4R4Fwd - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4Fwd&) const override; - PerformanceImplicitGemmV4R4Fwd Search(const ConvolutionContext&, + PerformanceImplicitGemmV4R4Fwd Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4Fwd&) const override; @@ -1066,7 +1031,7 @@ struct PerformanceConvMlirIgemm : PerfConfigBase f(self.GemmNPerThread, "GemmNPerThread"); } - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool SetNextValue(const ProblemDescription&); private: @@ -1077,16 +1042,16 @@ struct ConvMlirIgemmFwd final : ConvTunableSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; - PerformanceConvMlirIgemm Search(const 
ConvolutionContext&, + PerformanceConvMlirIgemm Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; }; @@ -1138,7 +1103,7 @@ struct PerformanceConvMlirIgemmXdlops : PerfConfigBase(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConvMlirIgemmXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; - PerformanceConvMlirIgemmXdlops Search(const ConvolutionContext&, + PerformanceConvMlirIgemmXdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; }; @@ -1174,17 +1138,16 @@ struct ConvHipImplicitGemmV4R4WrW final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmV4R4WrW - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4WrW&) const override; - PerformanceImplicitGemmV4R4WrW Search(const ConvolutionContext&, + PerformanceImplicitGemmV4R4WrW Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmV4R4WrW&) const override; @@ -1198,16 +1161,16 @@ struct ConvMlirIgemmWrW final : ConvTunableSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; - PerformanceConvMlirIgemm Search(const ConvolutionContext&, + PerformanceConvMlirIgemm Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const 
ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; }; @@ -1219,19 +1182,18 @@ struct ConvMlirIgemmWrWXdlops final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConvMlirIgemmXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; - PerformanceConvMlirIgemmXdlops Search(const ConvolutionContext&, + PerformanceConvMlirIgemmXdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; }; @@ -1269,12 +1231,12 @@ struct PerformanceImplicitGemmForwardV4R4Xdlops bool operator==(const PerformanceImplicitGemmForwardV4R4Xdlops& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1326,12 +1288,12 @@ struct PerformanceImplicitGemmForwardV4R5Xdlops bool operator==(const PerformanceImplicitGemmForwardV4R5Xdlops& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1385,12 +1347,12 @@ struct PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm bool operator==(const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; 
- bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool IsReallyValid(const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1431,12 +1393,12 @@ struct PerformanceImplicitGemmBwdV1R1Xdlops : PerfConfigBase CalculateBlockSize() const; std::tuple CalculateGridSize(const ProblemDescription&) const; @@ -1456,17 +1418,16 @@ struct ConvHipImplicitGemmForwardV4R4Xdlops final } PerformanceImplicitGemmForwardV4R4Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; PerformanceImplicitGemmForwardV4R4Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; @@ -1485,19 +1446,18 @@ struct ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm final } PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm&) const override; PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; @@ -1517,17 +1477,16 @@ struct ConvHipImplicitGemmForwardV4R5Xdlops final } PerformanceImplicitGemmForwardV4R5Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R5Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool 
IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R5Xdlops&) const override; PerformanceImplicitGemmForwardV4R5Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; }; @@ -1540,16 +1499,15 @@ struct ConvHipImplicitGemmV4R1WrW final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmBwdDataV1R1 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV1R1&) const override; - PerformanceImplicitGemmBwdDataV1R1 Search(const ConvolutionContext&, + PerformanceImplicitGemmBwdDataV1R1 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV1R1&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } private: - static std::tuple CalculateGemmSize(const ConvolutionContext&, + static std::tuple CalculateGemmSize(const ExecutionContext&, const ProblemDescription&); friend struct PerformanceImplicitGemmBwdDataV1R1; @@ -1588,16 +1545,16 @@ struct ConvMlirIgemmBwd final : ConvTunableSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + PerformanceConvMlirIgemm GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; - PerformanceConvMlirIgemm Search(const ConvolutionContext&, + PerformanceConvMlirIgemm Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemm&) const override; }; @@ -1609,17 +1566,16 @@ struct ConvMlirIgemmBwdXdlops final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceConvMlirIgemmXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const 
ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; - PerformanceConvMlirIgemmXdlops Search(const ConvolutionContext&, + PerformanceConvMlirIgemmXdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvMlirIgemmXdlops&) const override; }; @@ -1631,17 +1587,16 @@ struct ConvHipImplicitGemmBwdDataV4R1 final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; PerformanceImplicitGemmBwdDataV4R1 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1&) const override; - PerformanceImplicitGemmBwdDataV4R1 Search(const ConvolutionContext&, + PerformanceImplicitGemmBwdDataV4R1 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1&) const override; @@ -1661,17 +1616,16 @@ struct ConvHipImplicitGemmBwdDataV4R1Xdlops final } PerformanceImplicitGemmBwdDataV4R1Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdDataV4R1Xdlops&) const override; PerformanceImplicitGemmBwdDataV4R1Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; @@ -1691,18 +1645,17 @@ struct ConvHipImplicitGemmBwdDataV1R1Xdlops final } PerformanceImplicitGemmBwdV1R1Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdV1R1Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) 
const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - PerformanceImplicitGemmBwdV1R1Xdlops Search(const ConvolutionContext&, + PerformanceImplicitGemmBwdV1R1Xdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmBwdV1R1Xdlops&) const override; @@ -1722,23 +1675,13 @@ struct ConvAsmImplicitGemmV4R1DynamicFwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmV4R1DynamicFwd_1x1 final : ConvSolver @@ -1751,23 +1694,13 @@ struct ConvAsmImplicitGemmV4R1DynamicFwd_1x1 final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmV4R1DynamicWrw final : ConvSolver @@ -1781,32 +1714,18 @@ struct ConvAsmImplicitGemmV4R1DynamicWrw final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const 
ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmGTCDynamicWrwXdlops final : ConvSolver @@ -1820,32 +1739,18 @@ struct ConvAsmImplicitGemmGTCDynamicWrwXdlops final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmV4R1DynamicBwd final : ConvSolver @@ -1858,23 +1763,13 @@ struct ConvAsmImplicitGemmV4R1DynamicBwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvAsmImplicitGemmGTCDynamicFwdXdlops final : ConvSolver @@ -1887,23 +1782,13 @@ struct ConvAsmImplicitGemmGTCDynamicFwdXdlops final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const 
override; }; struct ConvAsmImplicitGemmGTCDynamicBwdXdlops final : ConvSolver @@ -1916,54 +1801,44 @@ struct ConvAsmImplicitGemmGTCDynamicBwdXdlops final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; /// Holds common member functions for the Solvers which share the same /// "legacy exhaustive search" machinery. struct ConvOclDirectFwdLegacyExhaustiveSearch : ConvTunableSolver { - LegacyPerformanceConfig GetDefaultPerformanceConfig(const ConvolutionContext&, + LegacyPerformanceConfig GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - LegacyPerformanceConfig Search(const ConvolutionContext&, + LegacyPerformanceConfig Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; private: template - LegacyPerformanceConfig SearchImpl(const ConvolutionContext&, + LegacyPerformanceConfig SearchImpl(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const; }; struct ConvOclDirectFwd : ConvOclDirectFwdLegacyExhaustiveSearch { - static ConvSolution BaseGetSolution(const ConvolutionContext& ctx, + static ConvSolution BaseGetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const LegacyPerformanceConfig& config); const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override; }; @@ -1972,12 +1847,12 @@ struct ConvOclDirectFwd1x1 final : ConvOclDirectFwdLegacyExhaustiveSearch { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const LegacyPerformanceConfig&) const override { @@ -1992,23 +1867,13 @@ struct ConvBinWinograd3x3U final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool 
IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct ConvBinWinogradRxS final : ConvSolver @@ -2018,23 +1883,13 @@ struct ConvBinWinogradRxS final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct PerformanceConfigConvBinWinogradRxS : PerfConfigBase @@ -2052,14 +1907,14 @@ struct PerformanceConfigConvBinWinogradRxS : PerfConfigBase - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool IsValidValue() const; bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext& ctx, const ProblemDescription&) const + bool IsValid(const ExecutionContext& ctx, const ProblemDescription&) const { return IsValid(ctx); } - bool IsValid(const ConvolutionContext&) const; + bool IsValid(const ExecutionContext&) const; bool operator==(const PerformanceConfigConvBinWinogradRxS& other) const; }; @@ -2078,17 +1933,16 @@ struct ConvBinWinoRxS final : ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + float GetWti(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; template @@ -2135,11 +1989,11 @@ struct ConvMPBidirectWinograd final : ConvSolver ConvMPBidirectWinograd>(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + size_t 
GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; // kernel_file_name for solver identification static std::string GetSolverFileNames(int id) @@ -2191,7 +2045,7 @@ struct ConvMPBidirectWinograd_xdlops final ConvMPBidirectWinograd_xdlops>(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { @@ -2202,7 +2056,7 @@ struct ConvMPBidirectWinograd_xdlops final } PerformanceImplicitGemmForwardV4R4Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext& ctx, + GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const override { const auto xdlops_problem = GetTransformedProblem(problem); @@ -2213,7 +2067,7 @@ struct ConvMPBidirectWinograd_xdlops final } bool - IsValidPerformanceConfig(const ConvolutionContext& ctx, + IsValidPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops& config) const override { @@ -2224,7 +2078,7 @@ struct ConvMPBidirectWinograd_xdlops final xdlops_ctx, xdlops_problem, config); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { const auto xdlops_problem = GetTransformedProblem(problem); @@ -2238,17 +2092,16 @@ struct ConvMPBidirectWinograd_xdlops final bool MayNeedWorkspace() const override { return true; } PerformanceImplicitGemmForwardV4R4Xdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; private: - ConvolutionContext - GetTransformedConvContext(const ConvolutionContext& ctx, - const ProblemDescription& transformed_problem) const; + ExecutionContext GetTransformedConvContext(const ExecutionContext& ctx, + const ProblemDescription& transformed_problem) const; ProblemDescription GetTransformedProblem(const ProblemDescription& problem) const; // kernel_file_name for solver identification @@ -2302,27 +2155,18 @@ struct ConvWinograd3x3MultipassWrW final : ConvSolver ConvWinograd3x3MultipassWrW>(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool IsDynamic() const override { return true; } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } + ConvSolution 
GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; // kernel_file_name for solver identification static std::string GetSolverFileNames(int id) @@ -2356,10 +2200,6 @@ struct ConvWinograd3x3MultipassWrW final : ConvSolver } private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; - InvokerFactory PrepareInvokerFactory(const ExecutionContext&, const ProblemDescription&, std::size_t ws_sz) const; @@ -2424,10 +2264,10 @@ struct PerformanceConfigAsmDirect3x3WrW : PerfConfigBase(); } PerformanceConfigAsmDirect3x3WrW - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmDirect3x3WrW&) const override; - PerformanceConfigAsmDirect3x3WrW Search(const ConvolutionContext&, + PerformanceConfigAsmDirect3x3WrW Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmDirect3x3WrW& config) const override; }; @@ -2458,11 +2297,11 @@ struct ConvWinoFuryRxS final : ConvSolver return GetSolverDbId>(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override; + float GetWti(const ExecutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; static constexpr bool is2x3() { return Winodata == 2 && Winofilter == 3; } static constexpr bool is3x2() { return Winodata == 3 && Winofilter == 2; } @@ -2564,10 +2403,10 @@ struct PerformanceConfigConvAsmBwdWrW1x1 : PerfConfigBase(); } PerformanceConfigConvAsmBwdWrW1x1 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsmBwdWrW1x1&) const override; - PerformanceConfigConvAsmBwdWrW1x1 Search(const ConvolutionContext&, + PerformanceConfigConvAsmBwdWrW1x1 Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, 
const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvAsmBwdWrW1x1&) const override; }; @@ -2648,7 +2486,7 @@ struct PerformanceConfigConvOclBwdWrw2 void HeuristicInit(const ProblemDescription&); bool IsValidValue() const; bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; bool operator==(const PerformanceConfigConvOclBwdWrw2& other) const; }; @@ -2661,25 +2499,24 @@ struct ConvOclBwdWrW2 : ConvTunableSolver PerformanceConfigConvOclBwdWrw2 - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool - IsValidPerformanceConfig(const ConvolutionContext&, + IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvOclBwdWrw2&) const override; PerformanceConfigConvOclBwdWrw2 - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigConvOclBwdWrw2&) const override; protected: - bool IsApplicableBase(const ConvolutionContext&, const ProblemDescription&) const; + bool IsApplicableBase(const ExecutionContext&, const ProblemDescription&) const; }; // To suppress misleading clang warnings @@ -2715,8 +2552,8 @@ struct ConvOclBwdWrW2NonTunable final : ConvOclBwdWrW2<1> return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; private: // This function derived from ConvOclBwdWrW2 is declared private @@ -2729,20 +2566,20 @@ struct ConvOclBwdWrW53 final : ConvSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const
override; }; struct ConvOclBwdWrW1x1 final : ConvSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct fft final : ConvSolver @@ -2753,30 +2590,16 @@ struct fft final : ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return IsApplicable(static_cast(ctx), problem); - } + bool IsApplicable(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; - size_t GetWorkspaceSize(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetWorkspaceSize(static_cast(ctx), problem); - } + size_t GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override - { - return GetSolution(static_cast(ctx), problem); - } - -private: - bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const; - size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const; - ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const; + ConvSolution GetSolution(const ExecutionContext& ctx, + const ProblemDescription& problem) const override; }; struct PerformanceImplicitGemmWrwV4R4Xdlops : PerfConfigBase @@ -2814,17 +2637,17 @@ struct PerformanceImplicitGemmWrwV4R4Xdlops : PerfConfigBase - CalculateGemmSizeAndGemmKBlock(const ConvolutionContext&, const ProblemDescription&) const; + CalculateGemmSizeAndGemmKBlock(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; - std::tuple CalculateGridSize(const ConvolutionContext&, + std::tuple CalculateGridSize(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateGemmABlockCopyPerformanceParameters(const ProblemDescription&) const; @@ -2842,18 +2665,17 @@ struct ConvHipImplicitGemmWrwV4R4Xdlops final } PerformanceImplicitGemmWrwV4R4Xdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - ConvSolution GetSolution(const 
ConvolutionContext&, + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops&) const override; - PerformanceImplicitGemmWrwV4R4Xdlops Search(const ConvolutionContext&, + PerformanceImplicitGemmWrwV4R4Xdlops Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; }; @@ -2899,17 +2721,17 @@ struct PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm bool operator==(const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm& other) const; - void HeuristicInit(const ConvolutionContext&, const ProblemDescription&); + void HeuristicInit(const ExecutionContext&, const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription&) const; - bool IsReallyValid(const ConvolutionContext&, const ProblemDescription&) const; - bool IsFastToBeUsedForTuning(const ConvolutionContext&, const ProblemDescription&) const; + bool IsValid(const ExecutionContext&, const ProblemDescription&) const; + bool IsReallyValid(const ExecutionContext&, const ProblemDescription&) const; + bool IsFastToBeUsedForTuning(const ExecutionContext&, const ProblemDescription&) const; std::tuple - CalculateGemmSizeAndGemmKBlock(const ConvolutionContext&, const ProblemDescription&) const; + CalculateGemmSizeAndGemmKBlock(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateBlockSize() const; - std::tuple CalculateGridSize(const ConvolutionContext&, + std::tuple CalculateGridSize(const ExecutionContext&, const ProblemDescription&) const; std::tuple CalculateGemmABlockCopyPerformanceParameters(const ProblemDescription&) const; @@ -2927,21 +2749,20 @@ struct ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm final } PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm&) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm&) const override; PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; }; @@ -2964,7 +2785,7 @@ struct PerformanceConvCkIgemmFwdV6r1DlopsNchw } bool SetNextValue(const ProblemDescription&); - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -2982,20 +2803,19 @@ struct ConvCkIgemmFwdV6r1DlopsNchw final : 
ConvTunableSolver(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; - size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, const ProblemDescription&) const override; bool MayNeedWorkspace() const override { return true; } bool IsDynamic() const override { return false; } PerformanceConvCkIgemmFwdV6r1DlopsNchw - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConvCkIgemmFwdV6r1DlopsNchw&) const override; - PerformanceConvCkIgemmFwdV6r1DlopsNchw Search(const ConvolutionContext&, + PerformanceConvCkIgemmFwdV6r1DlopsNchw Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConvCkIgemmFwdV6r1DlopsNchw&) const override; }; @@ -3007,15 +2827,15 @@ struct ConvDirectNaiveConvFwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } /// Use very small fixed value enough to backup GEMM for cases when /// GEMM is disabled due to MIOpenGemm or OCL compiler issues. - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.01f; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct ConvDirectNaiveConvBwd final : ConvSolver @@ -3025,15 +2845,15 @@ struct ConvDirectNaiveConvBwd final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } /// Use very small fixed value enough to backup GEMM for cases when /// GEMM is disabled due to MIOpenGemm or OCL compiler issues. 
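Aside: the fixed GetWti() values in these naive solvers implement a crude priority, not a real time estimate. When nothing has been benchmarked, the applicable solver reporting the largest WTI wins the fallback ordering, so a constant 0.01f keeps the naive kernels ranked above "no solution" but below anything else. The sketch below illustrates only that selection rule; FallbackOption and PickFallback are invented stand-ins, not MIOpen API.

#include <algorithm>
#include <string>
#include <vector>

// Illustrative only: simplified stand-ins, not MIOpen's Solver interface.
struct FallbackOption
{
    std::string solver; // solver db id
    float wti;          // value returned by GetWti(); higher ranks first
};

// Pick the applicable solver with the largest WTI estimate.
inline const FallbackOption* PickFallback(const std::vector<FallbackOption>& options)
{
    const auto best = std::max_element(
        options.begin(), options.end(), [](const FallbackOption& a, const FallbackOption& b) {
            return a.wti < b.wti;
        });
    return best == options.end() ? nullptr : &*best;
}

// With the constants in this header, {"ConvDirectNaiveConvFwd", 0.01f} loses
// to any solver reporting 0.02f, so the naive kernels stay a last resort.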
- float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.01f; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct ConvDirectNaiveConvWrw final : ConvSolver @@ -3043,15 +2863,15 @@ struct ConvDirectNaiveConvWrw final : ConvSolver return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } /// Use very small fixed value enough to backup GEMM for cases when /// GEMM is disabled due to MIOpenGemm or OCL compiler issues. - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.01f; } - ConvSolution GetSolution(const ConvolutionContext&, const ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override; }; struct GemmFwdBase : ConvSolver @@ -3061,14 +2881,14 @@ struct GemmFwdBase : ConvSolver using ConvSolver::IsApplicable; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWti(static_cast(ctx), problem); + return GetWti(ctx, static_cast(problem)); } private: bool IsApplicable(const ExecutionContext&, const conv::ProblemDescription&) const; - float GetWti(const ExecutionContext& context, const conv::ProblemDescription& problem) const; + float GetWti(const ExecutionContext& ctx, const conv::ProblemDescription& problem) const; friend struct GemmFwd1x1_0_2; friend struct GemmFwd1x1_0_1_int8; @@ -3083,24 +2903,23 @@ struct GemmFwd1x1_0_2 final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3118,24 +2937,23 @@ struct GemmFwd1x1_0_1_int8 final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool 
MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3153,24 +2971,23 @@ struct GemmFwd1x1_0_1 final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3188,24 +3005,23 @@ struct GemmFwdRest final : GemmFwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3221,9 +3037,9 @@ struct GemmBwdBase : ConvSolver using ConvSolver::IsApplicable; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWti(static_cast(ctx), problem); + return GetWti(ctx, static_cast(problem)); } private: @@ -3243,24 +3059,23 @@ struct GemmBwd1x1_stride2 final : GemmBwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, 
static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3278,24 +3093,23 @@ struct GemmBwd1x1_stride1 final : GemmBwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3315,24 +3129,23 @@ struct GemmBwdRest final : GemmBwdBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3348,9 +3161,9 @@ struct GemmWrwBase : ConvSolver using ConvSolver::IsApplicable; bool IsDynamic() const override { return true; } - float GetWti(const ConvolutionContext& ctx, const ProblemDescription& problem) const override + float GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWti(static_cast(ctx), problem); + return GetWti(ctx, static_cast(problem)); } private: @@ -3368,16 +3181,15 @@ struct GemmWrw1x1_stride1 final : GemmWrwBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return 
IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3394,24 +3206,23 @@ struct GemmWrwUniversal final : GemmWrwBase const std::string& SolverDbId() const override { return GetSolverDbId(); } - size_t GetWorkspaceSize(const ConvolutionContext& ctx, + size_t GetWorkspaceSize(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetWorkspaceSize(static_cast(ctx), problem); + return GetWorkspaceSize(ctx, static_cast(problem)); } bool MayNeedWorkspace() const override { return true; } - bool IsApplicable(const ConvolutionContext& ctx, - const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return IsApplicable(static_cast(ctx), problem); + return IsApplicable(ctx, static_cast(problem)); } - ConvSolution GetSolution(const ConvolutionContext& ctx, + ConvSolution GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const override { - return GetSolution(static_cast(ctx), problem); + return GetSolution(ctx, static_cast(problem)); } private: @@ -3610,16 +3421,16 @@ struct PerformanceConfigAsmImplicitGemmGTC : PerfConfigBase(); } PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC&) const override; PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC&) const override; }; @@ -4600,7 +4407,7 @@ struct PerformanceConfigHipImplicitGemmFwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4623,18 +4430,17 @@ struct ConvHipImplicitGemmFwdXdlops final } PerformanceConfigHipImplicitGemmFwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; PerformanceConfigHipImplicitGemmFwdXdlops - Search(const ConvolutionContext&, + 
Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; /// \anchor igemm_get_wti_magic_number @@ -4648,7 +4454,7 @@ struct ConvHipImplicitGemmFwdXdlops final // Since we would like to use CK before naive, and use it instead (because // we do expect that CK is faster than Naive), therefore we use a // value bigger than 0.01f, e.g. 0.02f. - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4679,7 +4485,7 @@ struct PerformanceConfigHipImplicitGemmBwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4702,22 +4508,21 @@ struct ConvHipImplicitGemmBwdXdlops final } PerformanceConfigHipImplicitGemmBwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmBwdXdlops&) const override; PerformanceConfigHipImplicitGemmBwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmBwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4748,7 +4553,7 @@ struct PerformanceConfigHipImplicitGemmGroupFwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4771,23 +4576,22 @@ struct ConvHipImplicitGemmGroupFwdXdlops final } PerformanceConfigHipImplicitGemmGroupFwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool - IsValidPerformanceConfig(const
ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmGroupFwdXdlops&) const override; PerformanceConfigHipImplicitGemmGroupFwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } - ConvSolution GetSolution(const ConvolutionContext&, + ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemmGroupFwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4818,7 +4622,7 @@ struct PerformanceConfigHipImplicitGemm3DGroupFwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4841,24 +4645,23 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final } PerformanceConfigHipImplicitGemm3DGroupFwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; PerformanceConfigHipImplicitGemm3DGroupFwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4889,7 +4692,7 @@ struct PerformanceConfigHipImplicitGemm3DGroupWrwXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4917,24 +4720,23 @@ struct ConvHipImplicitGemm3DGroupWrwXdlops final } PerformanceConfigHipImplicitGemm3DGroupWrwXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const 
PerformanceConfigHipImplicitGemm3DGroupWrwXdlops&) const override; PerformanceConfigHipImplicitGemm3DGroupWrwXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -4965,7 +4767,7 @@ struct PerformanceConfigHipImplicitGemm3DGroupBwdXdlops void HeuristicInit(const ProblemDescription&); bool SetNextValue(const ProblemDescription&); bool IsValidValue() const; - bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const + bool IsValid(const ExecutionContext&, const ProblemDescription& problem) const { return IsValid(problem); } @@ -4993,24 +4795,23 @@ struct ConvHipImplicitGemm3DGroupBwdXdlops final } PerformanceConfigHipImplicitGemm3DGroupBwdXdlops - GetDefaultPerformanceConfig(const ConvolutionContext&, - const ProblemDescription&) const override; + GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override; bool IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops&) const override; PerformanceConfigHipImplicitGemm3DGroupBwdXdlops - Search(const ConvolutionContext&, + Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams& invoke_ctx) const override; - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override; + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override; bool IsDynamic() const override { return true; } ConvSolution - GetSolution(const ConvolutionContext&, + GetSolution(const ExecutionContext&, const ProblemDescription&, const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops&) const override; /// \ref igemm_get_wti_magic_number - float GetWti(const ConvolutionContext&, const ProblemDescription&) const override + float GetWti(const ExecutionContext&, const ProblemDescription&) const override { return 0.02f; }; @@ -5023,7 +4824,7 @@ struct ConvHipImplicitGemm3DGroupBwdXdlops final // Use struct as a syntactic sugar to make the intent as clear as possible. 
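Taken together, the solver.hpp hunks above are one mechanical transformation repeated per solver: the public virtual interface used to accept ConvolutionContext and forward to a private ExecutionContext overload through a static_cast; after the patch, ExecutionContext is the public parameter type, so both the wrapper and the private overload disappear. A condensed before/after sketch, using simplified stand-in types rather than the real MIOpen classes:

// Illustrative only: condensed stand-ins for the real MIOpen classes.
struct ExecutionContext { /* handle, build flags, ... */ };
struct ConvolutionContext : ExecutionContext { };
struct ProblemDescription { };

// Before: the public override took ConvolutionContext and forwarded to a
// private ExecutionContext overload through a static_cast.
struct SolverBefore
{
    bool IsApplicable(const ConvolutionContext& ctx, const ProblemDescription& problem) const
    {
        return IsApplicable(static_cast<const ExecutionContext&>(ctx), problem);
    }

private:
    bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const { return true; }
};

// After: ExecutionContext is the public parameter type, so the forwarding
// wrapper and the private overload are both gone.
struct SolverAfter
{
    bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const { return true; }
};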
struct ThisSolverIsDeprecatedStatic { - static bool IsDisabled(const ConvolutionContext& ctx); + static bool IsDisabled(const ExecutionContext& ctx); }; } // namespace solver diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index f05bbdf712..7bad52ff9e 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -26,7 +26,8 @@ #pragma once #include -#include +#include +#include namespace miopen { @@ -34,9 +35,9 @@ namespace solver { bool ConvDirectNaiveConvIsAssemblyKernel(const ExecutionContext&, const ProblemDescription&); std::string ConvDirectNaiveConvKernelName(const ProblemDescription&); -std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx, +std::string ConvDirectNaiveConvKernelFile(const ExecutionContext& ctx, const ProblemDescription& problem); -std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx, +std::string ConvDirectNaiveConvCompileOption(const ExecutionContext& ctx, const ProblemDescription& problem); bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext&, const ProblemDescription&); diff --git a/src/include/miopen/solver/implicitgemm_util.hpp b/src/include/miopen/solver/implicitgemm_util.hpp index 88262b4a32..d9aad50b98 100644 --- a/src/include/miopen/solver/implicitgemm_util.hpp +++ b/src/include/miopen/solver/implicitgemm_util.hpp @@ -419,7 +419,7 @@ static inline bool IsApplicableXdlops(const ExecutionContext& ctx, ///\todo remove template -inline static auto GetPerformanceConfigBase(const ConvolutionContext& ctx, +inline static auto GetPerformanceConfigBase(const ExecutionContext& ctx, const ProblemDescription& problem) { PerformanceImplicitGemm_t pp; @@ -459,7 +459,7 @@ static inline size_t ComputeLDSRequiredSize(const ProblemDescription& problem, return lds_size; } -static inline bool use_amd_inline_asm(const ConvolutionContext& ctx, +static inline bool use_amd_inline_asm(const ExecutionContext& ctx, const ProblemDescription& problem) { @@ -475,7 +475,7 @@ static inline bool use_amd_inline_asm(const ConvolutionContext& ctx, return !miopen::IsDisabled(MIOPEN_DEBUG_IMPLICIT_GEMM_NON_XDLOPS_INLINE_ASM{}); } -static inline bool is_use_amd_buffer_load_store(const ConvolutionContext& ctx) +static inline bool is_use_amd_buffer_load_store(const ExecutionContext& ctx) { #if WORKAROUND_MIOPEN_ISSUE_557 const auto device_name = ctx.GetStream().GetDeviceName(); @@ -485,7 +485,7 @@ static inline bool is_use_amd_buffer_load_store(const ConvolutionContext& ctx) #endif } -static inline bool is_use_v_fmac_f32(const ConvolutionContext& ctx) +static inline bool is_use_v_fmac_f32(const ExecutionContext& ctx) { const auto device_name = ctx.GetStream().GetDeviceName(); return StartsWith(device_name, "gfx103"); @@ -570,7 +570,7 @@ int amd_lds_write_max_length() constexpr std::size_t get_lds_max_number_of_byte() { return 65536; } -static inline auto get_static_ck_common_compiler_flag(const ConvolutionContext& ctx) +static inline auto get_static_ck_common_compiler_flag(const ExecutionContext& ctx) { auto compiler_flag = std::string(" --std=c++14"); @@ -601,7 +601,7 @@ static inline auto get_static_ck_common_compiler_flag(const ConvolutionContext& return compiler_flag; } -static inline bool IsComposableKernelSupportedHardware(const ConvolutionContext& c) +static inline bool IsComposableKernelSupportedHardware(const ExecutionContext& c) { return (StartsWith(c.GetStream().GetDeviceName(), "gfx803") && 
c.GetStream().GetMaxComputeUnits() == 64) || diff --git a/src/include/miopen/solver/mlir_common.hpp b/src/include/miopen/solver/mlir_common.hpp index ed5e289117..d926277e4a 100644 --- a/src/include/miopen/solver/mlir_common.hpp +++ b/src/include/miopen/solver/mlir_common.hpp @@ -27,7 +27,8 @@ #ifndef GUARD_MLIR_COMMON_HPP_ #define GUARD_MLIR_COMMON_HPP_ -#include +#include +#include #include @@ -37,13 +38,13 @@ namespace mlir { std::string GetKernelName(const ProblemDescription& problem, bool is_xdlops, int kernel_id = 0); -std::string ConstructBuildOptions(const ConvolutionContext& ctx, +std::string ConstructBuildOptions(const ExecutionContext& ctx, const ProblemDescription& problem, bool is_xdlops, int kernel_id = 0); template -std::string ConstructBuildOptions(const ConvolutionContext& ctx, +std::string ConstructBuildOptions(const ExecutionContext& ctx, const ProblemDescription& problem, const T& perf_config, bool is_xdlops, diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 8de5e6c87c..522f5931b5 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -206,7 +206,7 @@ static auto GetBwdWrW2DSolvers() static auto GetFFTSolvers() { return miopen::solver::SolverContainer{}; } std::vector -FindAllGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -214,14 +214,13 @@ FindAllGemmSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllGemmWorkspaceSize(const miopen::ConvolutionContext& ctx, - const miopen::ProblemDescription& problem) +AllGemmWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetGemmSolvers().GetWorkspaceSizes(ctx, problem); } std::vector -FindAllDirectSolutions(const miopen::ConvolutionContext& ctx, +FindAllDirectSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -229,28 +228,28 @@ FindAllDirectSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllDirectForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetDirectSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindAllWinogradWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllWinogradWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetWindogradSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindWinogradWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindWinogradWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetWindogradWrWSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindAllImplicitGemmWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { #if WORKAROUND_SWDEV_227826 @@ -264,7 +263,7 @@ FindAllImplicitGemmWorkspaceSizes(const miopen::ConvolutionContext& ctx, } std::vector -FindAllImplicitGemmSolutions(const miopen::ConvolutionContext& ctx, +FindAllImplicitGemmSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -280,7 +279,7 @@ FindAllImplicitGemmSolutions(const 
miopen::ConvolutionContext& ctx, } std::vector -FindAllWinogradSolutions(const miopen::ConvolutionContext& ctx, +FindAllWinogradSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -288,7 +287,7 @@ FindAllWinogradSolutions(const miopen::ConvolutionContext& ctx, } std::vector -FindWinogradWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindWinogradWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -296,14 +295,14 @@ FindWinogradWrWAllSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllDirectBwdWrW2DWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllDirectBwdWrW2DWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetBwdWrW2DSolvers().GetWorkspaceSizes(ctx, problem); } std::vector> -FindImplicitGemmWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWWorkspaceSizes(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { #if WORKAROUND_SWDEV_227826 @@ -317,7 +316,7 @@ FindImplicitGemmWrWWorkspaceSizes(const miopen::ConvolutionContext& ctx, } std::vector -FindImplicitGemmWrWAllSolutions(const miopen::ConvolutionContext& ctx, +FindImplicitGemmWrWAllSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -334,7 +333,7 @@ FindImplicitGemmWrWAllSolutions(const miopen::ConvolutionContext& ctx, } std::vector -FindAllBwdWrW2DSolutions(const miopen::ConvolutionContext& ctx, +FindAllBwdWrW2DSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -342,7 +341,7 @@ FindAllBwdWrW2DSolutions(const miopen::ConvolutionContext& ctx, } std::vector -FindAllFFTSolutions(const miopen::ConvolutionContext& ctx, +FindAllFFTSolutions(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem, const miopen::AnyInvokeParams& invoke_ctx) { @@ -350,7 +349,7 @@ FindAllFFTSolutions(const miopen::ConvolutionContext& ctx, } std::vector> -AllFFTForwardBackwardDataWorkspaceSize(const miopen::ConvolutionContext& ctx, +AllFFTForwardBackwardDataWorkspaceSize(const miopen::ExecutionContext& ctx, const miopen::ProblemDescription& problem) { return GetFFTSolvers().GetWorkspaceSizes(ctx, problem); diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 07e6f28b19..8c042e3e7b 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -113,12 +113,11 @@ static Invoker PrepareInvoker(ExecutionContext ctx, problem.SetupFloats(ctx); ctx.do_search = false; - const auto legacy_ctx = ConvolutionContext{ctx}; const auto legacy_problem = ProblemDescription{problem}; const auto solver = solver_id.GetSolver(); auto db = GetDb(ctx); auto solution = - solver.FindSolution(legacy_ctx, legacy_problem, db, {}); // auto tune is not expected here + solver.FindSolution(ctx, legacy_problem, db, {}); // auto tune is not expected here auto& handle = ctx.GetStream(); auto invoker = handle.PrepareInvoker(*solution.invoker_factory, solution.construction_params); const auto algo = AlgorithmName{solver_id.GetAlgo(problem.GetDirection())}; @@ -198,15 +197,15 @@ static inline std::vector FindConvolution(const ExecutionContext& ctx else { results = UserFindDbRecord::TryLoad(ctx.GetStream(), problem, [&](DbRecord& record) { - auto conv_ctx = 
ConvolutionContext{ctx}; - conv_ctx.use_dynamic_solutions_only = findMode.IsDynamicHybrid(ctx); + auto ctx_copy = ctx; + ctx_copy.use_dynamic_solutions_only = findMode.IsDynamicHybrid(ctx); auto legacy_problem = ProblemDescription(problem); ConvFindCore(invoke_ctx, record, - conv_ctx, + ctx_copy, legacy_problem, - conv.IsWinograd3x3SupportedAndFast(conv_ctx, legacy_problem), + conv.IsWinograd3x3SupportedAndFast(ctx_copy, legacy_problem), GetConvSolverFinders()); }); } @@ -519,7 +518,7 @@ struct SolutionTimeComparator }; std::vector -ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, +ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& ctx, const conv::ProblemDescription& problem, const size_t maxSolutionCount) const { @@ -531,7 +530,6 @@ ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, /// \todo This is terrible. Should do away when we converge to /// single conv::ProblemDescription type. - const auto ctx = ConvolutionContext{exec_ctx}; const auto legacy_problem = ProblemDescription{problem}; const auto& inDesc = (problem.GetDirection() == conv::Direction::Forward) ? problem.GetIn() : problem.GetOut(); @@ -547,7 +545,7 @@ ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, #if MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK if(!miopen::IsDisabled(MIOPEN_DEBUG_ENABLE_AI_IMMED_MODE_FALLBACK{})) { - const static std::string arch = exec_ctx.GetStream().GetDeviceName(); + const static std::string arch = ctx.GetStream().GetDeviceName(); auto solvers = ai::immed_mode::PredictSolver(legacy_problem, ctx, arch); if(!solvers.empty()) { @@ -618,7 +616,7 @@ ConvolutionDescriptor::GetSolutionsFallback(const ExecutionContext& exec_ctx, return interim; } -std::vector GetSolutions(const ExecutionContext& exec_ctx, +std::vector GetSolutions(const ExecutionContext& ctx, const conv::ProblemDescription& problem, const size_t maxSolutionCount) { @@ -633,7 +631,7 @@ std::vector GetSolutions(const ExecutionContext& exec_ctx, break; } - const FindDbRecord fdb_record{exec_ctx.GetStream(), problem}; + const FindDbRecord fdb_record{ctx.GetStream(), problem}; if(fdb_record.empty()) return {}; @@ -641,13 +639,6 @@ std::vector GetSolutions(const ExecutionContext& exec_ctx, auto interim = std::vector{}; interim.reserve(20); // Heuristic for speed. - // Individual Solvers can be enabled/disabled by environment settings. - // Applicability is also affected by presence of external tools (e.g. assembler) - // ROCm version, specific features of GPU (like xnack) etc. - // All the above can be found by calling IsApplicable(). - // We need fully initialized context for this, see below. 
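The comment block being deleted here still describes what the surrounding loop does: records read back from the find-db may name solvers that are no longer usable on the current machine (environment switches, missing assembler, ROCm version, GPU features such as xnack), so each one is re-checked for applicability against a fully initialized context. A hedged sketch of that filtering idea; FindDbEntry and FilterApplicable are invented names, not MIOpen API.

#include <functional>
#include <string>
#include <vector>

// Illustrative only: stand-ins for find-db records, not MIOpen types.
struct FindDbEntry
{
    std::string solver_id; // solver recorded at Find() time
    float time;            // measured kernel time, milliseconds
};

// Keep only the records whose solver is still applicable on this machine.
inline std::vector<FindDbEntry>
FilterApplicable(const std::vector<FindDbEntry>& stored,
                 const std::function<bool(const std::string&)>& is_applicable_now)
{
    std::vector<FindDbEntry> usable;
    for(const auto& entry : stored)
        if(is_applicable_now(entry.solver_id))
            usable.push_back(entry);
    return usable;
}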
-    auto ctx = ConvolutionContext{exec_ctx};
-
     for(const auto& pair : fdb_record)
     {
         const auto algo = static_cast<miopenConvAlgorithm_t>(algo_resolver(pair.second.algorithm));
@@ -719,7 +710,7 @@ std::size_t ConvolutionDescriptor::GetForwardSolutionWorkspaceSize(Handle& handl
         return 0;
     const auto problem =
         conv::ProblemDescription{xDesc, wDesc, yDesc, *this, conv::Direction::Forward};
-    auto ctx = ConvolutionContext{};
+    auto ctx = ExecutionContext{};
     ctx.SetStream(&handle);
     if(sol.IsApplicable(ctx, problem))
         return sol.GetWorkspaceSize(ctx, problem);
@@ -928,7 +919,7 @@ std::size_t ConvolutionDescriptor::GetBackwardSolutionWorkspaceSize(Handle& hand
         return 0;
     const auto problem =
         conv::ProblemDescription{dyDesc, wDesc, dxDesc, *this, conv::Direction::BackwardData};
-    auto ctx = ConvolutionContext{};
+    auto ctx = ExecutionContext{};
     ctx.SetStream(&handle);
     if(sol.IsApplicable(ctx, problem))
         return sol.GetWorkspaceSize(ctx, problem);
@@ -1126,7 +1117,7 @@ std::size_t ConvolutionDescriptor::GetWrwSolutionWorkspaceSize(Handle& handle,
         return 0;
     const auto problem =
         conv::ProblemDescription{dyDesc, dwDesc, xDesc, *this, conv::Direction::BackwardWeights};
-    auto ctx = ConvolutionContext{};
+    auto ctx = ExecutionContext{};
     ctx.SetStream(&handle);
     if(sol.IsApplicable(ctx, problem))
         return sol.GetWorkspaceSize(ctx, problem);
diff --git a/src/problem.cpp b/src/problem.cpp
index 35a85074fe..afc806b3a2 100644
--- a/src/problem.cpp
+++ b/src/problem.cpp
@@ -347,7 +347,7 @@ std::vector<Solution> Problem::FindSolutionsImpl(Handle& handle,
     const auto legacy_problem = ProblemDescription{conv_problem};
     const auto netcfg         = conv_problem.BuildConfKey();
-    auto conv_ctx             = ConvolutionContext{{&handle}};
+    auto conv_ctx             = ExecutionContext{&handle};
     conv_problem.SetupFloats(conv_ctx);
 
     decltype(auto) db = GetDb(conv_ctx);
diff --git a/src/solution.cpp b/src/solution.cpp
index 5f5fa18512..97d4420cb8 100644
--- a/src/solution.cpp
+++ b/src/solution.cpp
@@ -177,7 +177,7 @@ void Solution::RunImpl(Handle& handle,
     }
 
     const auto legacy_problem = ProblemDescription{conv_problem};
-    auto conv_ctx             = ConvolutionContext{{&handle}};
+    auto conv_ctx             = ExecutionContext{&handle};
     conv_problem.SetupFloats(conv_ctx);
 
     decltype(auto) db = GetDb(conv_ctx);
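The three workspace-size getters above converge on one shape: a default-constructed ExecutionContext with only a stream attached. A condensed sketch of that shared pattern, not part of the patch (the helper name is hypothetical; the solver type is templated so no concrete API is assumed):

    // Sketch only: the immediate-mode workspace query the hunks above settle on.
    template <class Solver>
    std::size_t QueryWorkspaceSketch(miopen::Handle& handle,
                                     const Solver& sol,
                                     const miopen::conv::ProblemDescription& problem)
    {
        auto ctx = miopen::ExecutionContext{};
        ctx.SetStream(&handle); // the only setup the context needs here
        return sol.IsApplicable(ctx, problem) ? sol.GetWorkspaceSize(ctx, problem)
                                              : std::size_t{0};
    }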
diff --git a/src/solver.cpp b/src/solver.cpp
index 0b623a5178..d83935e646 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -573,7 +573,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     // IMPORTANT: New solvers should be added to the end of the function!
 }
 
-bool ThisSolverIsDeprecatedStatic::IsDisabled(const ConvolutionContext& ctx)
+bool ThisSolverIsDeprecatedStatic::IsDisabled(const ExecutionContext& ctx)
 {
     static const bool device_is_allowed = [&]() {
         if(miopen::IsEnabled(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS{}))
diff --git a/src/solver/conv_MP_bidirectional_winograd.cpp b/src/solver/conv_MP_bidirectional_winograd.cpp
index 5d638469d5..a653157f58 100644
--- a/src/solver/conv_MP_bidirectional_winograd.cpp
+++ b/src/solver/conv_MP_bidirectional_winograd.cpp
@@ -177,7 +177,7 @@ static bool IsApplicableGEMM(const ProblemDescription& problem)
 }
 
 template <int WinoDataH, int WinoDataW>
-static bool IsApplicableTransform(const ConvolutionContext& ctx, const ProblemDescription& problem)
+static bool IsApplicableTransform(const ExecutionContext& ctx, const ProblemDescription& problem)
 {
 #if MIOPEN_BACKEND_HIP
     if(!ctx.use_asm_kernels)
@@ -319,7 +319,7 @@ static bool IsApplicableTransform(const ConvolutionContext& ctx, const ProblemDe
 
 template <int WinoDataH, int WinoDataW>
 bool ConvMPBidirectWinograd<WinoDataH, WinoDataW>::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     // HIP backend required for sending ptr (buffer + offset)
     // ROCBLAS for GEMM step
@@ -359,7 +359,7 @@ bool ConvMPBidirectWinograd::IsA
 
 template <int WinoDataH, int WinoDataW>
 size_t ConvMPBidirectWinograd<WinoDataH, WinoDataW>::GetWorkspaceSize(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     const miopenDataType_t transform_data_type =
         miopen::IsEnabled(MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_EXPEREMENTAL_FP16_TRANSFORM{}) ?
@@ -378,7 +378,7 @@ size_t ConvMPBidirectWinograd::G
 }
 
 template <int WinoDataH, int WinoDataW>
-static InvokerFactory MakeWinogradInvokerFactory(const ConvolutionContext& ctx,
+static InvokerFactory MakeWinogradInvokerFactory(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  InvokerFactory xdlops_factory = InvokerFactory(),
                                                  bool isXdlops                 = false)
@@ -640,7 +640,7 @@ static InvokerFactory MakeWinogradInvokerFactory(const ConvolutionContext& ctx,
 
 template <int WinoDataH, int WinoDataW>
 ConvSolution ConvMPBidirectWinograd<WinoDataH, WinoDataW>::GetSolution(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     ConvSolution result;
     result.workspace_sz = GetWorkspaceSize(ctx, problem);
@@ -724,14 +724,14 @@ template struct ConvMPBidirectWinograd<4, 3>;
 template struct ConvMPBidirectWinograd<5, 3>;
 template struct ConvMPBidirectWinograd<6, 3>;
 
-// ConvolutionContext and ProblemDescription transformation
+// ExecutionContext and ProblemDescription transformation
 // for winograd buffers calculation using xdlops_convolution
 template <int WinoDataH, int WinoDataW>
-ConvolutionContext ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::
-    GetTransformedConvContext(const ConvolutionContext& ctx,
+ExecutionContext ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::
+    GetTransformedConvContext(const ExecutionContext& ctx,
                               const ProblemDescription& transformed_problem) const
 {
-    auto transformed_ctx = ConvolutionContext{static_cast<const ExecutionContext&>(ctx)};
+    auto transformed_ctx = ExecutionContext{static_cast<const ExecutionContext&>(ctx)};
     transformed_problem.SetupFloats(transformed_ctx);
 
     return transformed_ctx;
@@ -846,7 +846,7 @@ static conv::DataInvokeParams GetTransformedInvokeContext(const ProblemDescripti
 
 template <int WinoDataH, int WinoDataW>
 bool ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     static const int wino_data_tile = std::max(WinoDataH, WinoDataW);
@@ -878,7 +878,7 @@ bool ConvMPBidirectWinograd_xdlops
 ConvSolution ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmForwardV4R4Xdlops& config) const
 {
@@ -918,7 +918,7 @@ ConvMPBidirectWinograd_xdlops::G
 
 template <int WinoDataH, int WinoDataW>
 PerformanceImplicitGemmForwardV4R4Xdlops
 ConvMPBidirectWinograd_xdlops<WinoDataH, WinoDataW>::Search(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_1x1u.cpp b/src/solver/conv_asm_1x1u.cpp
index 4da4f6dc02..0664c32eb8 100644
--- a/src/solver/conv_asm_1x1u.cpp
+++ b/src/solver/conv_asm_1x1u.cpp
@@ -386,7 +386,7 @@ bool PerformanceConfigConvAsm1x1U::ModelApplyToken(int index,
     return this->IsPartiallyValid(problem, index + 1);
 }
 
-static bool IsModelApplicable(const ConvolutionContext& ctx, const ProblemDescription& problem)
+static bool IsModelApplicable(const ExecutionContext& ctx, const ProblemDescription& problem)
 {
     if(!miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U_AI_HEUR{}))
         return false;
@@ -415,7 +415,7 @@ static std::vector<float> TransformFeatures(const ProblemDescription& problem, s
     return features;
 }
 
-void PerformanceConfigConvAsm1x1U::RunParmeterPredictionModel(const ConvolutionContext& ctx,
+void PerformanceConfigConvAsm1x1U::RunParmeterPredictionModel(const ExecutionContext& ctx,
                                                               const ProblemDescription& problem,
                                                               bool& valid)
 {
@@ -479,7 +479,7 @@ void PerformanceConfigConvAsm1x1U::StaticHeuristic(const ProblemDescription& pro
     }
 }
 
-void PerformanceConfigConvAsm1x1U::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceConfigConvAsm1x1U::HeuristicInit(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem)
 {
     if(problem.GetInDataType() == miopenDouble)
@@ -501,7 +501,7 @@ void PerformanceConfigConvAsm1x1U::HeuristicInit(const ConvolutionContext& ctx,
 }
 
 PerformanceConfigConvAsm1x1U
-ConvAsm1x1U::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvAsm1x1U::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                          const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsm1x1U pp;
@@ -510,15 +510,14 @@ ConvAsm1x1U::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
     return pp;
 }
 
-bool ConvAsm1x1U::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvAsm1x1U::IsValidPerformanceConfig(const ExecutionContext&,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigConvAsm1x1U& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
-bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx,
-                               const ProblemDescription& problem) const
+bool ConvAsm1x1U::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1U{}))
         return false;
@@ -620,7 +619,7 @@ bool ConvAsm1x1U::IsApplicable(const ConvolutionContext& ctx,
     return ok;
 }
 
-size_t ConvAsm1x1U::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvAsm1x1U::GetWorkspaceSize(const ExecutionContext&,
                                      const ProblemDescription& problem) const
 {
     if(UseSubsample(problem) || UseUpsample(problem))
@@ -641,7 +640,7 @@ static int divide_round_plus_inf(const int x, const int y)
     return x / y;
 }
 
-ConvSolution ConvAsm1x1U::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsm1x1U::GetSolution(const ExecutionContext& ctx,
                                       const ProblemDescription& problem,
                                       const PerformanceConfigConvAsm1x1U& config) const
 {
@@ -914,7 +913,7 @@ ConvSolution ConvAsm1x1U::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsm1x1U ConvAsm1x1U::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsm1x1U ConvAsm1x1U::Search(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_1x1u_stride2.cpp b/src/solver/conv_asm_1x1u_stride2.cpp
index ba23b9d9dc..b9925ee30c 100644
--- a/src/solver/conv_asm_1x1u_stride2.cpp
+++ b/src/solver/conv_asm_1x1u_stride2.cpp
@@ -460,7 +460,7 @@ void PerformanceConfigConvAsm1x1UV2::HeuristicInit(const ProblemDescription& pro
 }
 
 PerformanceConfigConvAsm1x1UV2
-ConvAsm1x1UV2::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvAsm1x1UV2::GetDefaultPerformanceConfig(const ExecutionContext&,
                                            const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsm1x1UV2 pp;
@@ -469,14 +469,14 @@ ConvAsm1x1UV2::GetDefaultPerformanceConfig(const ConvolutionContext&,
     return pp;
 }
 
-bool ConvAsm1x1UV2::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvAsm1x1UV2::IsValidPerformanceConfig(const ExecutionContext&,
                                              const ProblemDescription& problem,
                                              const PerformanceConfigConvAsm1x1UV2& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
-bool ConvAsm1x1UV2::IsApplicable(const ConvolutionContext& ctx,
+bool ConvAsm1x1UV2::IsApplicable(const ExecutionContext& ctx,
                                  const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_1X1UV2{}))
@@ -594,7 +594,7 @@ bool ConvAsm1x1UV2::IsApplicable(const ConvolutionContext& ctx,
     return ok;
 }
 
-ConvSolution ConvAsm1x1UV2::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsm1x1UV2::GetSolution(const ExecutionContext& ctx,
                                         const ProblemDescription& problem,
                                         const PerformanceConfigConvAsm1x1UV2& config) const
 {
@@ -754,7 +754,7 @@ ConvSolution ConvAsm1x1UV2::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsm1x1UV2 ConvAsm1x1UV2::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsm1x1UV2 ConvAsm1x1UV2::Search(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem,
                                                      const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_3x3u.cpp b/src/solver/conv_asm_3x3u.cpp
index b185b959af..18f07b9630 100644
--- a/src/solver/conv_asm_3x3u.cpp
+++ b/src/solver/conv_asm_3x3u.cpp
@@ -150,7 +150,7 @@ void PerformanceConfigConvAsm3x3U::HeuristicInit(const ProblemDescription& probl
 }
 
 PerformanceConfigConvAsm3x3U
-ConvAsm3x3U::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvAsm3x3U::GetDefaultPerformanceConfig(const ExecutionContext&,
                                          const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsm3x3U pp;
@@ -159,15 +159,14 @@ ConvAsm3x3U::GetDefaultPerformanceConfig(const ConvolutionContext&,
     return pp;
 }
 
-bool ConvAsm3x3U::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvAsm3x3U::IsValidPerformanceConfig(const ExecutionContext&,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigConvAsm3x3U& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
-bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx,
-                               const ProblemDescription& problem) const
+bool ConvAsm3x3U::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_3X3U{}))
         return false;
@@ -242,7 +241,7 @@ bool ConvAsm3x3U::IsApplicable(const ConvolutionContext& ctx,
     // clang-format on
 }
 
-ConvSolution ConvAsm3x3U::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsm3x3U::GetSolution(const ExecutionContext& ctx,
                                       const ProblemDescription& problem,
                                       const PerformanceConfigConvAsm3x3U& config) const
 {
@@ -321,7 +320,7 @@ ConvSolution ConvAsm3x3U::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsm3x3U ConvAsm3x3U::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsm3x3U ConvAsm3x3U::Search(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_dir_BwdWrW1x1.cpp b/src/solver/conv_asm_dir_BwdWrW1x1.cpp
index b6b2458157..0abe71326f 100644
--- a/src/solver/conv_asm_dir_BwdWrW1x1.cpp
+++ b/src/solver/conv_asm_dir_BwdWrW1x1.cpp
@@ -307,7 +307,7 @@ bool PerformanceConfigConvAsmBwdWrW1x1::IsValidValue() const
         && IsFromPack<0, 1, 2, 3, 4>(data_prefetch);
     // clang-format on
 }
 
-bool PerformanceConfigConvAsmBwdWrW1x1::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConfigConvAsmBwdWrW1x1::IsValid(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem) const
 {
@@ -362,7 +362,7 @@ bool PerformanceConfigConvAsmBwdWrW1x1::IsValid(const ConvolutionContext& ctx,
     return true;
 }
 
-void PerformanceConfigConvAsmBwdWrW1x1::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceConfigConvAsmBwdWrW1x1::HeuristicInit(const ExecutionContext& ctx,
                                                       const ProblemDescription& problem)
 {
     short_store =
@@ -449,7 +449,7 @@ void PerformanceConfigConvAsmBwdWrW1x1::HeuristicInit(const ConvolutionContext&
 }
 
 PerformanceConfigConvAsmBwdWrW1x1
-ConvAsmBwdWrW1x1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvAsmBwdWrW1x1::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                               const ProblemDescription& problem) const
 {
     PerformanceConfigConvAsmBwdWrW1x1 pp;
@@ -459,14 +459,14 @@ ConvAsmBwdWrW1x1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
 }
 
 bool ConvAsmBwdWrW1x1::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigConvAsmBwdWrW1x1& config) const
 {
     return config.IsValidValue() && config.IsValid(ctx, problem);
 }
 
-bool ConvAsmBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvAsmBwdWrW1x1::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW1X1{}))
@@ -549,7 +549,7 @@ static int divide_round_plus_inf(const int x, const int y)
     return x / y;
 }
 
-size_t ConvAsmBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvAsmBwdWrW1x1::GetWorkspaceSize(const ExecutionContext&,
                                           const ProblemDescription& problem) const
 {
     if(UseSubsample(problem))
@@ -563,7 +563,7 @@ size_t ConvAsmBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
     return 0;
 }
 
-ConvSolution ConvAsmBwdWrW1x1::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsmBwdWrW1x1::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigConvAsmBwdWrW1x1& config) const
 {
@@ -851,7 +851,7 @@ ConvSolution ConvAsmBwdWrW1x1::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigConvAsmBwdWrW1x1 ConvAsmBwdWrW1x1::Search(const ConvolutionContext& ctx,
+PerformanceConfigConvAsmBwdWrW1x1 ConvAsmBwdWrW1x1::Search(const ExecutionContext& ctx,
                                                            const ProblemDescription& problem,
                                                            const AnyInvokeParams& invoke_ctx) const
 {
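Every assembly solver in this run of files is edited the same way, so the surface they all converge on is worth stating once. A sketch, not part of the patch, with a hypothetical solver name and PerfCfg as a placeholder for the solver's performance-config type:

    // Sketch only: the ExecutionContext-based interface shape after the rename.
    struct ConvExampleSolverSketch // hypothetical
    {
        bool IsApplicable(const miopen::ExecutionContext& ctx,
                          const miopen::ProblemDescription& problem) const;
        size_t GetWorkspaceSize(const miopen::ExecutionContext& ctx,
                                const miopen::ProblemDescription& problem) const;
        PerfCfg GetDefaultPerformanceConfig(const miopen::ExecutionContext& ctx,
                                            const miopen::ProblemDescription& problem) const;
        bool IsValidPerformanceConfig(const miopen::ExecutionContext& ctx,
                                      const miopen::ProblemDescription& problem,
                                      const PerfCfg& config) const;
        PerfCfg Search(const miopen::ExecutionContext& ctx,
                       const miopen::ProblemDescription& problem,
                       const miopen::AnyInvokeParams& invoke_ctx) const;
        miopen::solver::ConvSolution GetSolution(const miopen::ExecutionContext& ctx,
                                                 const miopen::ProblemDescription& problem,
                                                 const PerfCfg& config) const;
    };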
diff --git a/src/solver/conv_asm_dir_BwdWrW3x3.cpp b/src/solver/conv_asm_dir_BwdWrW3x3.cpp
index 7b0b0567d8..ae58cfcd9b 100644
--- a/src/solver/conv_asm_dir_BwdWrW3x3.cpp
+++ b/src/solver/conv_asm_dir_BwdWrW3x3.cpp
@@ -143,7 +143,7 @@ static bool IsReverseInOutAllowed(const ProblemDescription& problem)
 
 inline int elements_in_dword(const ProblemDescription& problem) { return problem.IsFp16() ? 2 : 1; }
 
-bool PerformanceConfigAsmDirect3x3WrW::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConfigAsmDirect3x3WrW::IsValid(const ExecutionContext& ctx,
                                                const ProblemDescription& problem) const
 {
     if(!IsValidValue())
@@ -250,7 +250,7 @@ bool PerformanceConfigAsmDirect3x3WrW::IsValid(const ConvolutionContext& ctx,
     return true;
 }
 
-void PerformanceConfigAsmDirect3x3WrW::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceConfigAsmDirect3x3WrW::HeuristicInit(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem)
 {
     limit_wave_cnt = 0;
@@ -332,7 +332,7 @@ void PerformanceConfigAsmDirect3x3WrW::HeuristicInit(const ConvolutionContext& c
 }
 
 PerformanceConfigAsmDirect3x3WrW
-ConvAsmBwdWrW3x3::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvAsmBwdWrW3x3::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                               const ProblemDescription& problem) const
 {
     PerformanceConfigAsmDirect3x3WrW pp;
@@ -342,14 +342,14 @@ ConvAsmBwdWrW3x3::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
 }
 
 bool ConvAsmBwdWrW3x3::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmDirect3x3WrW& config) const
 {
     return config.IsValidValue() && config.IsValid(ctx, problem);
 }
 
-bool ConvAsmBwdWrW3x3::IsApplicable(const ConvolutionContext& ctx,
+bool ConvAsmBwdWrW3x3::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_ASM_WRW3X3{}))
@@ -445,7 +445,7 @@ bool ConvAsmBwdWrW3x3::IsApplicable(const ConvolutionContext& ctx,
     return ok;
 }
 
-ConvSolution ConvAsmBwdWrW3x3::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvAsmBwdWrW3x3::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConfigAsmDirect3x3WrW& config) const
 {
@@ -562,7 +562,7 @@ ConvSolution ConvAsmBwdWrW3x3::GetSolution(const ConvolutionContext& ctx,
     return result;
 }
 
-PerformanceConfigAsmDirect3x3WrW ConvAsmBwdWrW3x3::Search(const ConvolutionContext& ctx,
+PerformanceConfigAsmDirect3x3WrW ConvAsmBwdWrW3x3::Search(const ExecutionContext& ctx,
                                                           const ProblemDescription& problem,
                                                           const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
index 71c53f61a9..9cfdd8aeea 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
@@ -461,7 +461,7 @@ GetImplicitGemmGtcDynamicBwdXdlopsNHWCKernel(
 }
 
 void PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::HeuristicInit(
-    const ConvolutionContext&, const ProblemDescription& problem)
+    const ExecutionContext&, const ProblemDescription& problem)
 {
     static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
         std::make_tuple(128, 128, 16),
@@ -887,7 +887,7 @@ bool PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC
 ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC pp;
     pp.HeuristicInit(ctx, problem);
@@ -895,7 +895,7 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetDefaultPerformanceConfig(
     return pp;
 }
 
 bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) const
 {
@@ -903,7 +903,7 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsValidPerformanceConfig(
 }
 
 PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC
-ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -911,7 +911,7 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::Search(const ConvolutionContext& ctx
 }
 
 bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_BWD_GTC_XDLOPS_NHWC{}))
         return false;
@@ -977,7 +977,7 @@ bool ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::IsApplicable(
 }
 
 size_t ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     const int hi = problem.GetOutHeight_();
     const int wi = problem.GetOutWidth_();
@@ -1031,7 +1031,7 @@ size_t ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC& config) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp
index bbedf8d680..b16258235e 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nchwc.cpp
@@ -257,7 +257,7 @@ static std::tuple
            // splits_4G
 GetImplicitGemmGtcDynamicFwdDlopsNCHWCKernel(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config)
 {
@@ -518,7 +518,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC
 ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC pp;
     pp.HeuristicInit(problem);
@@ -527,14 +527,14 @@ ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::GetDefaultPerformanceConfig(
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
 PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC
-ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -542,7 +542,7 @@ ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::Search(const ConvolutionContext& ctx
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_DLOPS_NCHWC{}))
         return false;
@@ -591,7 +591,7 @@ bool ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::IsApplicable(
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC& config) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
index e315fd0895..4ab9ce1c37 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
@@ -360,7 +360,7 @@ GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(
 }
 
 void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(
-    const ConvolutionContext&, const ProblemDescription& problem)
+    const ExecutionContext&, const ProblemDescription& problem)
 {
     static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
         std::make_tuple(128, 128, 16),
@@ -768,7 +768,7 @@ bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
 ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC pp;
     pp.HeuristicInit(ctx, problem);
@@ -777,7 +777,7 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetDefaultPerformanceConfig(
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) const
 {
@@ -785,7 +785,7 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsValidPerformanceConfig(
 }
 
 PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
-ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -793,7 +793,7 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ConvolutionContext& ctx
 }
 
 size_t ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     const int hi = problem.GetInHeight_();
     const int wi = problem.GetInWidth_();
@@ -849,7 +849,7 @@ size_t ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(
 }
 
 bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC{}))
         return false;
@@ -916,7 +916,7 @@ bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(
     return true;
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) const
 {
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp
index d7395bb0e9..8560c65052 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_perf_config.cpp
@@ -256,7 +256,7 @@ std::string PerformanceConfigAsmImplicitGemmGTC::ToString() const
     return ss.str();
 }
 
-std::string PerformanceConfigAsmImplicitGemmGTC::ToKernelName(const ConvolutionContext& ctx) const
+std::string PerformanceConfigAsmImplicitGemmGTC::ToKernelName(const ExecutionContext& ctx) const
 {
     std::ostringstream kernel_name;
     const auto device_name = ctx.GetStream().GetDeviceName();
@@ -487,7 +487,7 @@ std::string PerformanceConfigAsmImplicitGemmGTCvector::ToString() const
 }
 
 std::string
-PerformanceConfigAsmImplicitGemmGTCvector::ToKernelName(const ConvolutionContext& ctx) const
+PerformanceConfigAsmImplicitGemmGTCvector::ToKernelName(const ExecutionContext& ctx) const
 {
     std::ostringstream kernel_name;
     const auto device_name = ctx.GetStream().GetDeviceName();
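Both ToKernelName() overloads above need only one thing from the context: the stream's device name, which ExecutionContext still provides. A sketch of that pattern, not part of the patch (the helper name and the name prefix are illustrative placeholders):

    #include <sstream>
    #include <string>

    // Sketch only: device-name-based kernel naming via ExecutionContext.
    std::string ToKernelNameSketch(const miopen::ExecutionContext& ctx)
    {
        std::ostringstream kernel_name;
        const auto device_name = ctx.GetStream().GetDeviceName();
        kernel_name << "igemm_gtc_" << device_name << "_tile_suffix"; // placeholder suffix
        return kernel_name.str();
    }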
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
index c8dee39a79..8ac238395a 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
@@ -451,7 +451,7 @@ void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::SetParamsForKSplit(
 }
 
 void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit(
-    const ConvolutionContext& ctx, const ProblemDescription& problem)
+    const ExecutionContext& ctx, const ProblemDescription& problem)
 {
     static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
         std::make_tuple(128, 128, 16),
@@ -816,7 +816,7 @@ bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValid(
 
 PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
 ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC pp;
     pp.HeuristicInit(ctx, problem);
@@ -824,14 +824,14 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetDefaultPerformanceConfig(
     return pp;
 }
 
 bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) const
 {
     return config.IsValidValue() && config.IsValid(problem);
 }
 
 PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
-ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ConvolutionContext& ctx,
+ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem,
                                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -839,7 +839,7 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ConvolutionContext& ctx
 }
 
 bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC{}))
         return false;
@@ -956,7 +956,7 @@ ComputeDynamicIGemmWrwKernelArgsNHWC(const conv::ProblemDescription& problem,
 }
 
 size_t ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     const int hi = problem.GetOutHeight_();
     const int wi = problem.GetOutWidth_();
@@ -1010,7 +1010,7 @@ size_t ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(
 }
 
 ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) const
 {
diff --git a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp
index 7ddf2e3049..001f3a8cb7 100644
--- a/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp
+++ b/src/solver/conv_ck_igemm_fwd_v6r1_dlops_nchw.cpp
@@ -82,7 +82,7 @@ bool PerformanceConvCkIgemmFwdV6r1DlopsNchw::IsValid(const ProblemDescription& p
         ck_utility::get_ck_convolution_problem_descriptor(problem), compile_param);
 }
 
-bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx,
+bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ExecutionContext& ctx,
                                                const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_411729
@@ -121,7 +121,7 @@ bool ConvCkIgemmFwdV6r1DlopsNchw::IsApplicable(const ConvolutionContext& ctx,
 }
 
 PerformanceConvCkIgemmFwdV6r1DlopsNchw
-ConvCkIgemmFwdV6r1DlopsNchw::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvCkIgemmFwdV6r1DlopsNchw::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem) const
 {
     for(int i = 0; i < ck::driver::ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetTunableList().size(); ++i)
@@ -138,7 +138,7 @@ ConvCkIgemmFwdV6r1DlopsNchw::GetDefaultPerformanceConfig(const ConvolutionContex
 }
 
 bool ConvCkIgemmFwdV6r1DlopsNchw::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConvCkIgemmFwdV6r1DlopsNchw& config) const
 {
@@ -146,7 +146,7 @@ bool ConvCkIgemmFwdV6r1DlopsNchw::IsValidPerformanceConfig(
 }
 
 ConvSolution
-ConvCkIgemmFwdV6r1DlopsNchw::GetSolution(const ConvolutionContext& ctx,
+ConvCkIgemmFwdV6r1DlopsNchw::GetSolution(const ExecutionContext& ctx,
                                          const ProblemDescription& problem,
                                          const PerformanceConvCkIgemmFwdV6r1DlopsNchw& config) const
 {
@@ -252,7 +252,7 @@ ConvCkIgemmFwdV6r1DlopsNchw::GetSolution(const ConvolutionContext& ctx,
     return sol;
 }
 
-std::size_t ConvCkIgemmFwdV6r1DlopsNchw::GetWorkspaceSize(const ConvolutionContext&,
+std::size_t ConvCkIgemmFwdV6r1DlopsNchw::GetWorkspaceSize(const ExecutionContext&,
                                                           const ProblemDescription& problem) const
 {
     return ck::driver::ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetMaxWorkSpaceSize(
@@ -260,7 +260,7 @@ std::size_t ConvCkIgemmFwdV6r1DlopsNchw::GetWorkspaceSize(const ConvolutionConte
 }
 
 PerformanceConvCkIgemmFwdV6r1DlopsNchw
-ConvCkIgemmFwdV6r1DlopsNchw::Search(const ConvolutionContext& ctx,
+ConvCkIgemmFwdV6r1DlopsNchw::Search(const ExecutionContext& ctx,
                                     const ProblemDescription& problem,
                                     const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp
index 64c95257e6..4df8df5874 100644
--- a/src/solver/conv_direct_naive_conv.cpp
+++ b/src/solver/conv_direct_naive_conv.cpp
@@ -176,7 +176,7 @@ std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem)
     return kernel_name.str();
 }
 
-std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx,
+std::string ConvDirectNaiveConvKernelFile(const ExecutionContext& ctx,
                                           const ProblemDescription& problem)
 {
     const auto device_name = ctx.GetStream().GetDeviceName();
@@ -193,7 +193,7 @@ std::string ConvDirectNaiveConvKernelFile(const ConvolutionContext& ctx,
     return "naive_conv.cpp";
 }
 
-std::string ConvDirectNaiveConvCompileOption(const ConvolutionContext& ctx,
+std::string ConvDirectNaiveConvCompileOption(const ExecutionContext& ctx,
                                              const ProblemDescription& problem)
 {
     std::string filename = ConvDirectNaiveConvKernelFile(ctx, problem);
diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp
index c5d793860c..f8af0ec2d1 100644
--- a/src/solver/conv_direct_naive_conv_bwd.cpp
+++ b/src/solver/conv_direct_naive_conv_bwd.cpp
@@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_BWD)
 namespace miopen {
 namespace solver {
 
-bool ConvDirectNaiveConvBwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvDirectNaiveConvBwd::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     if(!miopen::debug::AlwaysEnableConvDirectNaive &&
@@ -74,7 +74,7 @@ bool ConvDirectNaiveConvBwd::IsApplicable(const ConvolutionContext& ctx,
     return true;
 }
 
-ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp
index fc8d8e77fd..90d8feee31 100644
--- a/src/solver/conv_direct_naive_conv_fwd.cpp
+++ b/src/solver/conv_direct_naive_conv_fwd.cpp
@@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD)
 namespace miopen {
 namespace solver {
 
-bool ConvDirectNaiveConvFwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvDirectNaiveConvFwd::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     if(!miopen::debug::AlwaysEnableConvDirectNaive &&
@@ -74,7 +74,7 @@ bool ConvDirectNaiveConvFwd::IsApplicable(const ConvolutionContext& ctx,
     return true;
 }
 
-ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp
index 2c85949ad4..6fcf2f71d0 100644
--- a/src/solver/conv_direct_naive_conv_wrw.cpp
+++ b/src/solver/conv_direct_naive_conv_wrw.cpp
@@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_WRW)
 namespace miopen {
 namespace solver {
 
-bool ConvDirectNaiveConvWrw::IsApplicable(const ConvolutionContext& ctx,
+bool ConvDirectNaiveConvWrw::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     if(!miopen::debug::AlwaysEnableConvDirectNaive &&
@@ -74,7 +74,7 @@ bool ConvDirectNaiveConvWrw::IsApplicable(const ConvolutionContext& ctx,
     return true;
 }
 
-ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
index 0b880b2fc8..2602c54320 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
@@ -271,7 +271,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::operator==(
 
 PerformanceConfigHipImplicitGemm3DGroupBwdXdlops
 ConvHipImplicitGemm3DGroupBwdXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemm3DGroupBwdXdlops pp;
     pp.HeuristicInit(problem);
@@ -279,7 +279,7 @@ ConvHipImplicitGemm3DGroupBwdXdlops::GetDefaultPerformanceConfig(
 }
 
 bool ConvHipImplicitGemm3DGroupBwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops& config) const
 {
@@ -287,7 +287,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemm3DGroupBwdXdlops
-ConvHipImplicitGemm3DGroupBwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemm3DGroupBwdXdlops::Search(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const AnyInvokeParams& invoke_ctx) const
 {
@@ -295,7 +295,7 @@ ConvHipImplicitGemm3DGroupBwdXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -335,7 +335,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
index 80b969fcbd..f0623c642d 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
@@ -271,7 +271,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::operator==(
 
 PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
 ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemm3DGroupFwdXdlops pp;
     pp.HeuristicInit(problem);
@@ -279,7 +279,7 @@ ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig(
 }
 
 bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const
 {
@@ -287,7 +287,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
-ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const AnyInvokeParams& invoke_ctx) const
 {
@@ -295,7 +295,7 @@ ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -333,7 +333,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
index 7292828f69..6fce8a80b8 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
@@ -267,7 +267,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::operator==(
 
 PerformanceConfigHipImplicitGemm3DGroupWrwXdlops
 ConvHipImplicitGemm3DGroupWrwXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemm3DGroupWrwXdlops pp;
     pp.HeuristicInit(problem);
@@ -275,7 +275,7 @@ ConvHipImplicitGemm3DGroupWrwXdlops::GetDefaultPerformanceConfig(
 }
 
 bool ConvHipImplicitGemm3DGroupWrwXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops& config) const
 {
@@ -283,7 +283,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemm3DGroupWrwXdlops
-ConvHipImplicitGemm3DGroupWrwXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemm3DGroupWrwXdlops::Search(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const AnyInvokeParams& invoke_ctx) const
 {
@@ -291,7 +291,7 @@ ConvHipImplicitGemm3DGroupWrwXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -331,7 +331,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops& config) const
 {
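The three CK-backed grouped solvers above all gate their bodies behind the same compile-time guard, keeping parameters named (hence [[maybe_unused]]) so non-HIP builds still compile. A sketch of the pattern, not part of the patch (CheckCKApplicability is a hypothetical stand-in for the real per-solver checks):

    // Sketch only: the guard pattern shared by the CK-backed solvers above.
    bool IsApplicableSketch([[maybe_unused]] const miopen::ExecutionContext& ctx,
                            [[maybe_unused]] const miopen::ProblemDescription& problem)
    {
    #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
        return CheckCKApplicability(problem); // hypothetical helper
    #else
        return false; // CK path compiled out; parameters stay named above
    #endif
    }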
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
index 7e380f6289..57c63267d2 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
@@ -229,7 +229,7 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::operator==(
 }
 
 PerformanceConfigHipImplicitGemmBwdXdlops
-ConvHipImplicitGemmBwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvHipImplicitGemmBwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                           const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemmBwdXdlops pp;
@@ -238,7 +238,7 @@ ConvHipImplicitGemmBwdXdlops::GetDefaultPerformanceConfig(const ConvolutionConte
 }
 
 bool ConvHipImplicitGemmBwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemmBwdXdlops& config) const
 {
@@ -246,7 +246,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsValidPerformanceConfig(
 }
 
 PerformanceConfigHipImplicitGemmBwdXdlops
-ConvHipImplicitGemmBwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdXdlops::Search(const ExecutionContext& ctx,
                                      const ProblemDescription& problem,
                                      const AnyInvokeParams& invoke_ctx) const
 {
@@ -254,7 +254,7 @@ ConvHipImplicitGemmBwdXdlops::Search(const ConvolutionContext& ctx,
 }
 
 bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -303,7 +303,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
 }
 
 ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
    [[maybe_unused]] const ProblemDescription& problem,
    [[maybe_unused]] const PerformanceConfigHipImplicitGemmBwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp
index c389cb0cee..b2b591b859 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1.cpp
@@ -86,7 +86,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::operator==(
 }
 
 std::tuple<int, bool>
-PerformanceImplicitGemmBwdDataV1R1::CalculateGridSize(const ConvolutionContext& ctx,
+PerformanceImplicitGemmBwdDataV1R1::CalculateGridSize(const ExecutionContext& ctx,
                                                       const ProblemDescription& problem) const
 {
     int GridSize = 0;
@@ -180,7 +180,7 @@ PerformanceImplicitGemmBwdDataV1R1::CalculateBlockGemmPerformanceParameters() co
 
 std::tuple<int, int, int, int, bool>
 PerformanceImplicitGemmBwdDataV1R1::CalculateGemmABlockCopyPerformanceParameters(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int ClusterLengths_GemmK = 0;
     int ClusterLengths_GemmM = 0;
@@ -245,7 +245,7 @@ PerformanceImplicitGemmBwdDataV1R1::CalculateGemmABlockCopyPerformanceParameters
 
 std::tuple<int, int, int, int, bool>
 PerformanceImplicitGemmBwdDataV1R1::CalculateGemmBBlockCopyPerformanceParameters(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int ClusterLengths_GemmK = 0;
     int ClusterLengths_GemmN = 0;
@@ -392,7 +392,7 @@ PerformanceImplicitGemmBwdDataV1R1::CalculateGemmCThreadCopyPerformanceParameter
 }
 
 std::tuple<std::size_t, bool> PerformanceImplicitGemmBwdDataV1R1::CalculateLdsNumberOfByte(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     std::size_t lds_size = 0;
@@ -450,7 +450,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::IsValidValue() const
     // clang-format on
 }
 
-bool PerformanceImplicitGemmBwdDataV1R1::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmBwdDataV1R1::IsValid(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem) const
 {
     if(!IsValidValue())
@@ -506,7 +506,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::IsValid(const ConvolutionContext& ctx,
     return (valid and lds_size <= get_lds_max_number_of_byte());
 }
 
-void PerformanceImplicitGemmBwdDataV1R1::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmBwdDataV1R1::HeuristicInit(const ExecutionContext& ctx,
                                                        const ProblemDescription& problem)
 {
     PerformanceImplicitGemmBwdDataV1R1 config;
@@ -587,7 +587,7 @@ bool PerformanceImplicitGemmBwdDataV1R1::SetNextValue(const ProblemDescription&)
 }
 
 std::tuple<int, int, int>
-ConvHipImplicitGemmBwdDataV1R1::CalculateGemmSize(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::CalculateGemmSize(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem)
 {
     const auto n = ProblemInterpreter::GetBatchN(problem);
@@ -607,7 +607,7 @@ ConvHipImplicitGemmBwdDataV1R1::CalculateGemmSize(const ConvolutionContext& ctx,
     return std::make_tuple(gemm_m, gemm_n, gemm_k);
 }
 
-size_t ConvHipImplicitGemmBwdDataV1R1::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvHipImplicitGemmBwdDataV1R1::GetWorkspaceSize(const ExecutionContext&,
                                                         const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
@@ -627,7 +627,7 @@ size_t ConvHipImplicitGemmBwdDataV1R1::GetWorkspaceSize(const ConvolutionContext
     }
 }
 
-bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_BWD_V1R1{}))
@@ -676,14 +676,14 @@ bool ConvHipImplicitGemmBwdDataV1R1::IsApplicable(const ConvolutionContext& ctx,
 }
 
 PerformanceImplicitGemmBwdDataV1R1
-ConvHipImplicitGemmBwdDataV1R1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                             const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase<PerformanceImplicitGemmBwdDataV1R1>(ctx, problem);
 }
 
 bool ConvHipImplicitGemmBwdDataV1R1::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdDataV1R1& config) const
 {
@@ -692,7 +692,7 @@ bool ConvHipImplicitGemmBwdDataV1R1::IsValidPerformanceConfig(
 }
 
 PerformanceImplicitGemmBwdDataV1R1
-ConvHipImplicitGemmBwdDataV1R1::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::Search(const ExecutionContext& ctx,
                                        const ProblemDescription& problem,
                                        const AnyInvokeParams& invoke_ctx) const
 {
@@ -700,7 +700,7 @@ ConvHipImplicitGemmBwdDataV1R1::Search(const ConvolutionContext& ctx,
 }
 
 ConvSolution
-ConvHipImplicitGemmBwdDataV1R1::GetSolution(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1::GetSolution(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const PerformanceImplicitGemmBwdDataV1R1& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp
index 5b533d72ee..f657fa74fe 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops.cpp
@@ -105,7 +105,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::SetNextValue(const ProblemDescription
     return true;
 }
 
-void PerformanceImplicitGemmBwdV1R1Xdlops::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmBwdV1R1Xdlops::HeuristicInit(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem)
 {
     PerformanceImplicitGemmBwdV1R1Xdlops tmp;
@@ -527,7 +527,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::IsReallyValid(const ProblemDescriptio
 
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmBwdV1R1Xdlops::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     // somehow, 128x128 wave-wise GEMM tend to spill register
     // TODO revisit this when 128x128 wave-wise GEMM become efficient
@@ -659,7 +659,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::IsFastToBeUsedForTuning(
 // Return false, if you don't want to this to be included in tuning range used by generic search
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
-bool PerformanceImplicitGemmBwdV1R1Xdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmBwdV1R1Xdlops::IsValid(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem) const
 {
     return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem);
@@ -667,7 +667,7 @@ bool PerformanceImplicitGemmBwdV1R1Xdlops::IsValid(const ConvolutionContext& ctx
 
 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdV1R1Xdlops& config) const
 {
@@ -699,7 +699,7 @@ ConvHipImplicitGemmBwdDataV1R1Xdlops::CalculateGemmSize(const ProblemDescription
 
 PerformanceImplicitGemmBwdV1R1Xdlops
 ConvHipImplicitGemmBwdDataV1R1Xdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase<PerformanceImplicitGemmBwdV1R1Xdlops>(ctx, problem);
 }
@@ -717,7 +717,7 @@ std::tuple PerformanceImplicitGemmBwdV1R1Xdlops::CalculateLds
 }
 
 std::size_t
-ConvHipImplicitGemmBwdDataV1R1Xdlops::GetWorkspaceSize(const ConvolutionContext&,
+ConvHipImplicitGemmBwdDataV1R1Xdlops::GetWorkspaceSize(const ExecutionContext&,
                                                        const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
@@ -750,7 +750,7 @@ ConvHipImplicitGemmBwdDataV1R1Xdlops::GetWorkspaceSize(const ConvolutionContext&
     }
 }
 
-bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_251757
@@ -809,7 +809,7 @@ bool ConvHipImplicitGemmBwdDataV1R1Xdlops::IsApplicable(const ConvolutionContext
 }
 
 PerformanceImplicitGemmBwdV1R1Xdlops
-ConvHipImplicitGemmBwdDataV1R1Xdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV1R1Xdlops::Search(const ExecutionContext& ctx,
                                              const ProblemDescription& problem,
                                              const AnyInvokeParams& invoke_ctx) const
 {
@@ -817,7 +817,7 @@ ConvHipImplicitGemmBwdDataV1R1Xdlops::Search(const ConvolutionContext& ctx,
 }
 
 ConvSolution ConvHipImplicitGemmBwdDataV1R1Xdlops::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdV1R1Xdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp
index 65f8cf6675..e60d6c76a3 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_v4r1.cpp
@@ -523,7 +523,7 @@ bool PerformanceImplicitGemmBwdDataV4R1::IsValid(const ProblemDescription& probl
     return (valid and lds_size <= get_lds_max_number_of_byte());
 }
 
-void PerformanceImplicitGemmBwdDataV4R1::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmBwdDataV4R1::HeuristicInit(const ExecutionContext& ctx,
                                                        const ProblemDescription& problem)
 {
     std::ignore = ctx;
@@ -724,7 +724,7 @@ ConvHipImplicitGemmBwdDataV4R1::CalculateGemmSize(const ProblemDescription& prob
     }
 }
 
-bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_229277_227616_229195
@@ -787,14 +787,14 @@ bool ConvHipImplicitGemmBwdDataV4R1::IsApplicable(const ConvolutionContext& ctx,
 }
 
 PerformanceImplicitGemmBwdDataV4R1
-ConvHipImplicitGemmBwdDataV4R1::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV4R1::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                             const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase<PerformanceImplicitGemmBwdDataV4R1>(ctx, problem);
 }
 
 bool ConvHipImplicitGemmBwdDataV4R1::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmBwdDataV4R1& config) const
 {
@@ -803,7 +803,7 @@ bool ConvHipImplicitGemmBwdDataV4R1::IsValidPerformanceConfig(
 }
 
 PerformanceImplicitGemmBwdDataV4R1
-ConvHipImplicitGemmBwdDataV4R1::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV4R1::Search(const ExecutionContext& ctx,
                                        const ProblemDescription& problem,
                                        const AnyInvokeParams& invoke_ctx) const
 {
@@ -811,7 +811,7 @@ ConvHipImplicitGemmBwdDataV4R1::Search(const ConvolutionContext& ctx,
 }
 
 ConvSolution
-ConvHipImplicitGemmBwdDataV4R1::GetSolution(const ConvolutionContext& ctx,
+ConvHipImplicitGemmBwdDataV4R1::GetSolution(const ExecutionContext& ctx,
                                             const ProblemDescription& problem,
                                             const PerformanceImplicitGemmBwdDataV4R1& config) const
 {
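GetDefaultPerformanceConfig, IsValidPerformanceConfig, and Search recur in every tunable solver above, so a sketch of how a caller might chain them is useful. This is a simplification under stated assumptions, not the patch's actual find path, which lives in the generic find machinery; the helper name and locals are hypothetical:

    // Sketch only: one plausible way the three tuning entry points relate.
    template <class Solver>
    auto PickConfigSketch(const Solver& s,
                          const miopen::ExecutionContext& ctx,
                          const miopen::ProblemDescription& problem,
                          const miopen::AnyInvokeParams& invoke_ctx,
                          bool do_search)
    {
        // Exhaustive tuning goes through Search() (GenericSearch underneath);
        // otherwise the heuristic default is used after a validity check.
        auto cfg = do_search ? s.Search(ctx, problem, invoke_ctx)
                             : s.GetDefaultPerformanceConfig(ctx, problem);
        if(!s.IsValidPerformanceConfig(ctx, problem, cfg))
            cfg = s.GetDefaultPerformanceConfig(ctx, problem); // heuristic fallback
        return cfg;
    }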
const + const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmBwdDataV4R1Xdlops& config) const { @@ -884,7 +884,7 @@ bool ConvHipImplicitGemmBwdDataV4R1Xdlops::IsValidPerformanceConfig( } PerformanceImplicitGemmBwdDataV4R1Xdlops -ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -892,7 +892,7 @@ ConvHipImplicitGemmBwdDataV4R1Xdlops::Search(const ConvolutionContext& ctx, } ConvSolution ConvHipImplicitGemmBwdDataV4R1Xdlops::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmBwdDataV4R1Xdlops& config) const { diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp index 773f5a1d32..39e8c71c16 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r1.cpp @@ -40,7 +40,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1) namespace miopen { namespace solver { -bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R1{})) @@ -86,7 +86,7 @@ bool ConvHipImplicitGemmV4R1Fwd::IsApplicable(const ConvolutionContext& ctx, (c * y * x) % eMultiple == 0 && k % 16 == 0; } -bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R1{})) @@ -143,21 +143,21 @@ bool ConvHipImplicitGemmV4R1WrW::IsApplicable(const ConvolutionContext& ctx, } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1Fwd::GetDefaultPerformanceConfig(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1Fwd::GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1WrW::GetDefaultPerformanceConfig(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1WrW::GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } bool ConvHipImplicitGemmV4R1Fwd::IsValidPerformanceConfig( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { @@ -166,7 +166,7 @@ bool ConvHipImplicitGemmV4R1Fwd::IsValidPerformanceConfig( } bool ConvHipImplicitGemmV4R1WrW::IsValidPerformanceConfig( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { @@ -175,14 +175,14 @@ bool ConvHipImplicitGemmV4R1WrW::IsValidPerformanceConfig( } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1Fwd::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1Fwd::Search(const ExecutionContext& ctx, const ProblemDescription& 
problem, const AnyInvokeParams& invoke_ctx) const { return GenericSearch(*this, ctx, problem, invoke_ctx); } PerformanceImplicitGemmV4R1 -ConvHipImplicitGemmV4R1WrW::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1WrW::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -190,7 +190,7 @@ ConvHipImplicitGemmV4R1WrW::Search(const ConvolutionContext& ctx, } ConvSolution -ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1Fwd::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { @@ -393,7 +393,7 @@ ConvHipImplicitGemmV4R1Fwd::GetSolution(const ConvolutionContext& ctx, } ConvSolution -ConvHipImplicitGemmV4R1WrW::GetSolution(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R1WrW::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R1& config) const { diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp index 07fbae436e..9cbe662180 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4.cpp @@ -471,7 +471,7 @@ bool PerformanceImplicitGemmV4R4Fwd::IsValid(const ProblemDescription& problem) return (valid and lds_size <= get_lds_max_number_of_byte()); } -void PerformanceImplicitGemmV4R4Fwd::HeuristicInit(const ConvolutionContext& ctx, +void PerformanceImplicitGemmV4R4Fwd::HeuristicInit(const ExecutionContext& ctx, const ProblemDescription& problem) { std::ignore = ctx; @@ -572,7 +572,7 @@ ConvHipImplicitGemmV4R4Fwd::CalculateGemmSize(const ProblemDescription& problem) return std::make_tuple(gemm_m, gemm_n, gemm_k); } -bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4{})) @@ -610,14 +610,14 @@ bool ConvHipImplicitGemmV4R4Fwd::IsApplicable(const ConvolutionContext& ctx, } PerformanceImplicitGemmV4R4Fwd -ConvHipImplicitGemmV4R4Fwd::GetDefaultPerformanceConfig(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R4Fwd::GetDefaultPerformanceConfig(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetPerformanceConfigBase(ctx, problem); } bool ConvHipImplicitGemmV4R4Fwd::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmV4R4Fwd& config) const { @@ -626,7 +626,7 @@ bool ConvHipImplicitGemmV4R4Fwd::IsValidPerformanceConfig( } PerformanceImplicitGemmV4R4Fwd -ConvHipImplicitGemmV4R4Fwd::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R4Fwd::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -634,7 +634,7 @@ ConvHipImplicitGemmV4R4Fwd::Search(const ConvolutionContext& ctx, } ConvSolution -ConvHipImplicitGemmV4R4Fwd::GetSolution(const ConvolutionContext& ctx, +ConvHipImplicitGemmV4R4Fwd::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmV4R4Fwd& config) const { diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp index 795e3d1704..9c09efe397 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp +++ 
b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops.cpp @@ -120,7 +120,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::SetNextValue(const ProblemDescrip return true; } -void PerformanceImplicitGemmForwardV4R4Xdlops::HeuristicInit(const ConvolutionContext& ctx, +void PerformanceImplicitGemmForwardV4R4Xdlops::HeuristicInit(const ExecutionContext& ctx, const ProblemDescription& problem) { PerformanceImplicitGemmForwardV4R4Xdlops tmp; @@ -624,7 +624,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::IsReallyValid( // Return false if a performance config is known to be sub-optimal, comparing to other performance // config inside tuning range bool PerformanceImplicitGemmForwardV4R4Xdlops::IsFastToBeUsedForTuning( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { // somehow, 128x128 wave-wise GEMM tend to spill register // TODO revisit this when 128x128 wave-wise GEMM become efficient @@ -807,7 +807,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::IsFastToBeUsedForTuning( // Return false, if you don't want to this to be included in tuning range used by generic search // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return // false -bool PerformanceImplicitGemmForwardV4R4Xdlops::IsValid(const ConvolutionContext& ctx, +bool PerformanceImplicitGemmForwardV4R4Xdlops::IsValid(const ExecutionContext& ctx, const ProblemDescription& problem) const { return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem); @@ -815,7 +815,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops::IsValid(const ConvolutionContext& // Used by GenericSearch, not used by HeuristicInit bool ConvHipImplicitGemmForwardV4R4Xdlops::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops& config) const { @@ -847,7 +847,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops::CalculateGemmSize(const ProblemDescription PerformanceImplicitGemmForwardV4R4Xdlops ConvHipImplicitGemmForwardV4R4Xdlops::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { PerformanceImplicitGemmForwardV4R4Xdlops config; config.HeuristicInit(ctx, problem); @@ -856,7 +856,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops::GetDefaultPerformanceConfig( } ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops& config) const { @@ -966,7 +966,7 @@ ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops::GetSolution( return result; } -bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ConvolutionContext& ctx, +bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_XDLOPS{})) @@ -1031,7 +1031,7 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops::IsApplicable(const ConvolutionContext } PerformanceImplicitGemmForwardV4R4Xdlops -ConvHipImplicitGemmForwardV4R4Xdlops::Search(const ConvolutionContext& ctx, +ConvHipImplicitGemmForwardV4R4Xdlops::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const diff --git 
a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp index 7fa139d21a..d25ca1b68b 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r4_xdlops_padded_gemm.cpp @@ -144,7 +144,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::SetNextValue(const Pr } void PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::HeuristicInit( - const ConvolutionContext& ctx, const ProblemDescription& problem) + const ExecutionContext& ctx, const ProblemDescription& problem) { PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm tmp; @@ -666,7 +666,7 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsReallyValid( /// Used by HeuristicInit and IsValid. Return false if a performance config is known /// to be sub-optimal, comparing to other performance config inside tuning range. bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuning( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { // somehow, 128x128 wave-wise GEMM tend to spill register // TODO revisit this when 128x128 wave-wise GEMM become efficient @@ -846,14 +846,14 @@ bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuni /// included in tuning range used by generic search. A performance config may still be valid w.r.t /// algorithm correctness, even when IsValid() returns false. bool PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsValid( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem); } // Used by GenericSearch, not used by HeuristicInit bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsValidPerformanceConfig( - const ConvolutionContext&, + const ExecutionContext&, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm& config) const { @@ -895,7 +895,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::CalculateGemmSize( PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm config; config.HeuristicInit(ctx, problem); @@ -904,7 +904,7 @@ ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig( } ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm& config) const { @@ -1033,7 +1033,7 @@ ConvSolution ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::GetSolution( } bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsApplicable( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R4_PADDED_GEMM_XDLOPS{})) return false; @@ -1124,7 +1124,7 @@ bool ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::IsApplicable( } PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm 
-ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm::Search(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem,
                                                          const AnyInvokeParams& invoke_ctx) const
diff --git a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp
index 9bd59d36b9..4915c48e2e 100644
--- a/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp
@@ -154,7 +154,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::SetNextValue(const ProblemDescrip
     return true;
 }

-void PerformanceImplicitGemmForwardV4R5Xdlops::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmForwardV4R5Xdlops::HeuristicInit(const ExecutionContext& ctx,
                                                              const ProblemDescription& problem)
 {
     PerformanceImplicitGemmForwardV4R5Xdlops tmp;
@@ -676,7 +676,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::IsReallyValid(
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmForwardV4R5Xdlops::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(use_spare_set)
         return true;
@@ -855,7 +855,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::IsFastToBeUsedForTuning(
 // Return false, if you don't want to this to be included in tuning range used by generic search
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
-bool PerformanceImplicitGemmForwardV4R5Xdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmForwardV4R5Xdlops::IsValid(const ExecutionContext& ctx,
                                                        const ProblemDescription& problem) const
 {
     return IsReallyValid(problem) && IsFastToBeUsedForTuning(ctx, problem);
@@ -863,7 +863,7 @@ bool PerformanceImplicitGemmForwardV4R5Xdlops::IsValid(const ConvolutionContext&

 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmForwardV4R5Xdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmForwardV4R5Xdlops& config) const
 {
@@ -872,7 +872,7 @@ bool ConvHipImplicitGemmForwardV4R5Xdlops::IsValidPerformanceConfig(

 PerformanceImplicitGemmForwardV4R5Xdlops
 ConvHipImplicitGemmForwardV4R5Xdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceImplicitGemmForwardV4R5Xdlops config;
     config.HeuristicInit(ctx, problem);
@@ -881,7 +881,7 @@ ConvHipImplicitGemmForwardV4R5Xdlops::GetDefaultPerformanceConfig(
 }

 ConvSolution ConvHipImplicitGemmForwardV4R5Xdlops::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmForwardV4R5Xdlops& config) const
 {
@@ -996,7 +996,7 @@ ConvSolution ConvHipImplicitGemmForwardV4R5Xdlops::GetSolution(
     return result;
 }

-bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_FWD_V4R5_XDLOPS{}))
@@ -1068,7 +1068,7 @@ bool ConvHipImplicitGemmForwardV4R5Xdlops::IsApplicable(const ConvolutionContext
 }

 PerformanceImplicitGemmForwardV4R5Xdlops
-ConvHipImplicitGemmForwardV4R5Xdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmForwardV4R5Xdlops::Search(const ExecutionContext& ctx,
                                              const ProblemDescription& problem,
                                              const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
index ae2395dd0a..73907eb788 100644
--- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
@@ -230,7 +230,7 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::operator==(
 }

 PerformanceConfigHipImplicitGemmFwdXdlops
-ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                           const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemmFwdXdlops pp;
@@ -239,7 +239,7 @@ ConvHipImplicitGemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionConte
 }

 bool ConvHipImplicitGemmFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemmFwdXdlops& config) const
 {
@@ -247,7 +247,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConfigHipImplicitGemmFwdXdlops
-ConvHipImplicitGemmFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmFwdXdlops::Search(const ExecutionContext& ctx,
                                      const ProblemDescription& problem,
                                      const AnyInvokeParams& invoke_ctx) const
 {
@@ -255,7 +255,7 @@ ConvHipImplicitGemmFwdXdlops::Search(const ConvolutionContext& ctx,
 }

 bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -303,7 +303,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
 }

 ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemmFwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
index b51031780f..c2d0b83141 100644
--- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
@@ -258,7 +258,7 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::operator==(

 PerformanceConfigHipImplicitGemmGroupFwdXdlops
 ConvHipImplicitGemmGroupFwdXdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     PerformanceConfigHipImplicitGemmGroupFwdXdlops pp;
     pp.HeuristicInit(problem);
@@ -266,7 +266,7 @@ ConvHipImplicitGemmGroupFwdXdlops::GetDefaultPerformanceConfig(
 }

 bool ConvHipImplicitGemmGroupFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceConfigHipImplicitGemmGroupFwdXdlops& config) const
 {
@@ -274,7 +274,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConfigHipImplicitGemmGroupFwdXdlops
-ConvHipImplicitGemmGroupFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmGroupFwdXdlops::Search(const ExecutionContext& ctx,
                                           const ProblemDescription& problem,
                                           const AnyInvokeParams& invoke_ctx) const
 {
@@ -282,7 +282,7 @@ ConvHipImplicitGemmGroupFwdXdlops::Search(const ConvolutionContext& ctx,
 }

 bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem) const
 {
 #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
@@ -322,7 +322,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable(
 }

 ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution(
-    [[maybe_unused]] const ConvolutionContext& ctx,
+    [[maybe_unused]] const ExecutionContext& ctx,
     [[maybe_unused]] const ProblemDescription& problem,
     [[maybe_unused]] const PerformanceConfigHipImplicitGemmGroupFwdXdlops& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp b/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
index de11fa0fa7..472b58f913 100644
--- a/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
+++ b/src/solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
@@ -57,7 +57,7 @@ bool PerformanceImplicitGemm::operator==(const PerformanceImplicitGemm& other) c
     // clang-format on
 }

-bool PerformanceImplicitGemm::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemm::IsValid(const ExecutionContext& ctx,
                                       const ProblemDescription& problem) const
 {
     std::size_t N = KernelBatchN(problem);
@@ -173,7 +173,7 @@ bool PerformanceImplicitGemm::IsValid(const ConvolutionContext& ctx,
     return (InBlockCopySubLengths_E == 1 && InBlockCopySubLengths_B == 1);
 }

-bool PerformanceImplicitGemmV4R1::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmV4R1::IsValid(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     std::size_t N = KernelBatchN(problem);
@@ -278,7 +278,7 @@ bool PerformanceImplicitGemmV4R1::IsValid(const ConvolutionContext& ctx,
     return (InBlockCopySubLengths_E == 1 && InBlockCopySubLengths_B == 1);
 }

-void PerformanceImplicitGemm::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemm::HeuristicInit(const ExecutionContext& ctx,
                                             const ProblemDescription& problem)
 {
     // default
diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp
index 0a33f611c0..637486ef50 100644
--- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp
+++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4.cpp
@@ -474,7 +474,7 @@ bool PerformanceImplicitGemmV4R4WrW::IsValid(const ProblemDescription& problem)
     return (valid and lds_size <= get_lds_max_number_of_byte());
 }

-void PerformanceImplicitGemmV4R4WrW::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmV4R4WrW::HeuristicInit(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem)
 {
     std::ignore = ctx;
@@ -575,7 +575,7 @@ ConvHipImplicitGemmV4R4WrW::CalculateGemmSize(const ProblemDescription& problem)
     return std::make_tuple(gemm_m, gemm_n, gemm_k);
 }

-bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ExecutionContext& ctx,
                                               const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4{}))
@@ -613,14 +613,14 @@ bool ConvHipImplicitGemmV4R4WrW::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceImplicitGemmV4R4WrW
-ConvHipImplicitGemmV4R4WrW::GetDefaultPerformanceConfig(const ConvolutionContext& ctx,
+ConvHipImplicitGemmV4R4WrW::GetDefaultPerformanceConfig(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
     return GetPerformanceConfigBase(ctx, problem);
 }

 bool ConvHipImplicitGemmV4R4WrW::IsValidPerformanceConfig(
-    const ConvolutionContext&,
+    const ExecutionContext&,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmV4R4WrW& config) const
 {
@@ -629,7 +629,7 @@ bool ConvHipImplicitGemmV4R4WrW::IsValidPerformanceConfig(
 }

 PerformanceImplicitGemmV4R4WrW
-ConvHipImplicitGemmV4R4WrW::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmV4R4WrW::Search(const ExecutionContext& ctx,
                                    const ProblemDescription& problem,
                                    const AnyInvokeParams& invoke_ctx) const
 {
@@ -637,7 +637,7 @@ ConvHipImplicitGemmV4R4WrW::Search(const ConvolutionContext& ctx,
 }

 ConvSolution
-ConvHipImplicitGemmV4R4WrW::GetSolution(const ConvolutionContext& ctx,
+ConvHipImplicitGemmV4R4WrW::GetSolution(const ExecutionContext& ctx,
                                         const ProblemDescription& problem,
                                         const PerformanceImplicitGemmV4R4WrW& config) const
 {
diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp
index 070ad3615f..5a42ba3255 100644
--- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp
@@ -122,7 +122,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::SetNextValue(const ProblemDescription
     return true;
 }

-void PerformanceImplicitGemmWrwV4R4Xdlops::HeuristicInit(const ConvolutionContext& ctx,
+void PerformanceImplicitGemmWrwV4R4Xdlops::HeuristicInit(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem)
 {
     PerformanceImplicitGemmWrwV4R4Xdlops tmp;
@@ -271,7 +271,7 @@ std::tuple PerformanceImplicitGemmWrwV4R4Xdlops::CalculateBlockSize()
 }

 std::tuple
-PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGridSize(const ConvolutionContext& ctx,
+PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGridSize(const ExecutionContext& ctx,
                                                         const ProblemDescription& problem) const
 {
     int GridSize = 0;
@@ -305,7 +305,7 @@ PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGridSize(const ConvolutionContext

 std::tuple
 PerformanceImplicitGemmWrwV4R4Xdlops::CalculateGemmSizeAndGemmKBlock(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int gemm_g = -1;
     int gemm_m = -1;
@@ -622,7 +622,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValidValue() const

 // Used by HeuristicInit() and GenericSearch
 // Only return false if a performance config will violate requirements given by kernel algorithm
-bool PerformanceImplicitGemmWrwV4R4Xdlops::IsReallyValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmWrwV4R4Xdlops::IsReallyValid(const ExecutionContext& ctx,
                                                          const ProblemDescription& problem) const
 {
     if(!IsValidValue())
@@ -689,7 +689,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsReallyValid(const ConvolutionContex
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmWrwV4R4Xdlops::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {

     if(use_spare_set)
@@ -842,7 +842,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsFastToBeUsedForTuning(
 // Return false, if you don't want to this to be included in tuning range used by generic search
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
-bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValid(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem) const
 {
     return IsReallyValid(ctx, problem) && IsFastToBeUsedForTuning(ctx, problem);
@@ -850,7 +850,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops::IsValid(const ConvolutionContext& ctx

 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmWrwV4R4Xdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops& config) const
 {
@@ -858,7 +858,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops::IsValidPerformanceConfig(
 }

 PerformanceImplicitGemmWrwV4R4Xdlops ConvHipImplicitGemmWrwV4R4Xdlops::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceImplicitGemmWrwV4R4Xdlops config;
     config.HeuristicInit(ctx, problem);
@@ -867,7 +867,7 @@ PerformanceImplicitGemmWrwV4R4Xdlops ConvHipImplicitGemmWrwV4R4Xdlops::GetDefaul
 }

 ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops& config) const
 {
@@ -1036,7 +1036,7 @@ ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops::GetSolution(
     return result;
 }

-bool ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ExecutionContext& ctx,
                                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_XDLOPS{}))
@@ -1100,7 +1100,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops::IsApplicable(const ConvolutionContext& ct
 }

 PerformanceImplicitGemmWrwV4R4Xdlops
-ConvHipImplicitGemmWrwV4R4Xdlops::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmWrwV4R4Xdlops::Search(const ExecutionContext& ctx,
                                          const ProblemDescription& problem,
                                          const AnyInvokeParams& invoke_ctx) const
 {
@@ -1109,7 +1109,7 @@ ConvHipImplicitGemmWrwV4R4Xdlops::Search(const ConvolutionContext& ctx,
 }

 std::size_t
-ConvHipImplicitGemmWrwV4R4Xdlops::GetWorkspaceSize(const ConvolutionContext&,
+ConvHipImplicitGemmWrwV4R4Xdlops::GetWorkspaceSize(const ExecutionContext&,
                                                    const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
diff --git a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp
index 7b4295df35..abd178dcca 100644
--- a/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp
+++ b/src/solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops_padded_gemm.cpp
@@ -123,7 +123,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::SetNextValue(const Proble
 }

 void PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::HeuristicInit(
-    const ConvolutionContext& ctx, const ProblemDescription& problem)
+    const ExecutionContext& ctx, const ProblemDescription& problem)
 {
     PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm tmp;
     // GemmMFactor GemmNFactor, GemmKTotalFactor are fixed value at this moment.
@@ -270,7 +270,7 @@ std::tuple PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Calculat
 }

 std::tuple PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::CalculateGridSize(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int GridSize = 0;
@@ -560,7 +560,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValidValue() const
 // Used by HeuristicInit() and GenericSearch
 // Only return false if a performance config will violate requirements given by kernel algorithm
 bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsReallyValid(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(!IsValidValue())
         return false;
@@ -633,7 +633,7 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsReallyValid(
 // Return false if a performance config is known to be sub-optimal, comparing to other performance
 // config inside tuning range
 bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuning(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     // somehow, 128x128 wave-wise GEMM tend to spill register
     // TODO revisit this when 128x128 wave-wise GEMM become efficient
@@ -798,14 +798,14 @@ bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsFastToBeUsedForTuning(
 // A performance config may still be valid w.r.t algorithm correctness, even when IsValid() return
 // false
 bool PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValid(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     return IsReallyValid(ctx, problem) && IsFastToBeUsedForTuning(ctx, problem);
 }

 // Used by GenericSearch, not used by HeuristicInit
 bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm& config) const
 {
@@ -814,7 +814,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsValidPerformanceConfig(

 std::tuple
 PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::CalculateGemmSizeAndGemmKBlock(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     int gemm_g = -1;
     int gemm_m = -1;
@@ -911,7 +911,7 @@ PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm::CalculateGemmSizeAndGemmKBlock

 PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm
 ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm config;
     config.HeuristicInit(ctx, problem);
@@ -920,7 +920,7 @@ ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetDefaultPerformanceConfig(
 }

 ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm& config) const
 {
@@ -1103,7 +1103,7 @@ ConvSolution ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetSolution(
 }

 bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsApplicable(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_HIP_WRW_V4R4_PADDED_GEMM_XDLOPS{}))
         return false;
@@ -1188,7 +1188,7 @@ bool ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::IsApplicable(
 }

 PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm
-ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Search(const ConvolutionContext& ctx,
+ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Search(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem,
                                                      const AnyInvokeParams& invoke_ctx) const
 {
@@ -1197,7 +1197,7 @@ ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::Search(const ConvolutionContext& c
 }

 std::size_t ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm::GetWorkspaceSize(
-    const ConvolutionContext&, const ProblemDescription& problem) const
+    const ExecutionContext&, const ProblemDescription& problem) const
 {
     if(problem.IsFp32())
         return 0;
diff --git a/src/solver/conv_mlir_igemm_bwd.cpp b/src/solver/conv_mlir_igemm_bwd.cpp
index 58787c2532..783c68350c 100644
--- a/src/solver/conv_mlir_igemm_bwd.cpp
+++ b/src/solver/conv_mlir_igemm_bwd.cpp
@@ -37,7 +37,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmBwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmBwd::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -70,13 +70,13 @@ bool ConvMlirIgemmBwd::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemm
-ConvMlirIgemmBwd::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmBwd::GetDefaultPerformanceConfig(const ExecutionContext&,
                                               const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemm::MlirHeuristicInitRequest();
 }

-bool ConvMlirIgemmBwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
+bool ConvMlirIgemmBwd::IsValidPerformanceConfig(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem,
                                                 const PerformanceConvMlirIgemm& config) const
 {
@@ -84,14 +84,14 @@ bool ConvMlirIgemmBwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
     return config.IsValid(ctx, problem);
 }

-PerformanceConvMlirIgemm ConvMlirIgemmBwd::Search(const ConvolutionContext& ctx,
+PerformanceConvMlirIgemm ConvMlirIgemmBwd::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmBwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmBwd::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConvMlirIgemm& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp
index a4a9549db8..41062cc32c 100644
--- a/src/solver/conv_mlir_igemm_bwd_xdlops.cpp
+++ b/src/solver/conv_mlir_igemm_bwd_xdlops.cpp
@@ -38,7 +38,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_BWD_XDLOPS)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmBwdXdlops::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -64,14 +64,14 @@ bool ConvMlirIgemmBwdXdlops::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmBwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmBwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                     const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemmXdlops::MlirHeuristicInitRequest();
 }

 bool ConvMlirIgemmBwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -80,14 +80,14 @@ bool ConvMlirIgemmBwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmBwdXdlops::Search(const ConvolutionContext& ctx,
+ConvMlirIgemmBwdXdlops::Search(const ExecutionContext& ctx,
                                const ProblemDescription& problem,
                                const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmBwdXdlops::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmBwdXdlops::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const PerformanceConvMlirIgemmXdlops& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_fwd.cpp b/src/solver/conv_mlir_igemm_fwd.cpp
index c4983852b1..2cc196ae10 100644
--- a/src/solver/conv_mlir_igemm_fwd.cpp
+++ b/src/solver/conv_mlir_igemm_fwd.cpp
@@ -85,7 +85,7 @@ bool PerformanceConvMlirIgemm::operator==(const PerformanceConvMlirIgemm& other)
     // clang-format on
 }

-bool PerformanceConvMlirIgemm::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConvMlirIgemm::IsValid(const ExecutionContext& ctx,
                                        const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -136,13 +136,13 @@ bool PerformanceConvMlirIgemm::SetNextValue(const ProblemDescription&)
 }

 PerformanceConvMlirIgemm
-ConvMlirIgemmFwd::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmFwd::GetDefaultPerformanceConfig(const ExecutionContext&,
                                               const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemm::MlirHeuristicInitRequest();
 }

-bool ConvMlirIgemmFwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
+bool ConvMlirIgemmFwd::IsValidPerformanceConfig(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem,
                                                 const PerformanceConvMlirIgemm& config) const
 {
@@ -150,14 +150,14 @@ bool ConvMlirIgemmFwd::IsValidPerformanceConfig(const ConvolutionContext& ctx,
     return config.IsValid(ctx, problem);
 }

-PerformanceConvMlirIgemm ConvMlirIgemmFwd::Search(const ConvolutionContext& ctx,
+PerformanceConvMlirIgemm ConvMlirIgemmFwd::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-bool ConvMlirIgemmFwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmFwd::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -189,7 +189,7 @@ bool ConvMlirIgemmFwd::IsApplicable(const ConvolutionContext& ctx,
 #endif
 }

-ConvSolution ConvMlirIgemmFwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmFwd::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConvMlirIgemm& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
index 8256213623..c761abc137 100644
--- a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
+++ b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
@@ -52,7 +52,7 @@ void PerformanceConvMlirIgemmXdlops::SetMlirHeuristicInitRequest()
     GemmBThreadCopyMoreGemmKPack = false;
 }

-bool ConvMlirIgemmFwdXdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmFwdXdlops::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -125,7 +125,7 @@ bool PerformanceConvMlirIgemmXdlops::operator==(const PerformanceConvMlirIgemmXd
     // clang-format on
 }

-bool PerformanceConvMlirIgemmXdlops::IsValid(const ConvolutionContext& ctx,
+bool PerformanceConvMlirIgemmXdlops::IsValid(const ExecutionContext& ctx,
                                              const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -189,14 +189,14 @@ bool PerformanceConvMlirIgemmXdlops::SetNextValue(const ProblemDescription& prob
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmFwdXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmFwdXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                     const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemmXdlops::MlirHeuristicInitRequest();
 }

 bool ConvMlirIgemmFwdXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -205,14 +205,14 @@ bool ConvMlirIgemmFwdXdlops::IsValidPerformanceConfig(
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmFwdXdlops::Search(const ConvolutionContext& ctx,
+ConvMlirIgemmFwdXdlops::Search(const ExecutionContext& ctx,
                                const ProblemDescription& problem,
                                const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmFwdXdlops::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmFwdXdlops::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const PerformanceConvMlirIgemmXdlops& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_wrw.cpp b/src/solver/conv_mlir_igemm_wrw.cpp
index d5f88dcaea..cb9f6ae7b2 100644
--- a/src/solver/conv_mlir_igemm_wrw.cpp
+++ b/src/solver/conv_mlir_igemm_wrw.cpp
@@ -38,7 +38,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmWrW::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmWrW::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -71,13 +71,13 @@ bool ConvMlirIgemmWrW::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemm
-ConvMlirIgemmWrW::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmWrW::GetDefaultPerformanceConfig(const ExecutionContext&,
                                               const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemm::MlirHeuristicInitRequest();
 }

-bool ConvMlirIgemmWrW::IsValidPerformanceConfig(const ConvolutionContext& ctx,
+bool ConvMlirIgemmWrW::IsValidPerformanceConfig(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem,
                                                 const PerformanceConvMlirIgemm& config) const
 {
@@ -85,14 +85,14 @@ bool ConvMlirIgemmWrW::IsValidPerformanceConfig(const ConvolutionContext& ctx,
     return config.IsValid(ctx, problem);
 }

-PerformanceConvMlirIgemm ConvMlirIgemmWrW::Search(const ConvolutionContext& ctx,
+PerformanceConvMlirIgemm ConvMlirIgemmWrW::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmWrW::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmWrW::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const PerformanceConvMlirIgemm& config) const
 {
diff --git a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp
index 2f3bc63f50..fe11c828c8 100644
--- a/src/solver/conv_mlir_igemm_wrw_xdlops.cpp
+++ b/src/solver/conv_mlir_igemm_wrw_xdlops.cpp
@@ -39,7 +39,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_MLIR_IGEMM_WRW_XDLOPS)
 namespace miopen {
 namespace solver {

-bool ConvMlirIgemmWrWXdlops::IsApplicable(const ConvolutionContext& ctx,
+bool ConvMlirIgemmWrWXdlops::IsApplicable(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
@@ -65,14 +65,14 @@ bool ConvMlirIgemmWrWXdlops::IsApplicable(const ConvolutionContext& ctx,
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmWrWXdlops::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvMlirIgemmWrWXdlops::GetDefaultPerformanceConfig(const ExecutionContext&,
                                                     const ProblemDescription&) const
 {
     return PerformanceConvMlirIgemmXdlops::MlirHeuristicInitRequest();
 }

 bool ConvMlirIgemmWrWXdlops::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -81,14 +81,14 @@ bool ConvMlirIgemmWrWXdlops::IsValidPerformanceConfig(
 }

 PerformanceConvMlirIgemmXdlops
-ConvMlirIgemmWrWXdlops::Search(const ConvolutionContext& ctx,
+ConvMlirIgemmWrWXdlops::Search(const ExecutionContext& ctx,
                                const ProblemDescription& problem,
                                const AnyInvokeParams& invoke_ctx) const
 {
     return GenericSearch(*this, ctx, problem, invoke_ctx);
 }

-ConvSolution ConvMlirIgemmWrWXdlops::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvMlirIgemmWrWXdlops::GetSolution(const ExecutionContext& ctx,
                                                  const ProblemDescription& problem,
                                                  const PerformanceConvMlirIgemmXdlops& config) const
 {
@@ -130,7 +130,7 @@ ConvSolution ConvMlirIgemmWrWXdlops::GetSolution(const ConvolutionContext& ctx,
 #endif
 }

-std::size_t ConvMlirIgemmWrWXdlops::GetWorkspaceSize(const ConvolutionContext& ctx,
+std::size_t ConvMlirIgemmWrWXdlops::GetWorkspaceSize(const ExecutionContext& ctx,
                                                      const ProblemDescription& problem) const
 {
 #if MIOPEN_USE_MLIR
diff --git a/src/solver/conv_ocl_dir2D11x11.cpp b/src/solver/conv_ocl_dir2D11x11.cpp
index 145cf9ed13..b76621a591 100644
--- a/src/solver/conv_ocl_dir2D11x11.cpp
+++ b/src/solver/conv_ocl_dir2D11x11.cpp
@@ -36,7 +36,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11)
 namespace miopen {
 namespace solver {

-bool ConvOclDirectFwd11x11::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclDirectFwd11x11::IsApplicable(const ExecutionContext& ctx,
                                          const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD11X11{}))
@@ -65,7 +65,7 @@ bool ConvOclDirectFwd11x11::IsApplicable(const ConvolutionContext& ctx,
            problem.GetKernelStrideH() == 4 && problem.GetKernelStrideW() == 4;
 }

-ConvSolution ConvOclDirectFwd11x11::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd11x11::GetSolution(const ExecutionContext& ctx,
                                                 const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp
index eded6fddf8..4e0cda8629 100644
--- a/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp
+++ b/src/solver/conv_ocl_dir2D_bwdWrW_1x1.cpp
@@ -39,7 +39,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW1X1)
 namespace miopen {
 namespace solver {

-bool ConvOclBwdWrW1x1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW1x1::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_266868
@@ -96,7 +96,7 @@ static inline int GetNPasses(const ProblemDescription& problem)
     return n_passes;
 }

-size_t ConvOclBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvOclBwdWrW1x1::GetWorkspaceSize(const ExecutionContext&,
                                           const ProblemDescription& problem) const
 {
     const int n_passes = GetNPasses(problem);
@@ -112,7 +112,7 @@ size_t ConvOclBwdWrW1x1::GetWorkspaceSize(const ConvolutionContext&,
     return 0;
 }

-ConvSolution ConvOclBwdWrW1x1::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclBwdWrW1x1::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
index d4e79250e1..2b400909f8 100644
--- a/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
+++ b/src/solver/conv_ocl_dir2D_bwdWrW_2.cpp
@@ -139,7 +139,7 @@ static bool IsTunable(const ProblemDescription& problem)
             (problem.GetWeightsWidth_() == 1 && problem.GetWeightsHeight_() == 1)));
 }

-bool ConvOclBwdWrW2NonTunable::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW2NonTunable::IsApplicable(const ExecutionContext& ctx,
                                             const ProblemDescription& problem) const
 {
     // At present, auto-tuning is disabled for non-group 3x3 and 1x1 filters for multiple
@@ -148,7 +148,7 @@ bool ConvOclBwdWrW2NonTunable::IsApplicable(const ConvolutionContext& ctx,
     return ConvOclBwdWrW2<1>::IsApplicableBase(ctx, problem) && !IsTunable(problem);
 }

-ConvSolution ConvOclBwdWrW2NonTunable::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclBwdWrW2NonTunable::GetSolution(const ExecutionContext& ctx,
                                                    const ProblemDescription& problem) const
 {
     // Invoking base class GetSolution with default values for params obtained
@@ -230,7 +230,7 @@ static size_t GetNBatchBlks(const ProblemDescription& problem)

 template
 bool PerformanceConfigConvOclBwdWrw2::IsValid(
-    const ConvolutionContext& ctx, const ProblemDescription& problem) const
+    const ExecutionContext& ctx, const ProblemDescription& problem) const
 {
     if(!IsValidValue())
     {
@@ -438,7 +438,7 @@ void PerformanceConfigConvOclBwdWrw2::HeuristicInit(

 template
 bool ConvOclBwdWrW2::IsValidPerformanceConfig(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigConvOclBwdWrw2& config) const
 {
@@ -446,7 +446,7 @@ bool ConvOclBwdWrW2::IsValidPerformanceConfig(
 }

 template
-bool ConvOclBwdWrW2::IsApplicableBase(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW2::IsApplicableBase(const ExecutionContext& ctx,
                                       const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW2{}))
@@ -506,7 +506,7 @@ bool ConvOclBwdWrW2::IsApplicableBase(const ConvolutionContext& c
 }

 template
-bool ConvOclBwdWrW2::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW2::IsApplicable(const ExecutionContext& ctx,
                                   const ProblemDescription& problem) const
 {
     return IsApplicableBase(ctx, problem) && IsTunable(problem);
@@ -514,7 +514,7 @@ bool ConvOclBwdWrW2::IsApplicable(const ConvolutionContext& ctx,

 template
 PerformanceConfigConvOclBwdWrw2
-ConvOclBwdWrW2::GetDefaultPerformanceConfig(const ConvolutionContext&,
+ConvOclBwdWrW2::GetDefaultPerformanceConfig(const ExecutionContext&,
                                             const ProblemDescription& problem) const
 {
     PerformanceConfigConvOclBwdWrw2 pp;
@@ -523,7 +523,7 @@ ConvOclBwdWrW2::GetDefaultPerformanceConfig(const ConvolutionCont
 }

 template
-size_t ConvOclBwdWrW2::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvOclBwdWrW2::GetWorkspaceSize(const ExecutionContext&,
                                         const ProblemDescription& problem) const
 {
     const size_t n_batch_blks = GetNBatchBlks(problem);
@@ -543,7 +543,7 @@ size_t ConvOclBwdWrW2::GetWorkspaceSize(const ConvolutionContext&

 template
 ConvSolution ConvOclBwdWrW2::GetSolution(
-    const ConvolutionContext& ctx,
+    const ExecutionContext& ctx,
     const ProblemDescription& problem,
     const PerformanceConfigConvOclBwdWrw2& config) const
 {
@@ -747,7 +747,7 @@ ConvSolution ConvOclBwdWrW2::GetSolution(

 template
 PerformanceConfigConvOclBwdWrw2
-ConvOclBwdWrW2::Search(const ConvolutionContext& ctx,
+ConvOclBwdWrW2::Search(const ExecutionContext& ctx,
                        const ProblemDescription& problem,
                        const AnyInvokeParams& invoke_ctx) const
 {
diff --git a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp
index 2db6109bf5..4f00c8f55b 100644
--- a/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp
+++ b/src/solver/conv_ocl_dir2D_bwdWrW_53.cpp
@@ -40,7 +40,7 @@ namespace solver {
 // problematic configs.
 static bool WorkaroundSwdev168168() { return true; }

-bool ConvOclBwdWrW53::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclBwdWrW53::IsApplicable(const ExecutionContext& ctx,
                                    const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_WRW53{}))
@@ -317,7 +317,7 @@ static inline void ComputeNumInputWidthLoops(
     }
 }

-size_t ConvOclBwdWrW53::GetWorkspaceSize(const ConvolutionContext&,
+size_t ConvOclBwdWrW53::GetWorkspaceSize(const ExecutionContext&,
                                          const ProblemDescription& problem) const
 {
     int n_stacks = std::min(problem.GetBatchSize_(), 1U);
@@ -339,7 +339,7 @@ size_t ConvOclBwdWrW53::GetWorkspaceSize(const ConvolutionContext&,
     return 0;
 }

-ConvSolution ConvOclBwdWrW53::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclBwdWrW53::GetSolution(const ExecutionContext& ctx,
                                           const ProblemDescription& problem) const
 {
     ConvSolution result;
diff --git a/src/solver/conv_ocl_dir2Dfwd.cpp b/src/solver/conv_ocl_dir2Dfwd.cpp
index 70005e606e..c7bd8c00df 100644
--- a/src/solver/conv_ocl_dir2Dfwd.cpp
+++ b/src/solver/conv_ocl_dir2Dfwd.cpp
@@ -35,7 +35,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD)
 namespace miopen {
 namespace solver {

-bool ConvOclDirectFwd::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclDirectFwd::IsApplicable(const ExecutionContext& ctx,
                                     const ProblemDescription& problem) const
 {
     if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD{}))
@@ -118,7 +118,7 @@ bool ConvOclDirectFwd::IsApplicable(const ConvolutionContext& ctx,
 /// and some logic from the corresponding opencl kernel source.
 /// The cases which lead to errors can be later omitted from the search.
 /// \todo Get rid the duplication of code where possible.
-bool ConvOclDirectFwd::IsValidPerformanceConfig(const ConvolutionContext&,
+bool ConvOclDirectFwd::IsValidPerformanceConfig(const ExecutionContext&,
                                                 const ProblemDescription& problem,
                                                 const LegacyPerformanceConfig& config) const
 {
@@ -276,7 +276,7 @@ bool ConvOclDirectFwd::IsValidPerformanceConfig(const ConvolutionContext&,
     return true;
 }

-ConvSolution ConvOclDirectFwd::BaseGetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd::BaseGetSolution(const ExecutionContext& ctx,
                                                const ProblemDescription& problem,
                                                const LegacyPerformanceConfig& config)
 {
@@ -485,7 +485,7 @@ ConvSolution ConvOclDirectFwd::BaseGetSolution(const ConvolutionContext& ctx,
     return result;
 }

-ConvSolution ConvOclDirectFwd::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd::GetSolution(const ExecutionContext& ctx,
                                            const ProblemDescription& problem,
                                            const LegacyPerformanceConfig& config) const
 {
diff --git a/src/solver/conv_ocl_dir2Dfwd1x1.cpp b/src/solver/conv_ocl_dir2Dfwd1x1.cpp
index 71dc41fc65..b21effc0b3 100644
--- a/src/solver/conv_ocl_dir2Dfwd1x1.cpp
+++ b/src/solver/conv_ocl_dir2Dfwd1x1.cpp
@@ -38,7 +38,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1)
 namespace miopen {
 namespace solver {

-bool ConvOclDirectFwd1x1::IsApplicable(const ConvolutionContext& ctx,
+bool ConvOclDirectFwd1x1::IsApplicable(const ExecutionContext& ctx,
                                        const ProblemDescription& problem) const
 {
 #if WORKAROUND_SWDEV_271887
@@ -76,7 +76,7 @@ bool ConvOclDirectFwd1x1::IsApplicable(const ConvolutionContext& ctx,
            problem.GetPadW() == 0 && problem.GetPadH() == 0;
 }

-ConvSolution ConvOclDirectFwd1x1::GetSolution(const ConvolutionContext& ctx,
+ConvSolution ConvOclDirectFwd1x1::GetSolution(const ExecutionContext& ctx,
                                               const ProblemDescription& problem,
                                               const LegacyPerformanceConfig& config) const
 {
diff --git a/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp b/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp
index 7d5d320b81..5e21bcae07 100644
--- a/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp
+++ b/src/solver/conv_ocl_dir2Dfwd_exhaustive_search.cpp
@@ -54,7 +54,7 @@ namespace solver {
  * select default configuration if a known configuration has not been found.
*/ LegacyPerformanceConfig ConvOclDirectFwdLegacyExhaustiveSearch::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { // LegacyPerformanceConfig result{}; @@ -142,7 +142,7 @@ static int MeasurePerfConfig(const Handle& handle, ConstData_t wei_ocl_buf, ConstData_t bias_ocl_buf, double& processing_time, - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const LegacyPerformanceConfig& config) { @@ -213,7 +213,7 @@ static int MeasurePerfConfig(const Handle& handle, } LegacyPerformanceConfig -ConvOclDirectFwdLegacyExhaustiveSearch::Search(const ConvolutionContext& ctx, +ConvOclDirectFwdLegacyExhaustiveSearch::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -231,7 +231,7 @@ ConvOclDirectFwdLegacyExhaustiveSearch::Search(const ConvolutionContext& ctx, template LegacyPerformanceConfig -ConvOclDirectFwdLegacyExhaustiveSearch::SearchImpl(const ConvolutionContext& ctx, +ConvOclDirectFwdLegacyExhaustiveSearch::SearchImpl(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { diff --git a/src/solver/conv_ocl_dir2Dfwdgen.cpp b/src/solver/conv_ocl_dir2Dfwdgen.cpp index 80fb0708e1..f35e57b71c 100644 --- a/src/solver/conv_ocl_dir2Dfwdgen.cpp +++ b/src/solver/conv_ocl_dir2Dfwdgen.cpp @@ -34,7 +34,7 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN) namespace miopen { namespace solver { -bool ConvOclDirectFwdGen::IsApplicable(const ConvolutionContext& ctx, +bool ConvOclDirectFwdGen::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWDGEN{})) @@ -97,7 +97,7 @@ bool ConvOclDirectFwdGen::IsApplicable(const ConvolutionContext& ctx, && (problem.GetKernelStrideW() > 1 || problem.GetKernelStrideH() > 1))); // clang-format on } -ConvSolution ConvOclDirectFwdGen::GetSolution(const ConvolutionContext& ctx, +ConvSolution ConvOclDirectFwdGen::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const { int n_in_stacks = 0; diff --git a/src/solver/conv_winoRxS.cpp b/src/solver/conv_winoRxS.cpp index cdff22ac1d..d9cbeb713f 100644 --- a/src/solver/conv_winoRxS.cpp +++ b/src/solver/conv_winoRxS.cpp @@ -302,7 +302,7 @@ PerformanceConfigConvBinWinogradRxS::PerformanceConfigConvBinWinogradRxS(int n_g } template -void PerformanceConfigConvBinWinogradRxS::HeuristicInit(const ConvolutionContext& ctx, +void PerformanceConfigConvBinWinogradRxS::HeuristicInit(const ExecutionContext& ctx, const ProblemDescription& problem) { const auto n_inputs_per_group = problem.GetInChannels_() / problem.GetGroupCount(), @@ -365,7 +365,7 @@ bool PerformanceConfigConvBinWinogradRxS::IsValidValue() const return PerfFieldRules().IsIn(*this); } -bool PerformanceConfigConvBinWinogradRxS::IsValid(const ConvolutionContext& ctx) const +bool PerformanceConfigConvBinWinogradRxS::IsValid(const ExecutionContext& ctx) const { if(ctx.GetStream().GetMaxHardwareComputeUnits() < n_groups) return false; @@ -384,7 +384,7 @@ bool PerformanceConfigConvBinWinogradRxS::operator==( template PerformanceConfigConvBinWinogradRxS ConvBinWinoRxS::GetDefaultPerformanceConfig( - const ConvolutionContext& ctx, const ProblemDescription& problem) const + const ExecutionContext& ctx, const ProblemDescription& problem) const { PerformanceConfigConvBinWinogradRxS pp; 
pp.HeuristicInit(ctx, problem); @@ -394,7 +394,7 @@ ConvBinWinoRxS::GetDefaultPerformanceConfig( template bool ConvBinWinoRxS::IsValidPerformanceConfig( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription&, const PerformanceConfigConvBinWinogradRxS& config) const { @@ -403,7 +403,7 @@ bool ConvBinWinoRxS::IsValidPerformanceConfig( template PerformanceConfigConvBinWinogradRxS -ConvBinWinoRxS::Search(const ConvolutionContext& ctx, +ConvBinWinoRxS::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { @@ -436,7 +436,7 @@ class ShaderModel : public UnifiedDescriptionConv2d bool out_of_model_scope; // Shader model produces unreliable results. public: - ShaderModel(const ConvolutionContext& ctx, + ShaderModel(const ExecutionContext& ctx, const ProblemDescription& problem, size_t Winodata, size_t Winofilter) @@ -610,7 +610,7 @@ class ShaderModel : public UnifiedDescriptionConv2d }; template -static float GetWtiBase(const ConvolutionContext& ctx, const ProblemDescription& problem) +static float GetWtiBase(const ExecutionContext& ctx, const ProblemDescription& problem) { constexpr auto WTI_UNKNOWN = -2.0; const auto rv = ShaderModel(ctx, problem, Winodata, Winofilter).ComputeWti(); @@ -618,7 +618,7 @@ static float GetWtiBase(const ConvolutionContext& ctx, const ProblemDescription& } template -static bool IsApplicableBase(const ConvolutionContext& ctx, const ProblemDescription& problem) +static bool IsApplicableBase(const ExecutionContext& ctx, const ProblemDescription& problem) { if(!problem.Is2d()) return false; @@ -694,7 +694,7 @@ static bool IsApplicableBase(const ConvolutionContext& ctx, const ProblemDescrip } template -bool ConvBinWinoRxS::IsApplicable(const ConvolutionContext& ctx, +bool ConvBinWinoRxS::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(IS2X3) @@ -716,7 +716,7 @@ bool ConvBinWinoRxS::IsApplicable(const ConvolutionContext template static inline boost::optional -GetPerfConfFromEnv(const ConvolutionContext& ctx) +GetPerfConfFromEnv(const ExecutionContext& ctx) { PerformanceConfigConvBinWinogradRxS fromEnv; std::string s; @@ -752,7 +752,7 @@ GetPerfConfFromEnv(const ConvolutionContext& ctx) template ConvSolution ConvBinWinoRxS::GetSolution( - const ConvolutionContext& ctx, + const ExecutionContext& ctx, const ProblemDescription& problem, const PerformanceConfigConvBinWinogradRxS& config) const { @@ -1075,7 +1075,7 @@ ConvSolution ConvBinWinoRxS::GetSolution( return result; } -bool ConvBinWinogradRxSf2x3g1::IsApplicable(const ConvolutionContext& ctx, +bool ConvBinWinogradRxSf2x3g1::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(miopen::IsDisabled(MIOPEN_DEBUG_AMD_WINOGRAD_RXS_F2X3_G1{})) @@ -1083,13 +1083,13 @@ bool ConvBinWinogradRxSf2x3g1::IsApplicable(const ConvolutionContext& ctx, return IsApplicableBase<2, 3>(ctx, problem) && problem.GetGroupCount() == 1; } -float ConvBinWinogradRxSf2x3g1::GetWti(const ConvolutionContext& ctx, +float ConvBinWinogradRxSf2x3g1::GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const { return GetWtiBase<2, 3>(ctx, problem); } -ConvSolution ConvBinWinogradRxSf2x3g1::GetSolution(const ConvolutionContext& ctx, +ConvSolution ConvBinWinogradRxSf2x3g1::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const { const auto tunable = ConvBinWinoRxS<2, 3>{}; diff --git a/src/solver/conv_wino_fury_RxS.cpp 
b/src/solver/conv_wino_fury_RxS.cpp index 12c9639a66..89f870e35e 100644 --- a/src/solver/conv_wino_fury_RxS.cpp +++ b/src/solver/conv_wino_fury_RxS.cpp @@ -165,7 +165,7 @@ class ShaderModel : public UnifiedDescriptionConv2d } // namespace template -bool ConvWinoFuryRxS::IsApplicable(const ConvolutionContext& ctx, +bool ConvWinoFuryRxS::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { if(!problem.Is2d()) @@ -195,7 +195,7 @@ bool ConvWinoFuryRxS::IsApplicable(const ConvolutionContex } template -float ConvWinoFuryRxS::GetWti(const ConvolutionContext& ctx, +float ConvWinoFuryRxS::GetWti(const ExecutionContext& ctx, const ProblemDescription& problem) const { auto n_groups = ctx.GetStream().GetMaxHardwareComputeUnits(); @@ -204,7 +204,7 @@ float ConvWinoFuryRxS::GetWti(const ConvolutionContext& ct template ConvSolution -ConvWinoFuryRxS::GetSolution(const ConvolutionContext& ctx, +ConvWinoFuryRxS::GetSolution(const ExecutionContext& ctx, const ProblemDescription& problem) const { // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp index eaaa0e42c3..4101db86f9 100644 --- a/src/solver/mlir_common.cpp +++ b/src/solver/mlir_common.cpp @@ -121,7 +121,7 @@ static std::string GetOperation(const ProblemDescription& problem) /* Construct the options string passed to MLIR to cause it to generate a given convolution.*/ -std::string ConstructBuildOptions(const ConvolutionContext& ctx, +std::string ConstructBuildOptions(const ExecutionContext& ctx, const ProblemDescription& problem, bool is_xdlops, int kernel_id) diff --git a/test/conv_common.hpp b/test/conv_common.hpp index 3d510bb21b..99f964a0f6 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -86,7 +86,7 @@ static inline bool is_direct_fwd_bwd_data_supported(miopen::Handle& handle, (dir == miopen::conv::Direction::Forward) ? 
miopen::conv::ProblemDescription{xDesc, wDesc, yDesc, convDesc, dir} : miopen::conv::ProblemDescription{yDesc, wDesc, xDesc, convDesc, dir}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.do_search = false; ctx.save_srch_req = false; ctx.disable_perfdb_access = true; @@ -110,7 +110,7 @@ static inline bool is_direct_bwd_wrw_supported(miopen::Handle& handle, const auto problem = miopen::conv::ProblemDescription{ yDesc, wDesc, xDesc, convDesc, miopen::conv::Direction::BackwardWeights}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.do_search = false; ctx.save_srch_req = false; @@ -136,7 +136,7 @@ static inline bool skip_config(miopen::Handle& handle, const auto conv_problem = miopen::conv::ProblemDescription{ xDesc, wDesc, yDesc, convDesc, miopen::conv::Direction::Forward}; const auto problem = miopen::ProblemDescription{conv_problem}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.do_search = false; ctx.save_srch_req = false; diff --git a/test/embed_sqlite.cpp b/test/embed_sqlite.cpp index dfb663802e..32aa1371a8 100644 --- a/test/embed_sqlite.cpp +++ b/test/embed_sqlite.cpp @@ -31,7 +31,7 @@ #if MIOPEN_EMBED_DB #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ struct EmbedSQLite : test_driver const auto conv_problem = miopen::conv::ProblemDescription{ x.desc, w.desc, y.desc, filter, miopen::conv::Direction::Forward}; const auto problem = miopen::ProblemDescription{conv_problem}; - miopen::ConvolutionContext ctx{}; + miopen::ExecutionContext ctx{}; ctx.SetStream(&handle); // Check PerfDb { diff --git a/test/gpu_conv.hpp b/test/gpu_conv.hpp index 240b191028..3165ace5fc 100644 --- a/test/gpu_conv.hpp +++ b/test/gpu_conv.hpp @@ -89,7 +89,7 @@ bool gpu_ref_convolution_fwd(const tensor& input, input.desc, in_dev.get(), weights.desc, wei_dev.get(), rout.desc, out_dev.get()}; const auto problem = miopen::conv::ProblemDescription{ input.desc, weights.desc, rout.desc, filter, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); if(naive_solver.IsApplicable(ctx, problem)) { @@ -125,7 +125,7 @@ bool gpu_ref_convolution_bwd(tensor& input, output.desc, out_dev.get(), weights.desc, wei_dev.get(), input.desc, in_dev.get()}; const auto problem = miopen::conv::ProblemDescription{ output.desc, weights.desc, input.desc, filter, miopen::conv::Direction::BackwardData}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); if(naive_solver.IsApplicable(ctx, problem)) { @@ -165,7 +165,7 @@ bool gpu_ref_convolution_wrw(const tensor& input, input.desc, filter, miopen::conv::Direction::BackwardWeights}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); if(naive_solver.IsApplicable(ctx, problem)) { diff --git a/test/gtest/db_sync.cpp b/test/gtest/db_sync.cpp index e6a4288e1f..d7e59468a2 100644 --- a/test/gtest/db_sync.cpp +++ b/test/gtest/db_sync.cpp @@ -29,7 +29,7 @@ #include #include "get_handle.hpp" #include -#include +#include #include #include @@ -402,7 +402,7 @@ TEST(DBSync, DISABLED_DynamicFDBSync) std::unordered_map checked_kdbs; auto& handle = get_handle(); - auto _ctx = miopen::ConvolutionContext{}; + auto _ctx = miopen::ExecutionContext{}; _ctx.SetStream(&handle); for(const auto& kinder : find_db.GetCacheMap()) @@ -458,7 +458,7 @@ TEST(DbSync, DISABLED_StaticFDBSync) 
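Every test hunk in this part of the patch repeats the same two-line scaffold: default-construct the context, then attach the stream. Since solver applicability checks only need the device stream plus the problem, the scaffold could be factored into a helper; a minimal sketch, not part of the patch:

inline miopen::ExecutionContext MakeTestContext(miopen::Handle& handle)
{
    auto ctx = miopen::ExecutionContext{};
    ctx.SetStream(&handle);
    return ctx;
}

// Usage matching the hunks above:
//   auto ctx = MakeTestContext(get_handle());
//   if(naive_solver.IsApplicable(ctx, problem)) { ... }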
std::unordered_map checked_kdbs; auto& handle = get_handle(); - auto _ctx = miopen::ConvolutionContext{}; + auto _ctx = miopen::ExecutionContext{}; _ctx.SetStream(&handle); size_t cnt_finddb_entry = 0; for(const auto& kinder : find_db.GetCacheMap()) diff --git a/test/gtest/group_conv3d_bwd.cpp b/test/gtest/group_conv3d_bwd.cpp index 8e794749dd..e53a690021 100644 --- a/test/gtest/group_conv3d_bwd.cpp +++ b/test/gtest/group_conv3d_bwd.cpp @@ -56,7 +56,7 @@ void SolverBwd(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::BackwardData}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/group_conv3d_fwd.cpp b/test/gtest/group_conv3d_fwd.cpp index 876f513258..2b52a1b43a 100644 --- a/test/gtest/group_conv3d_fwd.cpp +++ b/test/gtest/group_conv3d_fwd.cpp @@ -56,7 +56,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/group_conv3d_wrw.cpp b/test/gtest/group_conv3d_wrw.cpp index 0fae3402d8..13e88da5ad 100644 --- a/test/gtest/group_conv3d_wrw.cpp +++ b/test/gtest/group_conv3d_wrw.cpp @@ -57,7 +57,7 @@ void SolverWrw(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::BackwardWeights}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/group_conv_fwd.cpp b/test/gtest/group_conv_fwd.cpp index e5a1dbbea9..c8fdec4cae 100644 --- a/test/gtest/group_conv_fwd.cpp +++ b/test/gtest/group_conv_fwd.cpp @@ -56,7 +56,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const auto problem = miopen::conv::ProblemDescription{ inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; - auto ctx = miopen::ConvolutionContext{}; + auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); diff --git a/test/gtest/kernel_tuning_net.cpp b/test/gtest/kernel_tuning_net.cpp index 0a8885486d..65c7977944 100644 --- a/test/gtest/kernel_tuning_net.cpp +++ b/test/gtest/kernel_tuning_net.cpp @@ -93,7 +93,7 @@ void TestParameterPredictionModel(miopen::ProblemDescription problem, auto&& handle = get_handle(); if(handle.GetDeviceName() != "gfx908") GTEST_SKIP(); - miopen::ConvolutionContext ctx; + miopen::ExecutionContext ctx; ctx.SetStream(&handle); T perf_config; bool valid = false; diff --git a/test/gtest/solver_bwd.hpp b/test/gtest/solver_bwd.hpp index 728c38fcda..febc35ae01 100644 --- a/test/gtest/solver_bwd.hpp +++ b/test/gtest/solver_bwd.hpp @@ -59,9 +59,9 @@ struct ConvBwdSolverTest output.desc, conv_desc, miopen::conv::Direction::BackwardData); - const auto problem = miopen::ProblemDescription{conv_problem}; - const miopen::ConvolutionContext ctx = [&] { - auto tmp = miopen::ConvolutionContext{&handle}; + const auto problem = miopen::ProblemDescription{conv_problem}; + const miopen::ExecutionContext ctx = [&] { + auto tmp = miopen::ExecutionContext{&handle}; problem.conv_problem.SetupFloats(tmp); return tmp; }(); diff --git a/test/gtest/solver_fwd.hpp b/test/gtest/solver_fwd.hpp index ac30ad15ca..a41667d5fc 100644 --- a/test/gtest/solver_fwd.hpp +++ 
b/test/gtest/solver_fwd.hpp @@ -62,8 +62,8 @@ struct ConvFwdSolverTest this->output.desc, this->conv_desc, miopen::conv::Direction::Forward}); - const miopen::ConvolutionContext ctx = [&] { - auto tmp = miopen::ConvolutionContext{&handle}; + const miopen::ExecutionContext ctx = [&] { + auto tmp = miopen::ExecutionContext{&handle}; problem.conv_problem.SetupFloats(tmp); return tmp; }(); diff --git a/test/gtest/solver_wrw.hpp b/test/gtest/solver_wrw.hpp index 51f078fd21..6adba65bf6 100644 --- a/test/gtest/solver_wrw.hpp +++ b/test/gtest/solver_wrw.hpp @@ -59,8 +59,8 @@ struct ConvWrwSolverTest input.desc, conv_desc, miopen::conv::Direction::BackwardWeights}); - const miopen::ConvolutionContext ctx = [&] { - auto tmp = miopen::ConvolutionContext{&handle}; + const miopen::ExecutionContext ctx = [&] { + auto tmp = miopen::ExecutionContext{&handle}; problem.conv_problem.SetupFloats(tmp); return tmp; }(); diff --git a/test/gtest/tuna_net.cpp b/test/gtest/tuna_net.cpp index 288c4f0068..6eff96daeb 100644 --- a/test/gtest/tuna_net.cpp +++ b/test/gtest/tuna_net.cpp @@ -91,7 +91,7 @@ void TestSolverPredictionModel(miopen::ProblemDescription& problem, std::size_t std::string device = handle.GetDeviceName(); if(device != "gfx908") GTEST_SKIP(); - miopen::ConvolutionContext ctx; + miopen::ExecutionContext ctx; ctx.SetStream(&handle); std::vector solvers = miopen::ai::immed_mode::PredictSolver(problem, ctx, device); std::size_t solver = diff --git a/test/solver.cpp b/test/solver.cpp index 3777daf6ae..d61524a29e 100644 --- a/test/solver.cpp +++ b/test/solver.cpp @@ -49,12 +49,12 @@ class TrivialTestSolver final : public solver::ConvSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription& problem) const override + bool IsApplicable(const ExecutionContext&, const ProblemDescription& problem) const override { return problem.GetInWidth_() == 1; } - solver::ConvSolution GetSolution(const ConvolutionContext&, + solver::ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&) const override { solver::ConvSolution ret; @@ -88,12 +88,12 @@ class SearchableTestSolver final : public solver::ConvTunableSolver const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override + bool IsApplicable(const ExecutionContext&, const ProblemDescription&) const override { return true; } - TestConfig GetDefaultPerformanceConfig(const ConvolutionContext&, + TestConfig GetDefaultPerformanceConfig(const ExecutionContext&, const ProblemDescription&) const override { TestConfig config{}; @@ -101,14 +101,14 @@ class SearchableTestSolver final : public solver::ConvTunableSolver return config; } - bool IsValidPerformanceConfig(const ConvolutionContext&, + bool IsValidPerformanceConfig(const ExecutionContext&, const ProblemDescription&, const TestConfig&) const override { return true; } - TestConfig Search(const ConvolutionContext&, + TestConfig Search(const ExecutionContext&, const ProblemDescription&, const AnyInvokeParams&) const override { @@ -118,7 +118,7 @@ class SearchableTestSolver final : public solver::ConvTunableSolver return config; } - solver::ConvSolution GetSolution(const ConvolutionContext&, + solver::ConvSolution GetSolution(const ExecutionContext&, const ProblemDescription&, const TestConfig& config) const override { @@ -140,7 +140,7 @@ class SearchableTestSolver final : public solver::ConvTunableSolver // 
NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) int SearchableTestSolver::_serches_done = 0; -static solver::ConvSolution FindSolution(const ConvolutionContext& ctx, +static solver::ConvSolution FindSolution(const ExecutionContext& ctx, const ProblemDescription& problem, const std::string& db_path) { @@ -175,29 +175,29 @@ class SolverTest ConstructTest(db_path, TrivialTestSolver::FileName(), {1, 1, 1, 1}, - [](ConvolutionContext& c) { c.do_search = true; }); + [](ExecutionContext& c) { c.do_search = true; }); ConstructTest(db_path, SearchableTestSolver::NoSearchFileName(), {1, 1, 1, 2}, - [](ConvolutionContext& c) { c.do_search = false; }); + [](ExecutionContext& c) { c.do_search = false; }); ConstructTest(db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, - [](ConvolutionContext& c) { c.do_search = true; }); + [](ExecutionContext& c) { c.do_search = true; }); const auto& searchable_solver = StaticContainer::Instance(); const auto searches = SearchableTestSolver::searches_done(); // Should read in both cases: result is already in DB, solver is searchable. ConstructTest( - db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, [](ConvolutionContext&) {}); + db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, [](ExecutionContext&) {}); ConstructTest(db_path, SearchableTestSolver::FileName(), {1, 1, 1, 2}, - [](ConvolutionContext& c) { c.do_search = true; }); + [](ExecutionContext& c) { c.do_search = true; }); // Checking no more searches were done. EXPECT_EQUAL(searches, searchable_solver.searches_done()); @@ -208,14 +208,14 @@ class SolverTest const std::string& db_path, const char* expected_kernel, const std::initializer_list& in, - const std::function& context_filler = [](ConvolutionContext&) {}) + const std::function& context_filler = [](ExecutionContext&) {}) { const auto problem = conv::ProblemDescription{TensorDescriptor{miopenFloat, in}, TensorDescriptor{miopenFloat, in}, TensorDescriptor{miopenFloat, in}, ConvolutionDescriptor{}, conv::Direction::Forward}; - auto ctx = ConvolutionContext{}; + auto ctx = ExecutionContext{}; ctx.SetStream(&get_handle()); context_filler(ctx); From 55f9999d2bdef79c38bc238872930aa37f50084a Mon Sep 17 00:00:00 2001 From: JD Date: Mon, 25 Sep 2023 21:43:03 -0500 Subject: [PATCH 10/36] [Jenkins][CI] clean workspace after each stage (#2412) --- Jenkinsfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Jenkinsfile b/Jenkinsfile index 9d10064b59..433f4e9622 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -309,6 +309,7 @@ def reboot(){ def buildHipClangJobAndReboot(Map conf=[:]){ try{ buildHipClangJob(conf) + cleanWs() } catch(e){ echo "throwing error exception for the stage" @@ -362,6 +363,7 @@ def RunPerfTest(Map conf=[:]){ catch (Exception err){ currentBuild.result = 'SUCCESS' } + cleanWs() } } } From d2909a6f6b7607b4b7cbb23b62eb8ef63dc508ab Mon Sep 17 00:00:00 2001 From: xinlipn Date: Mon, 25 Sep 2023 23:30:28 -0700 Subject: [PATCH 11/36] [tests] convert test_conv_igemm_mlir_fwd to gTest (#2291) --- test/CMakeLists.txt | 42 ------- test/gtest/CMakeLists.txt | 2 +- test/gtest/conv_igemm_dynamic.cpp | 189 ++++++++++++++++++++++++++++++ 3 files changed, 190 insertions(+), 43 deletions(-) create mode 100644 test/gtest/conv_igemm_dynamic.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9f6432ec6d..d2d8a83436 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1218,48 +1218,6 @@ set(DYNAMIC_IMPLICITGEMM_XDLOPS_NHWC_BWD_ENVS ${DYNAMIC_IMPLICITGEMM_COMMON} 
MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC) -if(${CODECOV_TEST}) - add_custom_test(test_conv_igemm_dynamic_small GFX908_DISABLED GFX90A_DISABLED SKIP_XNACK_ON - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights --disable-validation - COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data --disable-validation - COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights --disable-validation - ) - set_tests_properties(test_conv_igemm_dynamic_small PROPERTIES COST 800) -else() - add_custom_test(test_conv_igemm_dynamic_small GFX908_DISABLED GFX90A_DISABLED SKIP_XNACK_ON - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 16 16 56 56 --weights 64 16 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 16 64 34 34 --weights 64 64 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 16 384 8 8 --weights 64 384 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data - COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data - COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights - COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights - ) -endif() #if CODECOV_TEST - -add_custom_test(test_conv_igemm_dynamic SKIP_UNLESS_ALL GFX908_DISABLED GFX90A_DISABLED SKIP_XNACK_ON -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 256 34 34 --weights 256 256 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 0 0 2 2 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 128 48 7 7 --weights 128 48 5 5 --pads_strides_dilations 2 2 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_ENVS} $ --verbose --input 128 128 17 17 --weights 128 128 1 7 --pads_strides_dilations 0 3 1 1 1 1 
--disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 128 256 28 28 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_1X1_ENVS} $ --verbose --input 128 768 17 17 --weights 128 768 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-backward-data --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 512 28 28 --weights 256 512 1 1 --pads_strides_dilations 0 0 2 2 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_WRW_ENVS} $ --verbose --input 64 512 14 14 --weights 256 512 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-data -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1 --disable-forward --disable-backward-weights -COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS} $ --verbose --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights -) - # gfx90a is disabled due to WORKAROUND_ISSUE_1187 add_custom_test(test_conv_igemm_dynamic_xdlops_bwd SKIP_UNLESS_ALL HALF_ENABLED GFX90A_DISABLED GFX94X_ENABLED GFX900_DISABLED GFX906_DISABLED SKIP_XNACK_ON COMMAND ${DYNAMIC_IMPLICITGEMM_BWD_ENVS_XDLOPS} $ ${MIOPEN_TEST_FLOAT_ARG} --verbose --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 --disable-forward --disable-backward-weights diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 61966d1888..4b7f2aaf08 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -35,7 +35,7 @@ function(add_gtest TEST_NAME) target_link_libraries(test_${TEST_NAME} gtest_main MIOpen ${Boost_LIBRARIES} hip::host $) endif() # Enable CMake to discover the test binary - gtest_discover_tests(test_${TEST_NAME} PROPERTIES ENVIRONMENT "MIOPEN_USER_DB_PATH=${CMAKE_CURRENT_BINARY_DIR};MIOPEN_TEST_ALL=${MIOPEN_TEST_ALL};MIOPEN_TEST_MLIR=${MIOPEN_TEST_MLIR};MIOPEN_TEST_COMPOSABLEKERNEL=${MIOPEN_TEST_COMPOSABLEKERNEL}") + gtest_discover_tests(test_${TEST_NAME} PROPERTIES ENVIRONMENT 
"MIOPEN_USER_DB_PATH=${CMAKE_CURRENT_BINARY_DIR};MIOPEN_TEST_FLOAT_ARG=${MIOPEN_TEST_FLOAT_ARG};MIOPEN_TEST_ALL=${MIOPEN_TEST_ALL};MIOPEN_TEST_MLIR=${MIOPEN_TEST_MLIR};MIOPEN_TEST_COMPOSABLEKERNEL=${MIOPEN_TEST_COMPOSABLEKERNEL}") endif() endfunction() diff --git a/test/gtest/conv_igemm_dynamic.cpp b/test/gtest/conv_igemm_dynamic.cpp new file mode 100644 index 0000000000..25a4e179c5 --- /dev/null +++ b/test/gtest/conv_igemm_dynamic.cpp @@ -0,0 +1,189 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include + +#include +#include +#include +#include "../conv2d.hpp" +#include "get_handle.hpp" + +using TestCase = std::tuple, std::string>; + +MIOPEN_DECLARE_ENV_VAR(MIOPEN_TEST_GPU_XNACK_ENABLED) + +static bool SkipTest(void) { return miopen::IsEnabled(MIOPEN_TEST_GPU_XNACK_ENABLED{}); } + +void GetArgs(const TestCase& param, std::vector& tokens) +{ + auto env_vars = std::get<0>(param); + for(auto& elem : env_vars) + { + putenv(elem.data()); + } + + auto cmd = std::get<1>(param); + + std::stringstream ss(cmd); + std::istream_iterator begin(ss); + std::istream_iterator end; + while(begin != end) + tokens.push_back(*begin++); +} + +class Conv2dFloat : public testing::TestWithParam> +{ +}; + +void Run2dDriver(miopenDataType_t prec) +{ + + std::vector params; + switch(prec) + { + case miopenFloat: params = Conv2dFloat::GetParam(); break; + case miopenHalf: + case miopenInt8: + case miopenBFloat16: + case miopenInt8x4: + case miopenInt32: + case miopenDouble: + case miopenFloat8: + case miopenBFloat8: + FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, " + "miopenDouble, miopenFloat8, miopenBFloat8 " + "data type not supported by conv_igemm_dynamic test"; + + default: params = Conv2dFloat::GetParam(); + } + + for(const auto& test_value : params) + { + std::vector tokens; + GetArgs(test_value, tokens); + std::vector ptrs; + + std::transform(tokens.begin(), + tokens.end(), + std::back_inserter(ptrs), + [](const std::string& str) { return str.data(); }); + + testing::internal::CaptureStderr(); + test_drive(ptrs.size(), ptrs.data()); + auto capture = testing::internal::GetCapturedStderr(); + std::cout << capture; + } +}; + +bool IsTestSupportedForDevice(const miopen::Handle& handle) +{ + std::string devName 
= handle.GetDeviceName(); + if(devName == "gfx900" || devName == "gfx906") + return true; + else + return false; +} + +TEST_P(Conv2dFloat, FloatTest) +{ + const auto& handle = get_handle(); + if(IsTestSupportedForDevice(handle) && !SkipTest()) + { + Run2dDriver(miopenFloat); + } + else + { + GTEST_SKIP(); + } +}; + +std::vector GetTestCases(const std::string& precision) +{ + + std::vector env = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicFwd"}; + std::vector env_1x1 = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicFwd_1x1"}; + std::vector env_wrw = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicWrw"}; + std::vector env_bwd = { + "MIOPEN_FIND_MODE=normal", + "MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvAsmImplicitGemmV4R1DynamicBwd"}; + + std::string v = " --verbose"; + std::string dis_bk_data = " --disable-backward-data"; + std::string dis_bk_wei = " --disable-backward-weights"; + std::string dis_fwd = " --disable-forward"; + std::string dis_vali = " --disable-validation"; + + const std::vector test_cases = { + // clang-format off +#if CODECOV_TEST + TestCase{env, precision + v + " --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1" + dis_bk_data + dis_bk_wei + dis_vali}, + TestCase{env_wrw, precision + v + " --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data + dis_vali}, + TestCase{env_bwd, precision + v + " --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei + dis_vali}, +#else + TestCase{env, precision + v + " --input 16 16 56 56 --weights 64 16 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 16 64 34 34 --weights 64 64 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 32 32 17 17 --weights 32 32 1 7 --pads_strides_dilations 0 3 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 16 384 8 8 --weights 64 384 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_wrw, precision + v + " --input 64 64 28 28 --weights 32 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_bwd, precision + v + " --input 64 64 28 28 --weights 16 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 16 128 36 36 --weights 32 128 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, +#endif + +#if MIOPEN_TEST_ALL + //SKIP_UNLESS_ALL + TestCase{env, precision + v + " --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 64 256 34 34 --weights 256 256 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 0 0 2 2 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env, precision + v + " --input 128 48 7 7 --weights 128 48 5 5 --pads_strides_dilations 2 2 1 1 1 1" + dis_bk_data + 
dis_bk_wei}, + TestCase{env, precision + v + " --input 128 128 17 17 --weights 128 128 1 7 --pads_strides_dilations 0 3 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 128 256 28 28 --weights 128 256 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 64 1536 8 8 --weights 256 1536 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_1x1, precision + v + " --input 128 768 17 17 --weights 128 768 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_bk_data + dis_bk_wei}, + TestCase{env_wrw, precision + v + " --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 64 512 28 28 --weights 256 512 1 1 --pads_strides_dilations 0 0 2 2 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_wrw, precision + v + " --input 64 512 14 14 --weights 256 512 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_data}, + TestCase{env_bwd, precision + v + " --input 64 64 56 56 --weights 256 64 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 32 128 34 34 --weights 64 128 3 3 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 128 128 35 35 --weights 128 128 3 3 --pads_strides_dilations 1 1 1 1 1 1" + dis_fwd + dis_bk_wei}, + TestCase{env_bwd, precision + v + " --input 128 256 56 56 --weights 64 256 1 1 --pads_strides_dilations 0 0 1 1 1 1" + dis_fwd + dis_bk_wei} +#endif + // clang-format on + }; + return test_cases; +} + +INSTANTIATE_TEST_SUITE_P(ConvIgemmDynamic, Conv2dFloat, testing::Values(GetTestCases("--float"))); From 1e325a7bebb11c629e39808008b67ffed69ee486 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Mon, 25 Sep 2023 23:15:09 -0700 Subject: [PATCH 12/36] Revert "cmake: enable finding installed ZStd library (#2362)" This reverts commit e608b4325646afeabb5e52846997b926d2019d19. --- cmake/Findzstd.cmake | 91 -------------------------------------------- src/CMakeLists.txt | 5 --- 2 files changed, 96 deletions(-) delete mode 100644 cmake/Findzstd.cmake diff --git a/cmake/Findzstd.cmake b/cmake/Findzstd.cmake deleted file mode 100644 index 43ea6f9b40..0000000000 --- a/cmake/Findzstd.cmake +++ /dev/null @@ -1,91 +0,0 @@ -################################################################################ -# -# MIT License -# -# Copyright (c) 2023 Advanced Micro Devices, Inc. 
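Two details of the conv_igemm_dynamic test added above are easy to miss. First, Conv2dFloat is parameterized over the entire std::vector of TestCase values, so testing::Values(GetTestCases("--float")) registers a single test instance whose body loops over every configuration, rather than one gtest case per configuration. Second, GetArgs applies the per-case environment with putenv(elem.data()) on a local copy of the strings; putenv(3) typically stores the caller's pointer rather than copying, so on such platforms a copying setter is the safer pattern. A hedged sketch of the copying alternative (assumes a POSIX environment and "KEY=VALUE" input; the helper name is illustrative):

#include <cstdlib>
#include <string>

inline void SetEnvVar(const std::string& kv)
{
    const auto eq = kv.find('=');
    if(eq == std::string::npos)
        return; // not a KEY=VALUE pair
    // setenv(3) copies both arguments, so temporaries are safe here.
    setenv(kv.substr(0, eq).c_str(), kv.substr(eq + 1).c_str(), 1 /*overwrite*/);
}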
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ - -# Try to find the zstd library -# -# If successful, the following variables will be defined: -# zstd_INCLUDE_DIR -# zstd_LIBRARY -# zstd_STATIC_LIBRARY -# zstd_FOUND -# -# Additionally, one of the following import targets will be defined: -# zstd::libzstd_shared -# zstd::libzstd_static - -if(MSVC) - set(zstd_STATIC_LIBRARY_SUFFIX "_static\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") -else() - set(zstd_STATIC_LIBRARY_SUFFIX "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") -endif() - -find_path(zstd_INCLUDE_DIR NAMES zstd.h) -find_library(zstd_LIBRARY NAMES zstd zstd_static) -find_library(zstd_STATIC_LIBRARY NAMES - zstd_static - "${CMAKE_STATIC_LIBRARY_PREFIX}zstd${CMAKE_STATIC_LIBRARY_SUFFIX}") - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args( - zstd DEFAULT_MSG - zstd_LIBRARY zstd_INCLUDE_DIR -) - -if(zstd_FOUND) - if(zstd_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$") - set(zstd_STATIC_LIBRARY "${zstd_LIBRARY}") - elseif (NOT TARGET zstd::libzstd_shared) - add_library(zstd::libzstd_shared SHARED IMPORTED) - if(MSVC) - # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library". 
- get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY) - string(REGEX REPLACE "${CMAKE_INSTALL_LIBDIR}$" "${CMAKE_INSTALL_BINDIR}" zstd_DIRNAME "${zstd_DIRNAME}") - get_filename_component(zstd_BASENAME "${zstd_LIBRARY}" NAME) - string(REGEX REPLACE "\\${CMAKE_LINK_LIBRARY_SUFFIX}$" "${CMAKE_SHARED_LIBRARY_SUFFIX}" zstd_BASENAME "${zstd_BASENAME}") - set_target_properties(zstd::libzstd_shared PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" - IMPORTED_LOCATION "${zstd_DIRNAME}/${zstd_BASENAME}" - IMPORTED_IMPLIB "${zstd_LIBRARY}") - unset(zstd_DIRNAME) - unset(zstd_BASENAME) - else() - set_target_properties(zstd::libzstd_shared PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" - IMPORTED_LOCATION "${zstd_LIBRARY}") - endif() - endif() - if(zstd_STATIC_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$" AND - NOT TARGET zstd::libzstd_static) - add_library(zstd::libzstd_static STATIC IMPORTED) - set_target_properties(zstd::libzstd_static PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" - IMPORTED_LOCATION "${zstd_STATIC_LIBRARY}") - endif() -endif() - -unset(zstd_STATIC_LIBRARY_SUFFIX) - -mark_as_advanced(zstd_INCLUDE_DIR zstd_LIBRARY zstd_STATIC_LIBRARY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffed2b4c8..f604a98c51 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -682,11 +682,6 @@ rocm_set_soversion(MIOpen ${MIOpen_SOVERSION}) clang_tidy_check(MIOpen) -find_package(zstd) -if(zstd_FOUND) - target_link_libraries(MIOpen PRIVATE zstd::libzstd_static) -endif() - function(target_internal_library TARGET) target_link_libraries(${TARGET} PRIVATE ${ARGN}) set(PASS_ARGS debug optimized) From 839249db9286fd6ec102ae5f9bb93d7f9a7cd0b2 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Tue, 26 Sep 2023 10:13:32 -0700 Subject: [PATCH 13/36] Revert "Revert "cmake: enable finding installed ZStd library (#2362)"" This reverts commit 1e325a7bebb11c629e39808008b67ffed69ee486. --- cmake/Findzstd.cmake | 91 ++++++++++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 5 +++ 2 files changed, 96 insertions(+) create mode 100644 cmake/Findzstd.cmake diff --git a/cmake/Findzstd.cmake b/cmake/Findzstd.cmake new file mode 100644 index 0000000000..43ea6f9b40 --- /dev/null +++ b/cmake/Findzstd.cmake @@ -0,0 +1,91 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +################################################################################ + +# Try to find the zstd library +# +# If successful, the following variables will be defined: +# zstd_INCLUDE_DIR +# zstd_LIBRARY +# zstd_STATIC_LIBRARY +# zstd_FOUND +# +# Additionally, one of the following import targets will be defined: +# zstd::libzstd_shared +# zstd::libzstd_static + +if(MSVC) + set(zstd_STATIC_LIBRARY_SUFFIX "_static\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") +else() + set(zstd_STATIC_LIBRARY_SUFFIX "\\${CMAKE_STATIC_LIBRARY_SUFFIX}$") +endif() + +find_path(zstd_INCLUDE_DIR NAMES zstd.h) +find_library(zstd_LIBRARY NAMES zstd zstd_static) +find_library(zstd_STATIC_LIBRARY NAMES + zstd_static + "${CMAKE_STATIC_LIBRARY_PREFIX}zstd${CMAKE_STATIC_LIBRARY_SUFFIX}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + zstd DEFAULT_MSG + zstd_LIBRARY zstd_INCLUDE_DIR +) + +if(zstd_FOUND) + if(zstd_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$") + set(zstd_STATIC_LIBRARY "${zstd_LIBRARY}") + elseif (NOT TARGET zstd::libzstd_shared) + add_library(zstd::libzstd_shared SHARED IMPORTED) + if(MSVC) + # IMPORTED_LOCATION is the path to the DLL and IMPORTED_IMPLIB is the "library". + get_filename_component(zstd_DIRNAME "${zstd_LIBRARY}" DIRECTORY) + string(REGEX REPLACE "${CMAKE_INSTALL_LIBDIR}$" "${CMAKE_INSTALL_BINDIR}" zstd_DIRNAME "${zstd_DIRNAME}") + get_filename_component(zstd_BASENAME "${zstd_LIBRARY}" NAME) + string(REGEX REPLACE "\\${CMAKE_LINK_LIBRARY_SUFFIX}$" "${CMAKE_SHARED_LIBRARY_SUFFIX}" zstd_BASENAME "${zstd_BASENAME}") + set_target_properties(zstd::libzstd_shared PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" + IMPORTED_LOCATION "${zstd_DIRNAME}/${zstd_BASENAME}" + IMPORTED_IMPLIB "${zstd_LIBRARY}") + unset(zstd_DIRNAME) + unset(zstd_BASENAME) + else() + set_target_properties(zstd::libzstd_shared PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" + IMPORTED_LOCATION "${zstd_LIBRARY}") + endif() + endif() + if(zstd_STATIC_LIBRARY MATCHES "${zstd_STATIC_LIBRARY_SUFFIX}$" AND + NOT TARGET zstd::libzstd_static) + add_library(zstd::libzstd_static STATIC IMPORTED) + set_target_properties(zstd::libzstd_static PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${zstd_INCLUDE_DIR}" + IMPORTED_LOCATION "${zstd_STATIC_LIBRARY}") + endif() +endif() + +unset(zstd_STATIC_LIBRARY_SUFFIX) + +mark_as_advanced(zstd_INCLUDE_DIR zstd_LIBRARY zstd_STATIC_LIBRARY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f604a98c51..4ffed2b4c8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -682,6 +682,11 @@ rocm_set_soversion(MIOpen ${MIOpen_SOVERSION}) clang_tidy_check(MIOpen) +find_package(zstd) +if(zstd_FOUND) + target_link_libraries(MIOpen PRIVATE zstd::libzstd_static) +endif() + function(target_internal_library TARGET) target_link_libraries(${TARGET} PRIVATE ${ARGN}) set(PASS_ARGS debug optimized) From c977e00d2a2b1a27e4734f980a3e857c7f5cba26 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:54:16 -0700 Subject: [PATCH 14/36] Bump cryptography from 41.0.3 to 41.0.4 in /docs/.sphinx (#2408) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.3 to 41.0.4. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/41.0.3...41.0.4) --- updated-dependencies: - dependency-name: cryptography dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index 7d0f7f499e..6f10fcce12 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -26,7 +26,7 @@ charset-normalizer==3.1.0 # via requests click==8.1.3 # via sphinx-external-toc -cryptography==41.0.3 +cryptography==41.0.4 # via pyjwt deprecated==1.2.13 # via pygithub @@ -92,7 +92,7 @@ requests==2.31.0 # via # pygithub # sphinx -rocm-docs-core>=0.24.0 +rocm-docs-core==0.24.2 # via -r requirements.in smmap==5.0.0 # via gitdb From b9e724a49324369dbdd9ee3a5d08eeb0495871b8 Mon Sep 17 00:00:00 2001 From: mentat <108366729+bghimireamd@users.noreply.github.com> Date: Tue, 26 Sep 2023 13:54:51 -0500 Subject: [PATCH 15/36] [SWDEV-416089][Doc] convolution API in MIOpen is restricted to alpha = 1.0 and beta = 0.0 (#2419) --- include/miopen/miopen.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index f0c0ce1aa6..a59181acf3 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -1715,6 +1715,8 @@ miopenFindConvolutionForwardAlgorithm(miopenHandle_t handle, * Runs the forward convolution layer based on the selected algorithm. The function * miopenFindConvolutionForwardAlgorithm() must have been executed previously to * determine the required memory needed for the workspace and the best convolutional algorithm. + * The scaling parameter alpha (float) and shift parameter beta (float) are only supported for + * alpha = 1 and beta = 0. * * If using Group/Depthwise convolution mode, call miopenSetConvolutionGroupCount() before running * this. @@ -1751,6 +1753,8 @@ MIOPEN_EXPORT miopenStatus_t miopenConvolutionForward(miopenHandle_t handle, /*! @brief Calculate element-wise scale and shift of a tensor via a bias tensor * * This function applies an element-wise bias to a data tensor from an input bias tensor. + * The scaling parameter alpha (float) and shift parameter beta (float) are only supported for + * alpha = 1 and beta = 0. * * @param handle MIOpen handle (input) * @param alpha Floating point scaling factor, allocated on the host (input) @@ -2018,6 +2022,8 @@ miopenConvolutionBackwardWeights(miopenHandle_t handle, /*! @brief Calculates the gradient with respect to the bias. * * Compute the convolution backwards gradient with respect to the bias tensor. + * The scaling parameter alpha (float) and shift parameter beta (float) are only supported for + * alpha = 1 and beta = 0. 
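In caller terms the documented restriction means both scale factors are passed as host-side floats fixed at 1 and 0. A minimal call sketch (descriptor and buffer setup omitted; algo, workSpace, and workSpaceSize are assumed to come from the preceding find step):

const float alpha = 1.0f; // scaling: only 1.0 is accepted
const float beta  = 0.0f; // shift: only 0.0 is accepted
miopenStatus_t rc = miopenConvolutionForward(handle,
                                             &alpha,
                                             xDesc, x,
                                             wDesc, w,
                                             convDesc, algo,
                                             &beta,
                                             yDesc, y,
                                             workSpace, workSpaceSize);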
* * @param handle MIOpen handle (input) * @param alpha Floating point scaling factor, allocated on the host (input) From 3413d2daaeb44b7d6eadcc03033a5954a118491e Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Wed, 27 Sep 2023 07:39:12 -0700 Subject: [PATCH 16/36] [HotFix] zstd dependency on multi Linux distributes (#2417) --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffed2b4c8..71289c8b42 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -684,7 +684,7 @@ clang_tidy_check(MIOpen) find_package(zstd) if(zstd_FOUND) - target_link_libraries(MIOpen PRIVATE zstd::libzstd_static) + target_link_libraries(MIOpen PRIVATE $,zstd::libzstd_shared,zstd::libzstd_static>) endif() function(target_internal_library TARGET) From 6d539ee81321121570606e4ef62e6d072775bbd8 Mon Sep 17 00:00:00 2001 From: Reid Kawaja <74506315+reidkwja@users.noreply.github.com> Date: Thu, 28 Sep 2023 13:32:20 -0400 Subject: [PATCH 17/36] [CI][Jenkins] Enable rebooting in CI stages for CI stages with GPU use (#2420) * conf_reboot * configs_chg --- Jenkinsfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 433f4e9622..e7ffb0da1b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -317,7 +317,7 @@ def buildHipClangJobAndReboot(Map conf=[:]){ throw e } finally{ - if (conf.get("needs_reboot", false)) { + if (conf.get("needs_reboot", true)) { reboot() } } @@ -544,7 +544,7 @@ pipeline { stage("HIP Package") { agent{ label rocmnode("nogpu") } steps{ - buildHipClangJobAndReboot( package_build: "true", needs_gpu:false) + buildHipClangJobAndReboot( package_build: "true", needs_gpu:false, needs_reboot:false) } } } @@ -561,7 +561,7 @@ pipeline { build_cmd = "make -j\$(nproc) -k analyze" } steps{ - buildHipClangJobAndReboot(setup_cmd: setup_cmd, build_cmd: build_cmd, needs_gpu:false) + buildHipClangJobAndReboot(setup_cmd: setup_cmd, build_cmd: build_cmd, needs_gpu:false, needs_reboot:false) } } stage('Clang Format') { @@ -588,7 +588,7 @@ pipeline { build_cmd = "make -j\$(nproc) " } steps{ - buildHipClangJobAndReboot(build_fin: "ON", needs_gpu:false, build_install: "true") + buildHipClangJobAndReboot(build_fin: "ON", needs_gpu:false, needs_reboot:false, build_install: "true") } } stage('Perf DB Validity Test') { @@ -598,7 +598,7 @@ pipeline { } steps{ - CheckPerfDbValid(setup_flags: fin_flags, config_targets: "all", build_fin: "ON", needs_gpu:false, build_install: "true") + CheckPerfDbValid(setup_flags: fin_flags, config_targets: "all", build_fin: "ON", needs_gpu:false, needs_reboot:false, build_install: "true") } } stage('HipNoGPU Debug Build Test') { @@ -612,7 +612,7 @@ pipeline { build_cmd = "make -j\$(nproc)" } steps{ - buildHipClangJob( build_type: 'debug', setup_flags: HipNoGPU_flags, build_cmd: build_cmd, needs_gpu:false) + buildHipClangJob( build_type: 'debug', setup_flags: HipNoGPU_flags, build_cmd: build_cmd, needs_gpu:false, needs_reboot:false) } } } From 2e2b37ac816813a5c05b97f3e1381cd3501a64a3 Mon Sep 17 00:00:00 2001 From: Chris Erb Date: Mon, 2 Oct 2023 19:25:17 -0500 Subject: [PATCH 18/36] [Bug Fixes] miopen_rocblas_gemm_ex3 call - invoker cache extra elements - conv direct naive input cast (#2414) * bugfixes miopen_rocblas_gemm_ex3 call would always throw error invoker cache adding extra elements conv direct naive yielding incorrect input cast for kernel arg * clear clang format issue --------- Co-authored-by: Jun Liu --- src/gemm_v2.cpp | 9 +++++---- src/invoker_cache.cpp | 7 
+++++-- src/solver/conv_direct_naive_conv.cpp | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index 804587aac0..19e302f166 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #if MIOPEN_BACKEND_HIP #include @@ -173,6 +174,7 @@ rocblas_status miopen_rocblas_gemm_ex3(const miopen::Handle& handle, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, flags); // gfx90a_alt_impl)); + return rb_status; #pragma clang diagnostic pop #endif MIOPEN_THROW(miopenStatusBadParm, "An appropriate version of rocBLAS is required for this op"); @@ -258,10 +260,9 @@ std::ostream& operator<<(std::ostream& stream, const GemmDescriptor& gemm_desc) << "strideC " << gemm_desc.strideC << ", " << "alpha " << gemm_desc.alpha << ", " << "beta " << gemm_desc.beta << ", " - << "dataType " << gemm_desc.dataType << "a_cast_type" << gemm_desc.a_cast_type - << ", " - << "b_cast_type" << gemm_desc.b_cast_type << ", " - << "} "; + << "dataType " << GetDataType(gemm_desc.dataType) << ", " + << "a_cast_type " << GetDataType(gemm_desc.a_cast_type) << ", " + << "b_cast_type " << GetDataType(gemm_desc.b_cast_type) << "} "; } #if MIOPEN_USE_ROCBLAS diff --git a/src/invoker_cache.cpp b/src/invoker_cache.cpp index f7df500ee5..937a1b2e08 100644 --- a/src/invoker_cache.cpp +++ b/src/invoker_cache.cpp @@ -104,8 +104,11 @@ void InvokerCache::Register(const Key& key, const Invoker& invoker) auto it = invokers.find(key.first); if(it != invokers.end()) it->second.invokers.insert({key.second, invoker}); - auto& item = invokers.insert({key.first, Item{}}).first->second; - item.invokers.insert({key.second, invoker}); + else + { + auto& item = invokers.insert({key.first, Item{}}).first->second; + item.invokers.insert({key.second, invoker}); + } MIOPEN_LOG_I2("Invoker registered for algorithm " << key.first << " and solver " << key.second); } diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 4df8df5874..5c468768fa 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -212,12 +212,12 @@ std::string ConvDirectNaiveConvCompileOption(const ExecutionContext& ctx, ss << " -DWEIGHTS_TYPE=" << miopen::GetDataType(problem.GetWeightsDataType()); ss << " -DOUTPUT_TYPE=" << miopen::GetDataType(ProblemInterpreter::GetOutputDataType(problem)); - const auto in_cast_type = problem.GetInCastType(); + const auto in_cast_type = ProblemInterpreter::GetInputCastType(problem); if(in_cast_type) ss << " -DINPUT_CAST_TYPE=" << miopen::GetDataType(*in_cast_type); const auto wei_cast_type = problem.GetWeightsCastType(); if(wei_cast_type) - ss << " -DWEIGHTS_CAST_TYPE=" << miopen::GetDataType(*(wei_cast_type)); + ss << " -DWEIGHTS_CAST_TYPE=" << miopen::GetDataType(*wei_cast_type); const auto out_cast_type = ProblemInterpreter::GetOutputCastType(problem); if(out_cast_type) ss << " -DOUTPUT_CAST_TYPE=" << miopen::GetDataType(*out_cast_type); From 2065081d68bf9c6cd7a71903942ff121d3483c8c Mon Sep 17 00:00:00 2001 From: Reid Kawaja <74506315+reidkwja@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:44:59 -0400 Subject: [PATCH 19/36] [CI][Jenkins] Disabling smoke stages for CI branch runs (#2422) --- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e7ffb0da1b..b6f2373ece 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -431,15 +431,15 @@ pipeline { description: "") booleanParam( name: "BUILD_SMOKE_FP32", - 
 defaultValue: true,
+ defaultValue: false,
 description: "")
 booleanParam(
 name: "BUILD_SMOKE_AUX1",
- defaultValue: true,
+ defaultValue: false,
 description: "")
 booleanParam(
 name: "BUILD_SMOKE_FP16_BF16_INT8",
- defaultValue: true,
+ defaultValue: false,
 description: "")
 booleanParam(
 name: "BUILD_FULL_TESTS",

From e8b4acf440c2800fd872d3450839c7b4c12907cc Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Tue, 3 Oct 2023 11:45:57 -0500
Subject: [PATCH 20/36] [Tests] disable solver ConvHipImplicitGemm3DGroupWrwXdlops
 on Vega10 (#2432)

---
 src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
index 6fce8a80b8..3c94374b4e 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
@@ -312,7 +312,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
     const std::string& arch = ctx.GetStream().GetDeviceName();
     if(miopen::StartsWith(arch, "gfx11") || miopen::StartsWith(arch, "gfx10"))
         return false;
-    if(arch == "gfx906")
+    if(arch == "gfx906" || arch == "gfx900")
         return false;
     switch(problem.GetInDataType())
     {

From 2065081d68bf9c6cd7a71903942ff121d3483c8c Mon Sep 17 00:00:00 2001
From: mentat <108366729+bghimireamd@users.noreply.github.com>
Date: Tue, 3 Oct 2023 17:19:26 -0500
Subject: [PATCH 21/36] [Dockerfile] Upgrade cmake so that MIOpen docker can
 compile Composable Kernel (#2424)

* upgrade cmake so that MIOpen docker can compile Composable Kernel
* pin the cmake version to 3.27.5

---
 Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 31a0334eeb..d958879d3c 100755
--- a/Dockerfile
+++ b/Dockerfile
@@ -106,6 +106,9 @@ RUN ccache -s
 ADD docs/.sphinx/requirements.txt /doc-requirements.txt
 RUN pip3 install -r /doc-requirements.txt
 
+# Composable Kernel requires this version of cmake
+RUN pip3 install --upgrade cmake==3.27.5
+
 # Use parallel job to accelerate tensile build
 # Workaround for Tensile with TargetID feature
 ARG USE_TARGETID="OFF"

From 1605ca8e0e0cd34f4f0930e8a7105f181a5214bd Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Tue, 3 Oct 2023 18:45:10 -0500
Subject: [PATCH 22/36] [Bug Fix] Compilation fix for -DMIOPEN_USE_ROCBLAS=Off
 (#2435)

---
 src/hip/handlehip.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hip/handlehip.cpp b/src/hip/handlehip.cpp
index b03f8cd190..ff6d27d26e 100644
--- a/src/hip/handlehip.cpp
+++ b/src/hip/handlehip.cpp
@@ -262,7 +262,7 @@ struct HandleImpl
         rhandle_pool.push_back(std::move(r_ptr));
     }
 #else
-    void add_stream(StreamPtr& s_ptr) { stream_pool.push_back(s_ptr); }
+    void add_stream(StreamPtr s_ptr) { stream_pool.push_back(s_ptr); }
 #endif
 
     // stream_pool used as cache for parallel streams created by MIOpen.
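The one-line change in patch 22 deserves a note: the call site in ReserveExtraStreamsInPool (just below) passes std::move(new_stream), and an rvalue cannot bind to the old non-const StreamPtr& parameter, so the MIOPEN_USE_ROCBLAS=Off path stopped compiling. A minimal standalone sketch of the by-value "sink" signature; the names are illustrative rather than MIOpen's, and the std::move into the vector is a refinement the patch itself does not make:

    #include <memory>
    #include <utility>
    #include <vector>

    using StreamPtr = std::shared_ptr<void>;

    struct StreamPool
    {
        std::vector<StreamPtr> pool;

        // A by-value parameter binds to lvalues and rvalues alike; the old
        // StreamPtr& signature rejected the std::move'd argument at the call site.
        void add_stream(StreamPtr s_ptr) { pool.push_back(std::move(s_ptr)); }
    };

    int main()
    {
        StreamPool p;
        StreamPtr s = std::make_shared<int>(0);
        p.add_stream(s);            // lvalue: copied into the pool
        p.add_stream(std::move(s)); // rvalue: would not compile with StreamPtr&
    }

The patch itself keeps push_back(s_ptr), which copies; either form compiles, the move merely saves one atomic refcount bump.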
StreamPtrPool stream_pool; @@ -362,7 +362,7 @@ void Handle::ReserveExtraStreamsInPool(int cnt) const auto new_rhandle = CreateRocblasHandle(new_stream.get()); this->impl->ms_resourse_ptr->add_resours(std::move(new_stream), std::move(new_rhandle)); #else - this->impl->ms_resourse_ptr->add_resours(std::move(new_stream)); + this->impl->ms_resourse_ptr->add_stream(std::move(new_stream)); #endif } } From c7e0d377ec8d0b7c348bd81d81a64a8d73fe3647 Mon Sep 17 00:00:00 2001 From: mentat <108366729+bghimireamd@users.noreply.github.com> Date: Thu, 5 Oct 2023 09:38:20 -0500 Subject: [PATCH 23/36] bg/lwpmiopen 193 : Integrate CK's batch norm backward training into non-tunable MIOpen solver (#2385) --- src/CMakeLists.txt | 2 + src/batch_norm_api.cpp | 7 - src/include/miopen/batchnorm/solvers.hpp | 20 + .../miopen/solver/implicitgemm_ck_util.hpp | 65 +- src/ocl/batchnormocl.cpp | 9 +- src/solver.cpp | 2 + src/solver/batchnorm/backward_ck.cpp | 251 ++++++ src/solver/batchnorm/forward_training_ck.cpp | 239 ++++++ test/bn_spatial_nhwc_test.cpp | 749 ------------------ test/fusionHost.hpp | 31 +- test/gtest/bn.hpp | 171 ++++ test/gtest/bn_bwd.cpp | 73 ++ test/gtest/bn_fwd_train.cpp | 73 ++ test/gtest/bn_infer.cpp | 8 +- test/gtest/bn_test_data.hpp | 223 +++++- test/gtest/test_operations.hpp | 35 + 16 files changed, 1164 insertions(+), 794 deletions(-) create mode 100644 src/solver/batchnorm/backward_ck.cpp create mode 100644 src/solver/batchnorm/forward_training_ck.cpp delete mode 100644 test/bn_spatial_nhwc_test.cpp create mode 100644 test/gtest/bn_bwd.cpp create mode 100644 test/gtest/bn_fwd_train.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 71289c8b42..abc0679a8a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -152,6 +152,7 @@ set( MIOpen_Source solver/activ/bwd_1.cpp solver/activ/fwd_0.cpp solver/activ/fwd_1.cpp + solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp solver/batchnorm/backward_spatial_multiple.cpp @@ -163,6 +164,7 @@ set( MIOpen_Source solver/batchnorm/forward_per_activation_fused.cpp solver/batchnorm/forward_spatial_multiple.cpp solver/batchnorm/forward_spatial_single.cpp + solver/batchnorm/forward_training_ck.cpp solver/conv_asm_1x1u.cpp solver/conv_asm_1x1u_bias_activ_fused.cpp solver/conv_asm_1x1u_stride2.cpp diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 03db138945..69454b185a 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -243,13 +243,6 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance) { - // bfloat16 not supported for batchnorm operation - if(miopen::deref(xDesc).GetType() == miopenBFloat16 || - miopen::deref(dyDesc).GetType() == miopenBFloat16 || - miopen::deref(dxDesc).GetType() == miopenBFloat16) - { - return miopenStatusNotImplemented; - } MIOPEN_LOG_FUNCTION(handle, bn_mode, diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index c7d050abeb..70d64bb204 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -142,6 +142,26 @@ struct BnCKFwdInference final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; +struct BnCKBwdBackward final : BatchnormSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) 
const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + +struct BnCKFwdTraining final : BatchnormSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; +}; + } // namespace batchnorm } // namespace solver diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index 8656bdbabc..318d970170 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -41,8 +41,10 @@ typename ConvPtrsType::iterator FindConvPtrByID(ConvPtrsType& conv_ptrs, }); } -template -std::vector FillValidKernelsIDs(const ProblemDescription& problem) +template +std::vector FillValidKernelsIDs(const ProblemDescriptionType& problem) { const auto args = CKArgsType{problem}; const auto conv_ptrs = DeviceOpType::GetInstances(); @@ -59,8 +61,10 @@ std::vector FillValidKernelsIDs(const ProblemDescription& problem) return valid_kernels; } -template -bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& kernel_id) +template +bool IsCKArgsSupported(const ProblemDescriptionType& problem, const std::string& kernel_id) { auto conv_ptrs = DeviceOpType::GetInstances(); auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); @@ -68,20 +72,25 @@ bool IsCKArgsSupported(const ProblemDescription& problem, const std::string& ker return (ptr_iter != conv_ptrs.end()) && CKArgsType{problem}.IsSupportedBy(*ptr_iter); } -template -bool IsCKApplicable(const ProblemDescription& problem) +template +bool IsCKApplicable(const ProblemDescriptionType& problem) { const auto args = CKArgsType{problem}; - if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; })) - return false; + // if(!std::all_of(args.strides.begin(), args.strides.end(), [](auto x) { return x == 1; })) + // return false; const auto ptrs = DeviceOpType::GetInstances(); return std::any_of( ptrs.begin(), ptrs.end(), [&args](auto& ptr) { return args.IsSupportedBy(ptr); }); } -template -ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::string& kernel_id) +template +ConvSolution InitInvokerFactory(const ProblemDescriptionType& problem, const std::string& kernel_id) { auto conv_ptrs = DeviceOpType::GetInstances(); auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); @@ -112,5 +121,41 @@ ConvSolution InitInvokerFactory(const ProblemDescription& problem, const std::st return result; } +template +ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem, + const std::string& kernel_id) +{ + auto conv_ptrs = DeviceOpType::GetInstances(); + auto ptr_iter = FindConvPtrByID(conv_ptrs, kernel_id); + + if(ptr_iter == conv_ptrs.end()) + return {miopenStatusInvalidValue}; + + ConvSolution result; + result.invoker_factory = + [ck_args = CKArgsType{problem}, + sh_conv_ptr = std::shared_ptr{std::move(*ptr_iter)}](const std::vector&) mutable { + return [ck_args = std::move(ck_args), sh_conv_ptr = std::move(sh_conv_ptr)]( + const Handle& handle, const AnyInvokeParams& primitive_parameters) { + const auto& data_ctx = primitive_parameters.CastTo(); + auto argument_ptr = ck_args.MakeArgPtr(sh_conv_ptr, data_ctx); 
+ auto invoker_ptr = sh_conv_ptr->MakeInvokerPointer(); + + const auto enable_profiling = handle.IsProfilingEnabled(); + float elapsed_time = + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); + if(enable_profiling) + { + handle.ResetKernelTime(); + handle.AccumKernelTime(elapsed_time); + } + }; + }; + return result; +} + } // namespace solver } // namespace miopen diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 6c8a079a2a..6147a827b8 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -131,7 +131,8 @@ void BatchNormForwardTraining(Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; @@ -300,7 +301,7 @@ void BatchNormBackward(Handle& handle, { MIOPEN_THROW(miopenStatusBadParm); } - if(dxDesc.GetType() != dyDesc.GetType() || dyDesc.GetType() != xDesc.GetType()) + if(dxDesc.GetType() != dyDesc.GetType()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -338,7 +339,6 @@ void BatchNormBackward(Handle& handle, tmp.dx = dx; tmp.bnScale = bnScale; tmp.resultBnScaleDiff = resultBnScaleDiff; - tmp.resultBnScaleDiff = resultBnScaleDiff; tmp.resultBnBiasDiff = resultBnBiasDiff; tmp.epsilon = epsilon; tmp.savedMean = savedMean; @@ -346,7 +346,8 @@ void BatchNormBackward(Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; diff --git a/src/solver.cpp b/src/solver.cpp index d83935e646..4cd680dd9c 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -569,6 +569,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) RegisterWithSolver( registry, ++id, ConvHipImplicitGemm3DGroupBwdXdlops{}, miopenConvolutionAlgoImplicitGEMM); Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdInference{}.SolverDbId()); + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKBwdBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::Batchnorm, batchnorm::BnCKFwdTraining{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp new file mode 100644 index 0000000000..fba8724990 --- /dev/null +++ b/src/solver/batchnorm/backward_ck.cpp @@ -0,0 +1,251 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
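For readers new to the invoker machinery this patch builds on, the InitAnyInvokerFactory helper above is a nested-lambda, type-erased factory: the outer lambda is built once per solution and owns the selected CK device-op through a shared_ptr, while the inner lambda is the cheap per-call invoker. A self-contained sketch of that shape, with every name an illustrative stand-in rather than a MIOpen or CK API:

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <string>

    struct InvokeParams { int value; };

    using Invoker        = std::function<void(const InvokeParams&)>;
    using InvokerFactory = std::function<Invoker()>;

    // The device-op outlives both lambdas because the shared_ptr is captured
    // by copy, mirroring how sh_conv_ptr is captured above.
    InvokerFactory MakeInvokerFactory(std::shared_ptr<std::string> device_op)
    {
        return [device_op]() -> Invoker {
            return [device_op](const InvokeParams& params) {
                std::cout << "running " << *device_op << " with value " << params.value << '\n';
            };
        };
    }

    int main()
    {
        auto factory = MakeInvokerFactory(std::make_shared<std::string>("bn_instance_0"));
        Invoker run  = factory(); // built once when the solution is constructed
        run(InvokeParams{42});    // called on every API invocation
    }

Capturing the shared_ptr by copy in both lambdas is what lets the returned invoker outlive both the factory and the enclosing solver object.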
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#include +#include +#endif +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_BACK) + +namespace miopen { +namespace solver { +namespace batchnorm { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using index_t = int32_t; + +constexpr index_t Rank = 4; +constexpr index_t NumBatchNormReduceDim = 3; + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; +using BF16 = ushort; + +template +using DeviceOpBNBwdPtrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormBwd>; + +struct CKArgsBNormBwd +{ + CKArgsBNormBwd(const miopen::batchnorm::ProblemDescription& problem) + { + std::copy(problem.GetXDesc().GetLengths().begin(), + problem.GetXDesc().GetLengths().end(), + lens.begin()); + + std::copy(problem.GetXDesc().GetStrides().begin(), + problem.GetXDesc().GetStrides().end(), + strides.begin()); + arrScaleBiasMeanVarLengths[0] = lens[1]; // get channel + arrScaleBiasMeanVarStrides[0] = 1; + + // prep for CK + std::sort(strides.begin(), strides.end(), std::greater<>()); + std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end()); + } + + CKArgsBNormBwd(const CKArgsBNormBwd&) = default; + CKArgsBNormBwd(CKArgsBNormBwd&&) = default; + CKArgsBNormBwd& operator=(const CKArgsBNormBwd&) = default; + + template + auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const + { + return invoker_ptr->MakeArgumentPointer(lens, + strides, + strides, + strides, + reduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + data_ctx.x, + data_ctx.dy, + data_ctx.bnScale, + data_ctx.savedMean, + data_ctx.savedInvVariance, + epsilon, + PassThrough{}, + data_ctx.dx, + data_ctx.resultBnScaleDiff, + data_ctx.resultBnBiasDiff); + } + + template + bool IsSupportedBy(const ConvPtr& invoker_ptr) const + { + auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::BwdInvokeParams{}); + return invoker_ptr->IsSupportedArgument(arg_ptr.get()); + } + + std::array lens; // inOutLengths + std::array strides; // inOutStrides + std::vector invariantDims; + + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + + double epsilon = 1e-5; + std::array reduceDims{0, 1, 2}; +}; + +template +static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) +{ + return IsCKApplicable, + CKArgsBNormBwd>(problem); +} + +#endif + +bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = ctx; + std::ignore = fdesc_problem; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_BACK{})) + return false; + if(!bn_problem.IsLayoutNHWC()) + return false; + if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) + return false; + if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) + return false; + + switch(bn_problem.GetXDesc().GetType()) + { + case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenDouble: return CheckCKApplicability(bn_problem); + case miopenHalf: return CheckCKApplicability(bn_problem); + case miopenBFloat16: + return 
CheckCKApplicability(bn_problem); + case miopenInt32: + case miopenInt8: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: MIOPEN_THROW("Unsupported datatype"); + } + return false; +#endif +} + +template +ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem) +{ + const auto& valid_kernel_ids = FillValidKernelsIDs, + CKArgsBNormBwd>(bn_problem); + assert(!valid_kernel_ids.empty()); + const auto& kernel_id = valid_kernel_ids[0]; + return InitAnyInvokerFactory, + CKArgsBNormBwd, + miopen::batchnorm::BwdInvokeParams>(bn_problem, kernel_id); +} + +ConvSolution BnCKBwdBackward::GetSolution( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(bn_problem.GetXDesc().GetType()) + { + + case miopenFloat: return MakeAnyInvokerFactory(bn_problem); + case miopenDouble: return MakeAnyInvokerFactory(bn_problem); + case miopenHalf: return MakeAnyInvokerFactory(bn_problem); + case miopenBFloat16: + return MakeAnyInvokerFactory(bn_problem); + case miopenInt8: + case miopenInt32: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: + MIOPEN_THROW(miopenStatusInternalError, "BnCKBwdBackward operation not for this data type"); + } +#endif + return {}; +} + +} // namespace batchnorm +} // namespace solver +} // namespace miopen diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp new file mode 100644 index 0000000000..a65cec14a9 --- /dev/null +++ b/src/solver/batchnorm/forward_training_ck.cpp @@ -0,0 +1,239 @@ + +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
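The "prep for CK" block in the CKArgsBNormBwd constructor above is worth a worked example: MIOpen passes lengths in N,C,H,W dimension order with NHWC-layout strides, while CK wants the dimensions presented in memory order. Sorting the strides descending and rotating C to the back lines the two up. A runnable check using one Network1 shape (N=16, C=8, H=128, W=256; the stride values are hand-computed for NHWC, and everything here is local to the sketch):

    #include <algorithm>
    #include <array>
    #include <functional>
    #include <iostream>

    int main()
    {
        // One Network1 shape: N=16, C=8, H=128, W=256, stored NHWC.
        std::array<int, 4> lens{16, 8, 128, 256};       // dim order N,C,H,W
        std::array<int, 4> strides{262144, 1, 2048, 8}; // NHWC strides for those dims

        // As in CKArgsBNormBwd: descending strides put the fastest-moving
        // dimension (C) last...
        std::sort(strides.begin(), strides.end(), std::greater<>());
        // ...and the rotate permutes the lengths from N,C,H,W to N,H,W,C to match.
        std::rotate(lens.begin() + 1, lens.begin() + 2, lens.end());

        for(int v : lens) std::cout << v << ' ';     // 16 128 256 8
        std::cout << '\n';
        for(int v : strides) std::cout << v << ' ';  // 262144 2048 8 1
        std::cout << '\n';
    }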
+ * + *******************************************************************************/ + +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#include +#include +#endif +MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING) + +namespace miopen { +namespace solver { +namespace batchnorm { +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; +using index_t = int32_t; + +constexpr index_t Rank = 4; +constexpr index_t NumBatchNormReduceDim = 3; + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; +using BF16 = ushort; + +template +using DeviceOpBNFwdTrainingPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormFwd>; + +struct CKArgsBNormFwdTraining +{ + CKArgsBNormFwdTraining(const miopen::batchnorm::ProblemDescription& problem) + { + std::copy(problem.GetXDesc().GetLengths().begin(), + problem.GetXDesc().GetLengths().end(), + xyLengths.begin()); + + std::copy(problem.GetXDesc().GetStrides().begin(), + problem.GetXDesc().GetStrides().end(), + xyStrides.begin()); + arrScaleBiasMeanVarLengths[0] = xyLengths[1]; // get channel + arrScaleBiasMeanVarStrides[0] = 1; + + // prep for CK + std::sort(xyStrides.begin(), xyStrides.end(), std::greater<>()); + std::rotate(xyLengths.begin() + 1, xyLengths.begin() + 2, xyLengths.end()); + } + + CKArgsBNormFwdTraining(const CKArgsBNormFwdTraining&) = default; + CKArgsBNormFwdTraining(CKArgsBNormFwdTraining&&) = default; + CKArgsBNormFwdTraining& operator=(const CKArgsBNormFwdTraining&) = default; + + template + auto MakeArgPtr(const InvokerPtr& invoker_ptr, const InvokerParams& data_ctx) const + { + return invoker_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + data_ctx.x, + data_ctx.bnScale, + data_ctx.bnBias, + data_ctx.epsilon, + PassThroughOp{}, + data_ctx.y, + data_ctx.resultSaveMean, + data_ctx.resultSaveInvVariance, + data_ctx.expAvgFactor, + data_ctx.resultRunningMean, + data_ctx.resultRunningVariance); + } + + template + bool IsSupportedBy(const ConvPtr& invoker_ptr) const + { + auto arg_ptr = MakeArgPtr(invoker_ptr, miopen::batchnorm::InvokeParams{}); + return invoker_ptr->IsSupportedArgument(arg_ptr.get()); + } + + std::array xyLengths; + std::array xyStrides; + std::vector invariantDims; + + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + + std::array reduceDims{0, 1, 2}; +}; + +template +static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& problem) +{ + return IsCKApplicable, + CKArgsBNormFwdTraining>(problem); +} +#endif + +bool BnCKFwdTraining::IsApplicable(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL + std::ignore = context; + std::ignore = fdesc_problem; + return false; +#else + if(miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING{})) + return false; + if(!bn_problem.IsLayoutNHWC()) + return false; + if(!ck_utility::is_ck_supported_hardware(context.GetStream())) + return false; + + switch(bn_problem.GetXDesc().GetType()) + { + case miopenHalf: return CheckCKApplicability(bn_problem); + case miopenFloat: return CheckCKApplicability(bn_problem); + case miopenDouble: return CheckCKApplicability(bn_problem); + case 
miopenBFloat16: return CheckCKApplicability(bn_problem); + case miopenInt32: + case miopenInt8: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: MIOPEN_THROW("BnCKFwdTraining operation does not supprot this data type"); + } + return false; +#endif +} + +template +ConvSolution MakeAnyInvokerFactory(const miopen::batchnorm::ProblemDescription& bn_problem) +{ + const auto& valid_kernel_ids = FillValidKernelsIDs, + CKArgsBNormFwdTraining>(bn_problem); + assert(!valid_kernel_ids.empty()); + const auto& kernel_id = valid_kernel_ids[0]; + return InitAnyInvokerFactory, + CKArgsBNormFwdTraining, + miopen::batchnorm::InvokeParams>(bn_problem, kernel_id); +} + +ConvSolution BnCKFwdTraining::GetSolution( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::batchnorm::ProblemDescription& bn_problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(bn_problem.GetXDesc().GetType()) + { + + case miopenFloat: return MakeAnyInvokerFactory(bn_problem); + case miopenDouble: return MakeAnyInvokerFactory(bn_problem); + case miopenHalf: return MakeAnyInvokerFactory(bn_problem); + case miopenBFloat16: return MakeAnyInvokerFactory(bn_problem); + case miopenInt8: + case miopenInt32: + case miopenInt8x4: + case miopenBFloat8: + case miopenFloat8: + default: + MIOPEN_THROW(miopenStatusInternalError, "BnCKFwdTraining operation not for this data type"); + } +#endif + return {}; +} + +} // namespace batchnorm +} // namespace solver +} // namespace miopen diff --git a/test/bn_spatial_nhwc_test.cpp b/test/bn_spatial_nhwc_test.cpp deleted file mode 100644 index abca57e7ce..0000000000 --- a/test/bn_spatial_nhwc_test.cpp +++ /dev/null @@ -1,749 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
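Both new solvers gate IsApplicable in the same order: environment kill switch first, then layout, then hardware, then data type. A hedged sketch of that skeleton follows; std::getenv stands in for miopen::IsDisabled, whose real implementation parses 0/1/yes/no values, so treating "set at all" as disabled is a simplification made only for this sketch:

    #include <cstdlib>

    // Stand-in for miopen::IsDisabled(MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING{}).
    static bool IsDisabledByEnv(const char* name) { return std::getenv(name) != nullptr; }

    // Sketch of the gating order used by BnCKFwdTraining::IsApplicable.
    bool IsApplicableSketch(bool is_nhwc, bool is_ck_supported_hw)
    {
        if(IsDisabledByEnv("MIOPEN_DEBUG_CONV_CK_BN_FWD_TRAINING"))
            return false; // user veto always wins
        if(!is_nhwc)
            return false; // these CK batch-norm solvers are NHWC-only
        if(!is_ck_supported_hw)
            return false; // hardware rejections happen here
        return true;      // the per-data-type CheckCKApplicability switch follows
    }

    int main() { return IsApplicableSketch(true, true) ? 0 : 1; }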
- * - *******************************************************************************/ - -#include "driver.hpp" -#include "get_handle.hpp" -#include "tensor_holder.hpp" -#include "test.hpp" -#include "verify.hpp" -#include "random.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MIO_BN_TEST_EXPAVGFACTOR 0.1 -#define MIO_BN_TEST_EPSILON 1e-5 -#define MIO_BN_USE_MIX_PREC 1 -#if MIO_BN_USE_MIX_PREC == 1 -#define PREC_TYPE float -#else -#define PREC_TYPE T -#endif - -template -struct verify_forward_train_bn_spatial -{ - const tensor input; - const tensor scale; - const tensor shift; - - std::tuple, tensor, tensor, tensor, tensor> cpu() const - { - double epsilon = MIO_BN_TEST_EPSILON; - double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - - std::size_t rs_n_batch, rs_channels, rs_height, rs_width; - auto derivedBnDesc = - miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - tensor runMean; - tensor runVar; - if(input.desc.GetType() == miopenFloat) - { - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - } - else - { - prng::reset_seed(); - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - const U Data_scale = static_cast(0.001); - for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) - { - runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - runVar[i] = prng::gen_descreet_unsigned(Data_scale, 100); - } - } - auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto out = input; - std::fill(out.begin(), out.end(), 0); - - const auto nhw = double(height * width * n_batch); - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - double variance_accum = 0.; - double mean_accum = 0.; - double invVar = 0.; - double newRunMean = 0.; - double adjust = 0.; - - std::vector variance_accum_arr(height, 0.0); - std::vector mean_accum_arr(height, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - mean_accum_arr[row] += input(bidx, cidx, row, column); - } - } - } - for(std::size_t i = 0; i < height; i++) - mean_accum += mean_accum_arr[i]; - - mean_accum /= nhw; - - elemStd = 0.; - variance_accum = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - out(bidx, cidx, row, column) = elemStd = - input(bidx, cidx, row, column) - mean_accum; - variance_accum_arr[row] += elemStd * elemStd; - } - } - } - for(std::size_t i = 0; i < height; i++) - variance_accum += variance_accum_arr[i]; - - variance_accum /= nhw; - invVar = 1.0 / sqrt(variance_accum + epsilon); - - for(std::size_t bidx = 0; bidx < 
n_batch; bidx++) - { - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - out(bidx, cidx, row, column) = - scale(0, 0, 0, cidx) * (invVar * out(bidx, cidx, row, column)) + - shift(0, 0, 0, cidx); - } - } - } - - saveMean(0, 0, 0, cidx) = mean_accum; - saveInvVar(0, 0, 0, cidx) = invVar; - - newRunMean = runMean(0, 0, 0, cidx) * (1 - expAvgFactor); - runMean(0, 0, 0, cidx) = mean_accum * expAvgFactor + newRunMean; - adjust = (n_batch * height * width == 1) ? variance_accum - : (nhw / (nhw - 1)) * variance_accum; - runVar(0, 0, 0, cidx) = - (1 - expAvgFactor) * runVar(0, 0, 0, cidx) + expAvgFactor * adjust; - }); - - return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); - } - - std::tuple, tensor, tensor, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); - - auto out = input; - std::fill(out.begin(), out.end(), 0); - - std::size_t rs_n_batch, rs_channels, rs_height, rs_width; - auto derivedBnDesc = - miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(rs_n_batch, rs_height, rs_width, rs_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - tensor runMean; - tensor runVar; - if(input.desc.GetType() == miopenFloat) - { - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}.generate( - tensor_elem_gen_integer{17}); - } - else - { - prng::reset_seed(); - runMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - runVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - const U Data_scale = static_cast(0.001); - for(std::size_t i = 0; i < runMean.desc.GetElementSize(); i++) - { - runMean[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - runVar[i] = prng::gen_descreet_unsigned(Data_scale, 100); - } - } - - auto saveMean = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - auto saveInvVar = tensor{rs_n_batch, rs_height, rs_width, rs_channels}; - - auto in_dev = handle.Write(input.data); - auto scale_dev = handle.Write(scale.data); - auto shift_dev = handle.Write(shift.data); - - auto runMean_dev = handle.Write(runMean.data); - auto runVar_dev = handle.Write(runVar.data); - auto saveMean_dev = handle.Create(channels); - auto saveInvVar_dev = handle.Create(channels); - auto out_dev = handle.Create(n_batch * channels * height * width); - - double epsilon = MIO_BN_TEST_EPSILON; - double expAvgFactor = MIO_BN_TEST_EXPAVGFACTOR; - - float alpha = 1.0; - float beta = 0.0; - - miopen::BatchNormForwardTraining(handle, - miopenBNSpatial, - &alpha, - &beta, - input.desc, - in_dev.get(), - out.desc, - out_dev.get(), - scale.desc, - scale_dev.get(), - shift_dev.get(), - expAvgFactor, - runMean_dev.get(), - runVar_dev.get(), - epsilon, - saveMean_dev.get(), - saveInvVar_dev.get()); - - saveMean.data = handle.Read(saveMean_dev, saveMean.data.size()); - saveInvVar.data = handle.Read(saveInvVar_dev, saveInvVar.data.size()); - runMean.data = handle.Read(runMean_dev, runMean.data.size()); - runVar.data = handle.Read(runVar_dev, runVar.data.size()); - out.data = handle.Read(out_dev, out.data.size()); - - return std::make_tuple(out, runMean, runVar, saveMean, saveInvVar); - } - - void fail(int badtensor) const - { - std::cout << "Forward Train Spatial Batch 
Normalization: " << std::endl; - std::cout << "Input tensor: " << input.desc.ToString() << std::endl; - - switch(badtensor) - { - case(0): std::cout << "Output tensor output failed verification." << std::endl; break; - case(1): std::cout << "Running Mean output tensor failed verification." << std::endl; break; - case(2): - std::cout << "Running Variance output tensor failed verification." << std::endl; - break; - case(3): std::cout << "Saved Mean tensor failed verification." << std::endl; break; - case(4): std::cout << "Saved Variance tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct verify_backward_bn_spatial_recalc -{ - const tensor x_input; - const tensor dy_input; - const tensor scale; - - std::tuple, tensor, tensor> cpu() const - { - double epsilon = MIO_BN_TEST_EPSILON; - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - const auto nhw = double(height * width * n_batch); - - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = 0.; - double invVar = 0.; - double dyelem = 0.; - double variance = 0.; - - std::vector xhat(height * width * n_batch, 0.0); - std::vector variance_accum_arr(height, 0.0); - std::vector mean_accum_arr(height, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - mean_accum_arr[row] += x_input(bidx, cidx, row, column); - } - } - } - for(std::size_t i = 0; i < height; i++) - mean += mean_accum_arr[i]; - - mean /= nhw; - - elemStd = 0.; - variance = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - elemStd = x_input(bidx, cidx, row, column) - mean; - variance_accum_arr[row] += elemStd * elemStd; - } - } - } - for(std::size_t i = 0; i < height; i++) - variance += variance_accum_arr[i]; - - variance /= nhw; - invVar = 1. 
/ double(sqrt(variance + epsilon)); - - dscale(0, cidx, 0, 0) = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - elemStd = x_input(bidx, cidx, row, column) - mean; - xhat[xhat_index] = elemStd * invVar; - dyelem = dy_input(bidx, cidx, row, column); - dshift_accum_arr[row] += dyelem; - dscale_accum_arr[row] += xhat[xhat_index] * dyelem; - } - } - } - for(std::size_t i = 0; i < height; i++) - { - dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; - dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; - } - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - - double tmp1 = - nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); - } - } - } - }); - - return std::make_tuple(dx_out, dscale, dshift); - } - - std::tuple, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - float alpha = 1.0; - float beta = 0.0; - - auto xin_dev = handle.Write(x_input.data); - auto dyin_dev = handle.Write(dy_input.data); - auto scale_dev = handle.Write(scale.data); - auto dscale_dev = handle.Write(dscale.data); - auto dshift_dev = handle.Write(dshift.data); - auto dx_out_dev = handle.Write(dx_out.data); - - double epsilon = MIO_BN_TEST_EPSILON; - - miopen::BatchNormBackward(handle, - miopenBNSpatial, - &alpha, - &beta, - &alpha, - &beta, - x_input.desc, - xin_dev.get(), - dy_input.desc, - dyin_dev.get(), - dx_out.desc, - dx_out_dev.get(), - scale.desc, - scale_dev.get(), - dscale_dev.get(), - dshift_dev.get(), - epsilon, - nullptr, - nullptr); - - dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); - dscale.data = handle.Read(dscale_dev, dscale.data.size()); - dshift.data = handle.Read(dshift_dev, dshift.data.size()); - - return std::make_tuple(dx_out, dscale, dshift); - } - - void fail(int badtensor) const - { - std::cout << "Backward Batch Spatial Normalization Recalc Mean and Variance: " << std::endl; - std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; - std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; - switch(badtensor) - { - case(0): - std::cout << "Delta X output tensor output failed verification." << std::endl; - break; - case(1): std::cout << "Delta scale output tensor failed verification." 
<< std::endl; break; - case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct verify_backward_bn_spatial_use_saved -{ - const tensor x_input; - const tensor dy_input; - const tensor scale; - const tensor savedMean; - const tensor savedInvVar; - std::tuple, tensor, tensor> cpu() const - { - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - const auto nhw = double(height * width * n_batch); - - par_for(channels, 1, [&](int cidx) { - double elemStd = 0.; - unsigned int xhat_index; - double mean = savedMean(0, 0, 0, cidx); - double invVar = savedInvVar(0, 0, 0, cidx); - double dyelem = 0.; - - std::vector xhat(n_batch * height * width, 0.0); - std::vector dshift_accum_arr(height, 0.0); - std::vector dscale_accum_arr(height, 0.0); - dscale(0, cidx, 0, 0) = 0.; - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - elemStd = x_input(bidx, cidx, row, column) - mean; - xhat[xhat_index] = elemStd * invVar; - dyelem = dy_input(bidx, cidx, row, column); - dshift_accum_arr[row] += dyelem; - dscale_accum_arr[row] += xhat[xhat_index] * dyelem; - } - } - } - for(std::size_t i = 0; i < height; i++) - { - dshift(0, cidx, 0, 0) += dshift_accum_arr[i]; - dscale(0, cidx, 0, 0) += dscale_accum_arr[i]; - } - - for(std::size_t row = 0; row < height; row++) - { - for(std::size_t column = 0; column < width; column++) - { - for(std::size_t bidx = 0; bidx < n_batch; bidx++) - { - xhat_index = height * width * bidx + (width * row + column); - - double tmp1 = - nhw * dy_input(bidx, cidx, row, column) - dshift(0, cidx, 0, 0); - double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, 0, 0, cidx) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = tmp3 * (tmp2 + tmp1); - } - } - } - }); - - return std::make_tuple(dx_out, dscale, dshift); - } - - std::tuple, tensor, tensor> gpu() const - { - auto&& handle = get_handle(); - - std::size_t n_batch, channels, height, width; - std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); - - auto dx_out = dy_input; - std::fill(dx_out.begin(), dx_out.end(), 0); - - std::size_t ss_n_batch, ss_channels, ss_height, ss_width; - auto derivedBnDesc = - miopen::TensorDescriptor(x_input.desc.GetType(), - std::vector{1, 1, 1, channels}, - std::vector{channels, channels, channels, 1}); - std::tie(ss_n_batch, ss_height, ss_width, ss_channels) = - miopen::tien<4>(derivedBnDesc.GetLengths()); - - auto dscale = tensor{ss_n_batch, ss_channels, ss_height, ss_width}; - std::fill(dscale.begin(), dscale.end(), 0); - - auto dshift = tensor{ss_n_batch, ss_channels, 
ss_height, ss_width}; - std::fill(dshift.begin(), dshift.end(), 0); - - float alpha = 1.0; - float beta = 0.0; - - auto xin_dev = handle.Write(x_input.data); - auto dyin_dev = handle.Write(dy_input.data); - auto scale_dev = handle.Write(scale.data); - auto dscale_dev = handle.Write(dscale.data); - auto dshift_dev = handle.Write(dshift.data); - auto dx_out_dev = handle.Write(dx_out.data); - auto savedMean_dev = handle.Write(savedMean.data); - auto savedInvVar_dev = handle.Write(savedInvVar.data); - - double epsilon = MIO_BN_TEST_EPSILON; - - miopen::BatchNormBackward(handle, - miopenBNSpatial, - &alpha, - &beta, - &alpha, - &beta, - x_input.desc, - xin_dev.get(), - dy_input.desc, - dyin_dev.get(), - dx_out.desc, - dx_out_dev.get(), - scale.desc, - scale_dev.get(), - dscale_dev.get(), - dshift_dev.get(), - epsilon, - savedMean_dev.get(), - savedInvVar_dev.get()); - - dx_out.data = handle.Read(dx_out_dev, dx_out.data.size()); - dscale.data = handle.Read(dscale_dev, dscale.data.size()); - dshift.data = handle.Read(dshift_dev, dshift.data.size()); - - return std::make_tuple(dx_out, dscale, dshift); - } - - void fail(int badtensor) const - { - std::cout << "Backward Batch Spatial Normalization Use Saved Mean and Variance: " - << std::endl; - std::cout << "X Input tensor: " << x_input.desc.ToString() << std::endl; - std::cout << "Delta Y Input tensor: " << dy_input.desc.ToString() << std::endl; - switch(badtensor) - { - case(0): - std::cout << "Delta X output tensor output failed verification." << std::endl; - break; - case(1): std::cout << "Delta scale output tensor failed verification." << std::endl; break; - case(2): std::cout << "Delta shift output tensor failed verification." << std::endl; break; - default: break; - } - } -}; - -template -struct batch_norm_spatial_nhwc_driver : test_driver -{ - tensor input; - tensor scale; - tensor shift; - batch_norm_spatial_nhwc_driver() - { - this->batch_factor = 4; - add(input, - "input", - get_bn_spatial_input_tensor( - tensor_elem_gen_integer{miopen_type{} == miopenHalf ? 
5 : 17})); - } - - void run() - { - std::size_t n, c, h, w; - std::tie(n, c, h, w) = miopen::tien<4>(input.desc.GetLengths()); - - std::size_t ssn, ssc, ssh, ssw; - auto derivedBnDesc = miopen::TensorDescriptor(input.desc.GetType(), - std::vector{1, 1, 1, c}, - std::vector{c, c, c, 1}); - std::tie(ssn, ssh, ssw, ssc) = miopen::tien<4>(derivedBnDesc.GetLengths()); - - std::vector new_len = input.desc.GetLengths(); - std::vector new_str; - miopen::tensor_layout_to_strides(new_len, "NCHW", "NHWC", new_str); - input.desc = miopen::TensorDescriptor(miopen_type{}, new_len, new_str); - - if(input.desc.GetType() == miopenFloat) - { - scale = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); - shift = tensor{ssn, ssh, ssw, ssc}.generate(tensor_elem_gen_integer{17}); - } - else - { - scale = tensor{ssn, ssh, ssw, ssc}; - shift = tensor{ssn, ssh, ssw, ssc}; - - const PREC_TYPE Data_scale = static_cast(1e-4); - for(std::size_t i = 0; i < scale.desc.GetElementSize(); i++) - { - scale[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - shift[i] = prng::gen_descreet_uniform_sign(Data_scale, 100); - } - for(std::size_t i = 0; i < input.desc.GetElementSize(); i++) - { - input[i] = prng::gen_descreet_uniform_sign(static_cast(1e-5), 100); - } - } - - auto outpair = verify(verify_forward_train_bn_spatial{input, scale, shift}); - - auto dy_input = std::get<0>(outpair.second); - for(std::size_t bidx = 0; bidx < n; bidx++) - { - for(std::size_t cidx = 0; cidx < c; cidx++) - { - for(std::size_t row = 0; row < h; row++) - { - for(std::size_t column = 0; column < w; column++) - { - dy_input(bidx, cidx, row, column) *= 0.1; - } - } - } - } - this->tolerance = 80 * input.desc.GetElementSize(); - verify(verify_backward_bn_spatial_recalc{input, dy_input, scale}); - - auto savedMean = std::get<3>(outpair.second); - auto savedInvVar = std::get<4>(outpair.second); - verify(verify_backward_bn_spatial_use_saved{ - input, dy_input, scale, savedMean, savedInvVar}); - } -}; - -int main(int argc, const char* argv[]) -{ - test_drive(argc, argv); - return 0; -} diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index cffefea0e2..5374abd1fa 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -36,7 +36,6 @@ #include #include #include -// #include "driver.hpp" #include "get_handle.hpp" #include "tensor_holder.hpp" #include "verify.hpp" @@ -203,17 +202,17 @@ void batchNormPerActivHostInference(const tensor& input, }); } -template +template void batchNormSpatialHostFwdTrain(const tensor& input, tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -279,15 +278,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -335,7 +334,7 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double 
tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 0b763da411..22f8391fe6 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -84,3 +84,174 @@ struct BNInferTest : public ::testing::TestWithParam +struct BNBwdTest : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); + + auto&& handle = get_handle(); + miopenBatchNormalizationBackward(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + + std::fill(bn_bwd_test_data.output.begin(), + bn_bwd_test_data.output.end(), + std::numeric_limits::quiet_NaN()); + } + + void TearDown() override + { + if(test_skipped) + return; + auto&& handle = get_handle(); + bn_bwd_test_data.output.data = + handle.Read(bn_bwd_test_data.out_dev, bn_bwd_test_data.output.data.size()); + bn_bwd_test_data.dScale.data = handle.Read(bn_bwd_test_data.dScale_dev, + bn_bwd_test_data.dScale.data.size()); + bn_bwd_test_data.dBias.data = + handle.Read(bn_bwd_test_data.dBias_dev, bn_bwd_test_data.dBias.data.size()); + + test::ComputeCPUBNBwd(bn_bwd_test_data); + + // using tolerance = 1e-4 since this the tolerance CK uses + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 1e-4); + test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 1e-4); + test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 1e-4); + } + + BNTestCase bn_config; + bool test_skipped = false; + BNBwdTestData + bn_bwd_test_data; + miopenTensorLayout_t tensor_layout; +}; + +template +struct BNFwdTrainTest + : public ::testing::TestWithParam> +{ +protected: + void SetUp() override + { + test_skipped = false; + std::tie(bn_config, tensor_layout) = GetParam(); + bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); + + auto&& handle = get_handle(); + miopenBatchNormalizationForwardTraining(&handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + + std::fill(bn_fwd_train_test_data.output.begin(), + bn_fwd_train_test_data.output.end(), + std::numeric_limits::quiet_NaN()); + 
std::fill(bn_fwd_train_test_data.saveMean_ref.begin(), + bn_fwd_train_test_data.saveMean_ref.end(), + std::numeric_limits::quiet_NaN()); + std::fill(bn_fwd_train_test_data.saveVariance_ref.begin(), + bn_fwd_train_test_data.saveVariance_ref.end(), + std::numeric_limits::quiet_NaN()); + } + + void TearDown() override + { + if(test_skipped) + return; + auto&& handle = get_handle(); + bn_fwd_train_test_data.output.data = handle.Read( + bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size()); + + bn_fwd_train_test_data.saveMean.data = handle.Read( + bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size()); + bn_fwd_train_test_data.saveVariance.data = + handle.Read(bn_fwd_train_test_data.saveVariance_dev, + bn_fwd_train_test_data.saveVariance_ref.data.size()); + bn_fwd_train_test_data.runMean.data = handle.Read( + bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size()); + bn_fwd_train_test_data.runVariance.data = + handle.Read(bn_fwd_train_test_data.runVariance_dev, + bn_fwd_train_test_data.runVariance_ref.data.size()); + test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data); + + // 4e-3 is tolerance used by CK kernel. + test::CompareTensor( + bn_fwd_train_test_data.output, bn_fwd_train_test_data.ref_out, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.saveMean, bn_fwd_train_test_data.saveMean_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.saveVariance, bn_fwd_train_test_data.saveVariance_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.runMean, bn_fwd_train_test_data.runMean_ref, 4e-3); + test::CompareTensor( + bn_fwd_train_test_data.runVariance, bn_fwd_train_test_data.runVariance_ref, 4e-3); + } + + BNTestCase bn_config; + bool test_skipped = false; + BNFwdTrainTestData + bn_fwd_train_test_data; + miopenTensorLayout_t tensor_layout; +}; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp new file mode 100644 index 0000000000..722b42e872 --- /dev/null +++ b/test/gtest/bn_bwd.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
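One detail of the fixtures above is easy to overlook: SetUp enqueues the kernel and then poisons the host-side result buffers with quiet NaNs, so any element the read-back in TearDown fails to overwrite stays NaN and fails the comparison. The trick in isolation, plain C++ with no MIOpen types:

    #include <algorithm>
    #include <cmath>
    #include <limits>
    #include <vector>

    int main()
    {
        std::vector<float> host(8);
        std::fill(host.begin(), host.end(), std::numeric_limits<float>::quiet_NaN());

        // Pretend the device read-back only covered the first half of the buffer.
        for(int i = 0; i < 4; ++i)
            host[i] = 1.0f;

        const bool leftovers = std::any_of(
            host.begin(), host.end(), [](float v) { return std::isnan(v); });
        return leftovers ? 1 : 0; // non-zero exit: some element was never written
    }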
+ * + *******************************************************************************/ + +#include "bn.hpp" + +struct BNBwdTestTestHalf + : BNBwdTest +{ +}; + +struct BNBwdTestFloat : BNBwdTest +{ +}; + +struct BNBwdTestBFloat16 : BNBwdTest +{ +}; + +struct BNBwdTestDouble : BNBwdTest +{ +}; + +TEST_P(BNBwdTestTestHalf, BnBwdCKHalf) {} + +TEST_P(BNBwdTestFloat, BnBwdCKFloat) {} + +// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig +// only supports half and float +TEST_P(BNBwdTestBFloat16, DISABLED_BnBwdCKBFloat16) {} +TEST_P(BNBwdTestDouble, DISABLED_BnBwdCKDouble) {} + +INSTANTIATE_TEST_SUITE_P(BNBwdTestTestHalfNHWCSuite, + BNBwdTestTestHalf, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNBwdTestFloatNHWCSuite, + BNBwdTestFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNBwdTestBFloat16NHWCSuite, + BNBwdTestBFloat16, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNBwdTestDoubleNHWCSuite, + BNBwdTestDouble, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp new file mode 100644 index 0000000000..4a4dd4c728 --- /dev/null +++ b/test/gtest/bn_fwd_train.cpp @@ -0,0 +1,73 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
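The instantiation pattern used by bn_bwd.cpp above and bn_fwd_train.cpp below (one fixture per data type, a shared config list, testing::Combine to cross configs with layouts) reduces to a short gtest example. Config and Layout are simplified stand-ins for the real BNTestCase and miopenTensorLayout_t, and the binary links against gtest_main:

    #include <gtest/gtest.h>
    #include <tuple>
    #include <vector>

    struct Config { int n, c, h, w; };
    enum class Layout { NHWC };

    static std::vector<Config> Network1() { return {{16, 8, 128, 256}, {64, 2048, 7, 7}}; }

    class BnBwdParam : public ::testing::TestWithParam<std::tuple<Config, Layout>> {};

    TEST_P(BnBwdParam, RunsOneConfig)
    {
        const auto& [cfg, layout] = GetParam();
        (void)layout;
        EXPECT_GT(static_cast<long>(cfg.n) * cfg.c * cfg.h * cfg.w, 0);
    }

    // Each config is crossed with each layout, so this runs one test per pair.
    INSTANTIATE_TEST_SUITE_P(Network1NHWC,
                             BnBwdParam,
                             testing::Combine(testing::ValuesIn(Network1()),
                                              testing::Values(Layout::NHWC)));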
+ * + *******************************************************************************/ + +#include "bn.hpp" + +struct BNFwdTrainTestHalf + : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestFloat : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestDouble : BNFwdTrainTest +{ +}; + +struct BNFwdTrainTestBFloat16 : BNFwdTrainTest +{ +}; + +TEST_P(BNFwdTrainTestHalf, BnFwdTrainCKHalf) {} + +TEST_P(BNFwdTrainTestFloat, BnFwdTrainCKFloat) {} + +// Currently disabled since miopen::batchnorm::MakeForwardTrainingNetworkConfig +// only supports half and float +TEST_P(BNFwdTrainTestDouble, DISABLED_BnFwdTrainCKDouble) {} +TEST_P(BNFwdTrainTestBFloat16, DISABLED_BnFwdTrainCKBFloat16) {} + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestHalfNHWCSuite, + BNFwdTrainTestHalf, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestFloatNHWCSuite, + BNFwdTrainTestFloat, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestDoubleNHWCSuite, + BNFwdTrainTestDouble, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); + +INSTANTIATE_TEST_SUITE_P(BNFwdTrainTestBFloat16NHWCSuite, + BNFwdTrainTestBFloat16, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 6598ef7169..0dceaa1ba5 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -43,14 +43,14 @@ struct BNInferTestBFloat16 : BNInferTest +#include "random.hpp" #include #include @@ -60,7 +59,8 @@ std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, @@ -125,7 +125,7 @@ struct BNTestData { input = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; output = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; - ref_out = output; + ref_out = tensor{miopen_type{}, tensor_layout, bn_config.GetInput()}; } void InitTensorsWithRandValue() @@ -226,3 +226,218 @@ struct BNInferTestData : public BNTestData estVariance_dev = handle.Write(estVariance.data); } }; + +template +struct BNBwdTestData : public BNTestData +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + BNTestData::SetUpImpl(config, t_layout); + CreateTensors(); + InitTensorsWithRandValue(); + WriteToGPU(); + } + + tensor bnScale; + + tensor savedMean; + tensor savedInvVar; + + tensor dy; + tensor dScale; + tensor dBias; + tensor dScale_ref; + tensor dBias_ref; + + miopen::Allocator::ManageDataPtr bnScale_dev; + miopen::Allocator::ManageDataPtr savedMean_dev; + miopen::Allocator::ManageDataPtr savedInvVar_dev; + + miopen::Allocator::ManageDataPtr dy_dev; + miopen::Allocator::ManageDataPtr dScale_dev; + miopen::Allocator::ManageDataPtr dBias_dev; + miopen::Allocator::ManageDataPtr dScale_ref_dev; + miopen::Allocator::ManageDataPtr dBias_ref_dev; + double epsilon = std::numeric_limits::epsilon(); + + float alphaDataDiff = static_cast(1), betaDataDiff =
static_cast(0); + float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + +private: + void CreateTensors() + { + dy = tensor{miopen_type{}, + BNTestData::tensor_layout, + BNTestData::bn_config.GetInput()}; + + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + bnScale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedMean = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + savedInvVar = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dBias = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale_ref = dScale; + dBias_ref = dBias; + } + + void InitTensorsWithRandValue() + { + auto gen_value = [](auto...) { + return prng::gen_descreet_uniform_sign(static_cast(1e-2), 100); + }; + dy.generate(gen_value); + bnScale.generate(gen_value); + savedMean.generate(gen_value); + + auto gen_var = [](auto...) { + return static_cast(1e-2) * + static_cast(prng::gen_0_to_B(100) + 1); + }; + savedInvVar.generate(gen_var); + + std::fill(dScale.begin(), dScale.end(), 0.); + std::fill(dBias.begin(), dBias.end(), 0.); + + std::fill(dScale_ref.begin(), dScale_ref.end(), 0.); + std::fill(dBias_ref.begin(), dBias_ref.end(), 0.); + } + void WriteToGPU() + { + auto&& handle = get_handle(); + + bnScale_dev = handle.Write(bnScale.data); + savedMean_dev = handle.Write(savedMean.data); + savedInvVar_dev = handle.Write(savedInvVar.data); + dy_dev = handle.Write(dy.data); + + dScale_dev = handle.Write(dScale.data); + dBias_dev = handle.Write(dBias.data); + } +}; + +template +struct BNFwdTrainTestData : public BNTestData +{ + void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + { + BNTestData::SetUpImpl(config, t_layout); + CreateTensors(); + InitTensorsWithRandValue(); + WriteToGPU(); + } + + tensor scale; + tensor shift; + tensor saveMean; + tensor saveVariance; + tensor runMean; + tensor runVariance; + + tensor saveMean_ref; + tensor saveVariance_ref; + tensor runMean_ref; + tensor runVariance_ref; + + miopen::Allocator::ManageDataPtr scale_dev; + miopen::Allocator::ManageDataPtr shift_dev; // bias + miopen::Allocator::ManageDataPtr saveMean_dev; + miopen::Allocator::ManageDataPtr saveVariance_dev; + miopen::Allocator::ManageDataPtr runMean_dev; + miopen::Allocator::ManageDataPtr runVariance_dev; + double epsilon = 1.0e-5; + double averageFactor = 0.1; + float alpha = static_cast(1.0f); + float beta = static_cast(0); + const float activ_alpha = static_cast(0.5f); + const float activ_beta = static_cast(0.5f); + const float activ_gamma = static_cast(0.5f); + +private: + void CreateTensors() + { + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveVariance = + tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runMean = tensor{miopen_type{}, + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runVariance = + tensor{miopen_type{}, + 
BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + } + + void InitTensorsWithRandValue() + { + auto gen_value = [](auto...) { + return prng::gen_descreet_uniform_sign(static_cast(1e-2), 100); + }; + scale.generate(gen_value); + shift.generate(gen_value); + + auto gen_var = [](auto...) { + return static_cast(1e-2) * + static_cast(prng::gen_0_to_B(100) + 1); + }; + runMean.generate(gen_var); + runVariance.generate(gen_var); + + saveMean_ref = saveMean; + saveVariance_ref = saveVariance; + runMean_ref = runMean; + runVariance_ref = runVariance; + } + void WriteToGPU() + { + auto&& handle = get_handle(); + scale_dev = handle.Write(scale.data); + shift_dev = handle.Write(shift.data); + saveMean_dev = handle.Write(saveMean.data); + saveVariance_dev = handle.Write(saveVariance.data); + runMean_dev = handle.Write(runMean.data); + runVariance_dev = handle.Write(runVariance.data); + } +}; diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index d1528fe2bb..da41212302 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -38,6 +38,41 @@ void ComputeCPUBNInference(DLModule& dl_module) dl_module.estVariance); } +template +void ComputeCPUBNBwd(DLModule& dl_module) +{ + batchNormSpatialHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.ref_out, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); +} + +template +void ComputeCPUBNFwdTrain(DLModule& dl_module) +{ + batchNormSpatialHostFwdTrain(dl_module.input, + dl_module.ref_out, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.averageFactor, + dl_module.saveMean_ref, + dl_module.saveVariance_ref, + dl_module.runMean_ref, + dl_module.runVariance_ref); +} + template void CompareTensor(const tensor& output, const tensor& ref_out, From 14118a413eec00071800d4efa48ef0199bbbabd5 Mon Sep 17 00:00:00 2001 From: amberhassaan Date: Thu, 5 Oct 2023 18:27:19 -0400 Subject: [PATCH 24/36] Reference kernel for 3D convolution for non-packed tensors (#2334) --- src/CMakeLists.txt | 1 + src/hip/hip_build_utils.cpp | 2 +- src/include/miopen/hipoc_kernel.hpp | 24 +- .../miopen/solver/conv_direct_naive_conv.hpp | 95 +- .../gpu_reference_kernel/fp8_kern_types.h | 6 +- .../gpu_reference_kernel/naive_conv.cpp | 1719 +++++++++++------ src/kernels/stride_array.hpp | 86 + src/solver/conv_direct_naive_conv.cpp | 57 +- src/solver/conv_direct_naive_conv_bwd.cpp | 39 + src/solver/conv_direct_naive_conv_fwd.cpp | 31 +- src/solver/conv_direct_naive_conv_wrw.cpp | 35 + test/gpu_reference_kernel.cpp | 3 +- test/gtest/conv3d_test_case.hpp | 112 ++ test/gtest/group_conv3d_bwd.cpp | 2 +- test/gtest/group_conv3d_bwd.hpp | 88 +- test/gtest/group_conv3d_fwd.cpp | 2 +- test/gtest/group_conv3d_fwd.hpp | 88 +- test/gtest/group_conv3d_wrw.cpp | 2 +- test/gtest/group_conv3d_wrw.hpp | 88 +- test/gtest/group_solver.hpp | 6 +- 20 files changed, 1633 insertions(+), 853 deletions(-) create mode 100644 src/kernels/stride_array.hpp create mode 100644 test/gtest/conv3d_test_case.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index abc0679a8a..7866ad1a5a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -390,6 +390,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/workaround_issue_1431.hpp kernels/hip_f8_impl.hpp kernels/hip_float8.hpp + kernels/stride_array.hpp ) set(MIOPEN_KERNELS diff --git a/src/hip/hip_build_utils.cpp b/src/hip/hip_build_utils.cpp index 8f6f9f0c50..86cf3a7272 100644 --- 
a/src/hip/hip_build_utils.cpp +++ b/src/hip/hip_build_utils.cpp @@ -73,7 +73,7 @@ static boost::filesystem::path HipBuildImpl(boost::optional& tmp_dir, auto env = std::string(""); if(params.find("-std=") == std::string::npos) - params += " --std=c++11"; + params += " --std=c++17"; #if HIP_PACKAGE_VERSION_FLAT < 4001000000ULL params += " --cuda-gpu-arch=" + lots.device; diff --git a/src/include/miopen/hipoc_kernel.hpp b/src/include/miopen/hipoc_kernel.hpp index ba9992bab3..73ac77f160 100644 --- a/src/include/miopen/hipoc_kernel.hpp +++ b/src/include/miopen/hipoc_kernel.hpp @@ -26,14 +26,15 @@ #ifndef GUARD_MIOPEN_HIPOC_KERNEL_HPP #define GUARD_MIOPEN_HIPOC_KERNEL_HPP -#include -#include #include #include #include #include + +#include +#include +#include #include -#include namespace miopen { @@ -47,29 +48,20 @@ inline HipEventPtr make_hip_event() #if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 -#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017 template struct KernelArgsPair { - static const int alignment = sizeof(U); - static const int padding = (alignment - sizeof(T) % alignment) % alignment; - static const int second_index = sizeof(T) + padding; + constexpr static auto alignU = alignof(U); + constexpr static auto padding = (alignU - (sizeof(T) % alignU)) % alignU; + constexpr static auto second_index = sizeof(T) + padding; KernelArgsPair(T x, U y) { new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew) new(buffer + second_index) U(y); } + alignas(U) char buffer[second_index + sizeof(U)] = {}; }; -#else -template -struct KernelArgsPair -{ - KernelArgsPair(T x, U y) : first(x), second(y) {} - T first; - U second; -}; -#endif template struct KernelArgsPack; diff --git a/src/include/miopen/solver/conv_direct_naive_conv.hpp b/src/include/miopen/solver/conv_direct_naive_conv.hpp index 7bad52ff9e..6d935b249d 100644 --- a/src/include/miopen/solver/conv_direct_naive_conv.hpp +++ b/src/include/miopen/solver/conv_direct_naive_conv.hpp @@ -25,9 +25,15 @@ *******************************************************************************/ #pragma once -#include #include #include +#include "miopen/../../kernels/stride_array.hpp" + +#include +#include +#include +#include +#include namespace miopen { @@ -54,5 +60,92 @@ bool IsOutputBfp16(const ProblemDescription&); bool IsOutputInt8(const ProblemDescription&); bool IsOutputInt32(const ProblemDescription&); +namespace conv_internal { + +void DebugPrintTensorStrides(const TensorDescriptor& inDesc, + const TensorDescriptor& wDesc, + const TensorDescriptor& outDesc); + +/** + * Get the index where group (G) stride should go. For NCHW, we want to convert + * its strides to NGCHW, and for NHWC, we want to convert its strides to NHWGC. + * Same applies for the 3D case. + */ +int GetGroupStrideIndex(const ProblemDescription& problem); + +/** + * split the strides for C dimension in a tensor descriptor into (G, C_per_group). + * Normally, (in packed case) num channels is a multiplying factor in the stride of + * whatever lies to the left of C, e.g., in NCHW, N's stride contains C as a + * factor. We output NGCHW for NCHW (and NHWGC for NHWC) + * where the stride[G] = stride[N] / num_groups + */ +template +V SplitStrideCtoGC(int num_groups, const V& orig_strides, int G_stride_idx) +{ + assert(G_stride_idx > 0 && G_stride_idx <= orig_strides.size()); + // (G_stride_idx - 1) is the stride index of whatever lies to the left and + // contains C or K as a multiplying factor. 
We divide this value by num_groups + // to get G_stride_val + assert(orig_strides[G_stride_idx - 1] % num_groups == 0); + + V ret{orig_strides}; + auto G_stride_val = orig_strides[G_stride_idx - 1] / num_groups; + + ret.insert(ret.begin() + G_stride_idx, G_stride_val); + + return ret; +} + +/** + * Weight tensor has original dims: [K, C_per_group, Y, X] (2D case) + * We return a new stride vector with strides for [G, K_per_group, C_per_group, Y, X] + * Stride for G is computed as stride[C_per_group] * K_per_group and inserted at + * left most position + */ +template +V SplitWeiStrideKtoGK(int k_per_group, const V& wei_strides) +{ + V ret{wei_strides}; + ret.insert(ret.begin(), wei_strides[0] * k_per_group); + return ret; +} + +template +struct ChooseStride +{ +}; + +template <> +struct ChooseStride<5u> +{ + using type = Strides5D; +}; + +template <> +struct ChooseStride<6u> +{ + using type = Strides6D; +}; + +template +auto MakeStrideArray(V vec) +{ + typename ChooseStride::type ret; + assert(vec.size() == N); + + // MIOpen stores strides for NHWC in NCHW order, i.e. C stride in 2nd from left. + // We sort the input stride vector so that smallest stride is at index 0. This + // (little-endian) order is what naive convolution kernel expects for strides + std::sort(vec.begin(), vec.end()); + + for(unsigned i = 0; i < N; ++i) + { + ret[i] = static_cast(vec[i]); + } + return ret; +} +} // end namespace conv_internal + } // namespace solver } // namespace miopen diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h index 3bac0a31f7..b14302e0c2 100644 --- a/src/kernels/gpu_reference_kernel/fp8_kern_types.h +++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h @@ -58,6 +58,6 @@ #define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE)) -#define FWD_KERNEL_NAME CAT(naive_conv_fwd_nchw_, KERNEL_NAME_SUFFIX) -#define BWD_KERNEL_NAME CAT(naive_conv_bwd_nchw_, KERNEL_NAME_SUFFIX) -#define WRW_KERNEL_NAME CAT(naive_conv_wrw_nchw_, KERNEL_NAME_SUFFIX) +#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX) +#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX) +#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX) diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index 24d7cd489e..b243b1234a 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -46,6 +46,8 @@ typedef float float_t; #endif #endif // __HIPCC_RTC__ +#include "stride_array.hpp" + // hcc seems need __device__ __host__ together to compile, and no extern "C" typedef union value_bf16_fp32_t { @@ -114,10 +116,27 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) return static_cast(val & 0xff); } -template +/// \todo remove template parameter 'bool ASSUME_PACKED' in a follow up PR +/// --amberhassaan +/// Notes (Amber): +/// - The following code used to assume that group (G) is an implicit +/// dimension, i.e. c= c_per_group * group and k = k_per_group * group. This is not +/// true for non-packed case because group (G) dimension needs to have its stride +/// explicitly specified for address math to make sense. This is also how +/// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, +/// and nhwc should be nhwgc. Same follows for the 3D case. 
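+///
+/// - A worked example with assumed sizes: a packed NCHW input with N=2, C=8,
+///   H=4, W=4 and group=2 has descriptor (big-endian) strides {128, 16, 4, 1}.
+///   Splitting C into (G=2, c_per_group=4) yields the NGCHW view
+///   {128, 64, 16, 4, 1}, i.e. stride[G] = stride[N] / group. A hypothetical
+///   host-side sketch using the helpers added in conv_direct_naive_conv.hpp:
+///
+///     std::vector<std::size_t> nchw{128, 16, 4, 1};          // assumed N=2, C=8, H=4, W=4
+///     auto ngchw = conv_internal::SplitStrideCtoGC(2, nchw, 1); // {128, 64, 16, 4, 1}
+///     auto s5d   = conv_internal::MakeStrideArray<5>(ngchw);    // Strides5D {1, 4, 16, 64, 128}
+///
+///   MakeStrideArray sorts the strides so that the smallest one lands at
+///   index 0, which produces the little-endian order described in the next
+///   note.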
+/// +/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is reverse of how strides are +/// stored in tensor descriptors, which are big-endian. + +template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -148,18 +167,36 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(in) * k * ho * wo + - static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += + static_cast(in) * c * hi * wi + static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * fx; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * k * ho * wo + + static_cast(ig) * k_per_group * ho * wo + + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(ig) * out_strides[3] + + static_cast(ik) * out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iho = tid / wo; int iwo = tid % wo; - double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -178,25 +215,58 @@ inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t f_idx = static_cast(ic) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iho) * wo + static_cast(iwo); - p_out[o_idx] = cast_to(value); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -227,19 +297,35 @@ inline __device__ void 
naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * hi * wi + - static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ic) * fy * fx; - p_out += - static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * hi * wi + + static_cast(ig) * c_per_group * hi * wi + static_cast(ic) * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ic) * fy * fx; + + p_out += + static_cast(in) * k * ho * wo + static_cast(ig) * k_per_group * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[3] + + static_cast(ic) * in_strides[2]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ic) * wei_strides[2]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ihi = tid / wi; int iwi = tid % wi; - double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) { @@ -264,26 +350,59 @@ inline __device__ void naive_conv_bwd_nchw(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(ik) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = static_cast(ik) * c_per_group * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(ihi) * in_strides[1] + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -315,18 +434,34 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + - static_cast(ik) * c_per_group * fy * fx; - p_out += static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fy * fx + + static_cast(ik) * c_per_group * fy * 
fx; + + p_out += + static_cast(ig) * k_per_group * ho * wo + static_cast(ik) * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(ig) * out_strides[3] + static_cast(ik) * out_strides[2]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int ic = tid / (fx * fy); - double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -345,28 +480,64 @@ inline __device__ void naive_conv_wrw_nchw(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * hi * wi + - static_cast(ic) * hi * wi + - static_cast(cur_h) * wi + static_cast(cur_w); - size_t o_idx = static_cast(in) * k * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * hi * wi + + static_cast(ic) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * ho * wo + + static_cast(iho) * wo + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(ic) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + - static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -405,21 +576,37 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, int in = (bid / k_per_group) % n; int ig = bid / (n * k_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; + + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; - for(int 
tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * out_strides[5] + + static_cast(ig) * out_strides[4] + + static_cast(ik) * out_strides[3]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid % wo; int iho = (tid / wo) % ho; int ido = tid / (ho * wo); - double value = .0f; + acc_data_t value = 0; for(int ic = 0; ic < c_per_group; ic++) { @@ -444,30 +631,67 @@ inline __device__ void naive_conv_fwd_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(ic) * di * hi * wi + - static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t f_idx = static_cast(ic) * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + + static_cast(ix); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + - static_cast(iwo); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ido) * ho * wo + static_cast(iho) * wo + + static_cast(iwo); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -506,21 +730,37 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, int in = (bid / c_per_group) % n; int ig = bid / (n * c_per_group); - p_in += static_cast(in) * c * di * hi * wi + - static_cast(ig) * c_per_group * di * hi * wi + - static_cast(ic) * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ic) * fz * fy * fx; - p_out += static_cast(in) * k * do_ * ho * wo + - static_cast(ig) * k_per_group * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * c * di * hi * wi + + static_cast(ig) * c_per_group * di * hi * wi + + static_cast(ic) * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ic) * fz * fy * fx; + + p_out += static_cast(in) * k * do_ * ho * wo + + static_cast(ig) * k_per_group * do_ * ho * wo; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[4] + + static_cast(ic) * in_strides[3]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ic) * wei_strides[3]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[4]; + } - 
for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid % wi; int ihi = (tid / wi) % hi; int idi = tid / (hi * wi); - double value = .0f; + acc_data_t value = 0; for(int ik = 0; ik < k_per_group; ik++) { @@ -554,30 +794,67 @@ inline __device__ void naive_conv_bwd_ncdhw(dst_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(ik) * do_ * ho * wo + - static_cast(cur_do) * ho * wo + - static_cast(cur_ho) * wo + - static_cast(cur_wo); - size_t f_idx = static_cast(ik) * c_per_group * fz * fy * fx + - static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(ik) * do_ * ho * wo + + static_cast(cur_do) * ho * wo + + static_cast(cur_ho) * wo + + static_cast(cur_wo); + + size_t f_idx = + static_cast(ik) * c_per_group * fz * fy * fx + + static_cast(iz) * fy * fx + + static_cast(iy) * fx + static_cast(ix); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(ik) * out_strides[3] + + static_cast(cur_do) * out_strides[2] + + static_cast(cur_ho) * out_strides[1] + + static_cast(cur_wo) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + - static_cast(iwi); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(idi) * hi * wi + static_cast(ihi) * wi + + static_cast(iwi); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(idi) * in_strides[2] + + static_cast(ihi) * in_strides[1] + + static_cast(iwi) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -615,20 +892,35 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group * di * hi * wi; - p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + - static_cast(ik) * c_per_group * fz * fy * fx; - p_out += static_cast(ig) * k_per_group * do_ * ho * wo + - static_cast(ik) * do_ * ho * wo; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group * di * hi * wi; + + p_wei += static_cast(ig) * k_per_group * c_per_group * fz * fy * fx + + static_cast(ik) * c_per_group * fz * fy * fx; + + p_out += static_cast(ig) * k_per_group * do_ * ho * wo + + static_cast(ik) * do_ * ho * wo; + } + else + { + p_in += static_cast(ig) * in_strides[4]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[4] + static_cast(ik) * out_strides[3]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ix = tid % fx; int iy = (tid / fx) % fy; int iz = (tid / (fx * fy)) % fz; int ic = tid / (fx * fy * fz); - double value = .0f; + acc_data_t value = 0; for(int 
in = 0; in < n; in++) { @@ -653,33 +945,73 @@ inline __device__ void naive_conv_wrw_ncdhw(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * c * di * hi * wi + - static_cast(ic) * di * hi * wi + - static_cast(cur_d) * hi * wi + - static_cast(cur_h) * wi + - static_cast(cur_w); - size_t o_idx = static_cast(in) * k * do_ * ho * wo + - static_cast(ido) * ho * wo + - static_cast(iho) * wo + static_cast(iwo); - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * c * di * hi * wi + + static_cast(ic) * di * hi * wi + + static_cast(cur_d) * hi * wi + + static_cast(cur_h) * wi + + static_cast(cur_w); + + size_t o_idx = static_cast(in) * k * do_ * ho * wo + + static_cast(ido) * ho * wo + + static_cast(iho) * wo + + static_cast(iwo); + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(ic) * in_strides[3] + + static_cast(cur_d) * in_strides[2] + + static_cast(cur_h) * in_strides[1] + + static_cast(cur_w) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[2] + + static_cast(iho) * out_strides[1] + + static_cast(iwo) * out_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(ic) * fz * fy * fx + static_cast(iz) * fy * fx + - static_cast(iy) * fx + static_cast(ix); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(ic) * fz * fy * fx + + static_cast(iz) * fy * fx + static_cast(iy) * fx + + static_cast(ix); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(ic) * wei_strides[3] + + static_cast(iz) * wei_strides[2] + + static_cast(iy) * wei_strides[1] + + static_cast(ix) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } /***************************** nhwc *****************************/ // design block_size 256 -template +template inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -711,17 +1043,32 @@ inline __device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, int in = (bid / ho) % n; int ig = bid / (n * ho); - p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group + - static_cast(iho) * wo * k; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; + + p_out += static_cast(in) * ho * wo * k + static_cast(iho) * wo * k + + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; + + p_out += static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(ig) * out_strides[1]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwo = tid / k_per_group; int ik = tid % k_per_group; - double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -740,27 +1087,61 @@ inline 
__device__ void naive_conv_fwd_nhwc(const src_data_t* __restrict__ p_in, if(valid_w & valid_h) { - size_t i_idx = static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t f_idx = static_cast(ik) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t o_idx = static_cast(iwo) * k + static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iwo) * k + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -792,17 +1173,32 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, int in = (bid / hi) % n; int ig = bid / (n * hi); - p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; - p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * hi * wi * c + static_cast(ihi) * wi * c + + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group; + + p_out += static_cast(in) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[4] + static_cast(ihi) * in_strides[3] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[4]; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += + static_cast(in) * out_strides[4] + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int iwi = tid / c_per_group; int ic = tid % c_per_group; - double value = .0f; + acc_data_t value = 0; for(int iy = 0; iy < fy; iy++) { @@ -827,27 +1223,61 @@ inline __device__ void naive_conv_bwd_nhwc(dst_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t o_idx = static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + static_cast(ik); - size_t f_idx = static_cast(ik) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = static_cast(ik) * fy * fx * 
c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } - size_t i_idx = static_cast(iwi) * c + static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(iwi) * c + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = + static_cast(iwi) * in_strides[2] + static_cast(ic) * in_strides[0]; + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides5D in_strides, + Strides5D wei_strides, + Strides5D out_strides, int hi, int wi, int n, @@ -879,18 +1309,33 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group + - static_cast(ik) * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fy * fx * c_per_group + + static_cast(ik) * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[4] + static_cast(ik) * wei_strides[3]; + + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = tid / (c_per_group * fx); - double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -909,29 +1354,65 @@ inline __device__ void naive_conv_wrw_nhwc(const src_data_t* __restrict__ p_in, if(valid_h & valid_w) { - size_t i_idx = static_cast(in) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + static_cast(ic); + + size_t o_idx = static_cast(in) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + size_t i_idx = static_cast(in) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } - size_t f_idx = static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + 
static_cast(ic); - p_wei[f_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + p_wei[f_idx] = cast_to(value); + } } } // design block_size 256 -template +template inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, dst_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -970,18 +1451,36 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, int in = (bid / do_) % n; int ig = bid / (n * do_); - p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ido) * ho * wo * k + - static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + // dim order NDHWGC + // replace C and K with G * C_per_G and G * K_per_G + p_in += static_cast(in) * in_strides[5] + static_cast(ig) * in_strides[1]; + + // Assumes that group G is the highest dimension in the layout + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(ig) * out_strides[1]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ik = tid % k_per_group; int iwo = (tid / k_per_group) % wo; int iho = tid / (k_per_group * wo); - double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1005,30 +1504,69 @@ inline __device__ void naive_conv_fwd_ndhwc(const src_data_t* __restrict__ p_in, { if(valid_d & valid_w & valid_h) { - size_t i_idx = static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_in[i_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t i_idx = static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_wei[f_idx]); + } } } } } 
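+                        // Note: 'value' above accumulates in acc_data_t
+                        // (double for the float/half/ushort instantiations,
+                        // int32_t for int8), so rounding to dst_data_t
+                        // happens only once, in the store that follows.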
} - size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + - static_cast(ik); - p_out[o_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(iho) * wo * k + static_cast(iwo) * k + + static_cast(ik); + + p_out[o_idx] = cast_to(value); + } + else + { + size_t o_idx = static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + p_out[o_idx] = cast_to(value); + } } } -template + +template inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, const src_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1052,6 +1590,7 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int fx, int group) { + /* * need to compute total input pixel: `group * n * di * hi * wi * * c_per_group`. @@ -1067,18 +1606,34 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, int in = (bid / di) % n; int ig = bid / (n * di); - p_in += static_cast(in) * di * hi * wi * c + static_cast(idi) * hi * wi * c + - static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; - p_out += static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(in) * di * hi * wi * c + + static_cast(idi) * hi * wi * c + static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group; + + p_out += + static_cast(in) * do_ * ho * wo * k + static_cast(ig) * k_per_group; + } + else + { + p_in += static_cast(in) * in_strides[5] + static_cast(idi) * in_strides[4] + + static_cast(ig) * in_strides[1]; + + p_wei += static_cast(ig) * wei_strides[5]; + + p_out += + static_cast(in) * out_strides[5] + static_cast(ig) * out_strides[1]; + } - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int iwi = (tid / c_per_group) % wi; int ihi = (tid / (c_per_group * wi)); - double value = .0f; + acc_data_t value = 0; for(int iz = 0; iz < fz; iz++) { @@ -1111,32 +1666,69 @@ inline __device__ void naive_conv_bwd_ndhwc(dst_data_t* __restrict__ p_in, { if(valid_d & valid_h & valid_w) { - size_t o_idx = static_cast(cur_do) * ho * wo * k + - static_cast(cur_ho) * wo * k + - static_cast(cur_wo) * k + - static_cast(ik); - size_t f_idx = static_cast(ik) * fz * fy * fx * c_per_group + - static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - static_cast(ix) * c_per_group + - static_cast(ic); - value += cast_to(p_out[o_idx]) * - cast_to(p_wei[f_idx]); + if constexpr(ASSUME_PACKED) + { + size_t o_idx = static_cast(cur_do) * ho * wo * k + + static_cast(cur_ho) * wo * k + + static_cast(cur_wo) * k + + static_cast(ik); + + size_t f_idx = + static_cast(ik) * fz * fy * fx * c_per_group + + static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } + else + { + size_t o_idx = static_cast(cur_do) * out_strides[4] + + static_cast(cur_ho) * out_strides[3] + + static_cast(cur_wo) * out_strides[2] + + static_cast(ik) * out_strides[0]; + + size_t f_idx = static_cast(ik) * wei_strides[4] + + static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + 
static_cast(ic) * wei_strides[0]; + + value += cast_to(p_out[o_idx]) * + cast_to(p_wei[f_idx]); + } } } } } } - size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + - static_cast(ic); - p_in[i_idx] = cast_to(value); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(ihi) * wi * c + static_cast(iwi) * c + + static_cast(ic); + + p_in[i_idx] = cast_to(value); + } + else + { + size_t i_idx = static_cast(ihi) * in_strides[3] + + static_cast(iwi) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + p_in[i_idx] = cast_to(value); + } } } -template +template inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, dst_data_t* __restrict__ p_wei, const src_data_t* __restrict__ p_out, + Strides6D in_strides, + Strides6D wei_strides, + Strides6D out_strides, int di, int hi, int wi, @@ -1174,19 +1766,34 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, int ik = bid % k_per_group; int ig = bid / k_per_group; - p_in += static_cast(ig) * c_per_group; - p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + - static_cast(ik) * fz * fy * fx * c_per_group; - p_out += static_cast(ig) * k_per_group + static_cast(ik); + if constexpr(ASSUME_PACKED) + { + p_in += static_cast(ig) * c_per_group; + + p_wei += static_cast(ig) * k_per_group * fz * fy * fx * c_per_group + + static_cast(ik) * fz * fy * fx * c_per_group; - for(int tid = threadIdx.x; tid < thread_length; tid += 256) + p_out += static_cast(ig) * k_per_group + static_cast(ik); + } + else + { + p_in += static_cast(ig) * in_strides[1]; + + p_wei += + static_cast(ig) * wei_strides[5] + static_cast(ik) * wei_strides[4]; + + p_out += + static_cast(ig) * out_strides[1] + static_cast(ik) * out_strides[0]; + } + + for(int tid = threadIdx.x; tid < thread_length; tid += blockDim.x) { int ic = tid % c_per_group; int ix = (tid / c_per_group) % fx; int iy = (tid / (c_per_group * fx)) % fy; int iz = (tid / (c_per_group * fx * fy)); - double value = .0f; + acc_data_t value = 0; for(int in = 0; in < n; in++) { @@ -1211,374 +1818,340 @@ inline __device__ void naive_conv_wrw_ndhwc(const src_data_t* __restrict__ p_in, if(valid_d & valid_h & valid_w) { - size_t i_idx = static_cast(in) * di * hi * wi * c + - static_cast(cur_d) * hi * wi * c + - static_cast(cur_h) * wi * c + - static_cast(cur_w) * c + static_cast(ic); - size_t o_idx = static_cast(in) * do_ * ho * wo * k + - static_cast(ido) * ho * wo * k + - static_cast(iho) * wo * k + - static_cast(iwo) * k; - value += cast_to(p_in[i_idx]) * - cast_to(p_out[o_idx]); + + if constexpr(ASSUME_PACKED) + { + size_t i_idx = static_cast(in) * di * hi * wi * c + + static_cast(cur_d) * hi * wi * c + + static_cast(cur_h) * wi * c + + static_cast(cur_w) * c + + static_cast(ic); + + size_t o_idx = static_cast(in) * do_ * ho * wo * k + + static_cast(ido) * ho * wo * k + + static_cast(iho) * wo * k + + static_cast(iwo) * k; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } + else + { + + size_t i_idx = static_cast(in) * in_strides[5] + + static_cast(cur_d) * in_strides[4] + + static_cast(cur_h) * in_strides[3] + + static_cast(cur_w) * in_strides[2] + + static_cast(ic) * in_strides[0]; + + size_t o_idx = static_cast(in) * out_strides[5] + + static_cast(ido) * out_strides[4] + + static_cast(iho) * out_strides[3] + + static_cast(iwo) * out_strides[2]; + + value += cast_to(p_in[i_idx]) * + cast_to(p_out[o_idx]); + } } } } } } - size_t f_idx = static_cast(iz) * fy * fx * c_per_group + - static_cast(iy) * fx * c_per_group + - 
static_cast(ix) * c_per_group + static_cast(ic); - p_wei[f_idx] = cast_to(value); - } -} - -#define DEFINE_2D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } -#define DEFINE_2D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + if constexpr(ASSUME_PACKED) + { + size_t f_idx = static_cast(iz) * fy * fx * c_per_group + + static_cast(iy) * fx * c_per_group + + static_cast(ix) * c_per_group + static_cast(ic); -#define DEFINE_2D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int ho, \ - int wo, \ - int sy, \ - int sx, \ - int dy, \ - int dx, \ - int py, \ - int px, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - ho, \ - wo, \ - sy, \ - sx, \ - dy, \ - dx, \ - py, \ - px, \ - fy, \ - fx, \ - group); \ - } + p_wei[f_idx] = cast_to(value); + } + else + { + size_t f_idx = static_cast(iz) * wei_strides[3] + + static_cast(iy) * wei_strides[2] + + static_cast(ix) * wei_strides[1] + + static_cast(ic) * wei_strides[0]; -#define DEFINE_3D_NAIVE_FWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_fwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - dst_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_fwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ 
- fz, \ - fy, \ - fx, \ - group); \ + p_wei[f_idx] = cast_to(value); + } } +} -#define DEFINE_3D_NAIVE_BWD_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_bwd_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - dst_data_t* __restrict__ p_in, \ - src_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_bwd_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define DEFINE_2D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides5D in_strides, \ + Strides5D wei_strides, \ + Strides5D out_strides, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int ho, \ + int wo, \ + int sy, \ + int sx, \ + int dy, \ + int dx, \ + int py, \ + int px, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + ho, \ + wo, \ + sy, \ + sx, \ + dy, \ + dx, \ + py, \ + px, \ + fy, \ + fx, \ + group); \ } -#define DEFINE_3D_NAIVE_WRW_CONV_KERNEL(tensor_layout, src_data_t, acc_data_t, dst_data_t) \ - extern "C" __global__ void \ - naive_conv_wrw_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ - src_data_t* __restrict__ p_in, \ - dst_data_t* __restrict__ p_wei, \ - src_data_t* __restrict__ p_out, \ - int di, \ - int hi, \ - int wi, \ - int n, \ - int k_per_group, \ - int c_per_group, \ - int do_, \ - int ho, \ - int wo, \ - int sz, \ - int sy, \ - int sx, \ - int dz, \ - int dy, \ - int dx, \ - int pz, \ - int py, \ - int px, \ - int fz, \ - int fy, \ - int fx, \ - int group) \ - { \ - naive_conv_wrw_##tensor_layout(p_in, \ - p_wei, \ - p_out, \ - di, \ - hi, \ - wi, \ - n, \ - k_per_group, \ - c_per_group, \ - do_, \ - ho, \ - wo, \ - sz, \ - sy, \ - sx, \ - dz, \ - dy, \ - dx, \ - pz, \ - py, \ - px, \ - fz, \ - fy, \ - fx, \ - group); \ +#define 
DEFINE_3D_NAIVE_CONV_KERNEL(direction, tensor_layout, src_data_t, acc_data_t, dst_data_t) \ + extern "C" __global__ void \ + naive_conv_packed_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ + } \ + extern "C" __global__ void \ + naive_conv_nonpacked_##direction##_##tensor_layout##_##src_data_t##_##acc_data_t##_##dst_data_t( \ + src_data_t* __restrict__ p_in, \ + src_data_t* __restrict__ p_wei, \ + dst_data_t* __restrict__ p_out, \ + Strides6D in_strides, \ + Strides6D wei_strides, \ + Strides6D out_strides, \ + int di, \ + int hi, \ + int wi, \ + int n, \ + int k_per_group, \ + int c_per_group, \ + int do_, \ + int ho, \ + int wo, \ + int sz, \ + int sy, \ + int sx, \ + int dz, \ + int dy, \ + int dx, \ + int pz, \ + int py, \ + int px, \ + int fz, \ + int fy, \ + int fx, \ + int group) \ + { \ + naive_conv_##direction##_##tensor_layout( \ + p_in, \ + p_wei, \ + p_out, \ + in_strides, \ + wei_strides, \ + out_strides, \ + di, \ + hi, \ + wi, \ + n, \ + k_per_group, \ + c_per_group, \ + do_, \ + ho, \ + wo, \ + sz, \ + sy, \ + sx, \ + dz, \ + dy, \ + dx, \ + pz, \ + py, \ + px, \ + fz, \ + fy, \ + fx, \ + group); \ } -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nchw, int8_t, int32_t, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, ushort, double, ushort) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, int32_t) -DEFINE_2D_NAIVE_FWD_CONV_KERNEL(nhwc, int8_t, int32_t, float) - -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_BWD_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nchw, ushort, double, ushort) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, float, double, float) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, half, double, half) -DEFINE_2D_NAIVE_WRW_CONV_KERNEL(nhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, 
int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ncdhw, int8_t, int32_t, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, ushort, double, ushort) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, int32_t) -DEFINE_3D_NAIVE_FWD_CONV_KERNEL(ndhwc, int8_t, int32_t, float) - -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_BWD_CONV_KERNEL(ndhwc, ushort, double, ushort) - -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ncdhw, ushort, double, ushort) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, float, double, float) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, half, double, half) -DEFINE_3D_NAIVE_WRW_CONV_KERNEL(ndhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nchw, int8_t, int32_t, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, int32_t) +DEFINE_2D_NAIVE_CONV_KERNEL(fwd, nhwc, int8_t, int32_t, float) + +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(bwd, nhwc, ushort, double, ushort) + +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nchw, ushort, double, ushort) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, float, double, float) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, half, double, half) +DEFINE_2D_NAIVE_CONV_KERNEL(wrw, nhwc, ushort, double, ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ncdhw, int8_t, int32_t, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, int32_t) +DEFINE_3D_NAIVE_CONV_KERNEL(fwd, ndhwc, int8_t, int32_t, float) + +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(bwd, ndhwc, ushort, double, 
ushort) + +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ncdhw, ushort, double, ushort) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, float, double, float) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, half, double, half) +DEFINE_3D_NAIVE_CONV_KERNEL(wrw, ndhwc, ushort, double, ushort) + +/// \todo discuss whether we should split the kernels into separate files, or +/// figure out a mechanism to compile each kernel separately to reduce hipRTC +/// compilation times. --amberhassaan diff --git a/src/kernels/stride_array.hpp b/src/kernels/stride_array.hpp new file mode 100644 index 0000000000..32cb1f85b6 --- /dev/null +++ b/src/kernels/stride_array.hpp @@ -0,0 +1,86 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#ifdef __HIPCC_RTC__
+#ifndef WORKAROUND_ISSUE_HIPRTC_TRUE_TYPE
+#include <type_traits>
+#endif
+#endif // __HIPCC_RTC__
+
+/// \todo Uncomment when hip RTC accepts std::array -- amberhassaan
+// #include <array>
+// using StrideIndexType = int;
+// using Strides3D = std::array<StrideIndexType, 3>;
+// using Strides4D = std::array<StrideIndexType, 4>;
+// using Strides5D = std::array<StrideIndexType, 5>;
+// using Strides6D = std::array<StrideIndexType, 6>;
+template <typename T, unsigned N>
+class MyArray
+{
+    T data_[N] = {};
+
+public:
+    constexpr static const unsigned SIZE = N;
+    __host__ __device__ constexpr unsigned size() const { return N; }
+
+    __host__ __device__ const T& operator[](unsigned i) const { return data_[i]; }
+
+    __host__ T& operator[](unsigned i) { return data_[i]; }
+
+    __host__ __device__ MyArray()                   = default;
+    __host__ __device__ MyArray(const MyArray&)     = default;
+    __host__ __device__ MyArray(MyArray&&) noexcept = default;
+    __host__ __device__ MyArray& operator=(const MyArray&) = default;
+    __host__ __device__ MyArray& operator=(MyArray&&) noexcept = default;
+    __host__ __device__ ~MyArray()                              = default;
+};
+
+using StrideIndexType = size_t;
+using Strides5D       = MyArray<StrideIndexType, 5>;
+using Strides6D       = MyArray<StrideIndexType, 6>;
+
+template <typename StrideArray>
+__host__ __device__ void printStrideArray(const char* name, const StrideArray& sarr)
+{
+    printf("%s = [", name);
+    for(int i = 0; i < StrideArray::SIZE; ++i)
+    {
+        printf("%zu,", sarr[i]);
+    }
+    printf("]\n");
+}
+
+template <typename StrideArray>
+__host__ __device__ void printStrideArrays(const StrideArray& in_strides,
+                                           const StrideArray& wei_strides,
+                                           const StrideArray& out_strides)
+{
+    printStrideArray("in_strides", in_strides);
+    printStrideArray("wei_strides", wei_strides);
+    printStrideArray("out_strides", out_strides);
+}
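For orientation, a minimal host-side usage sketch of the helpers above (illustrative only, not part of the patch; the include path and stride values are invented, and it needs hipcc because of the __host__/__device__ attributes):

    #include <cstdio>
    #include "stride_array.hpp" // assumed include path

    int main()
    {
        Strides5D s; // MyArray<size_t, 5>; the data_[N] = {} member zero-initializes
        const size_t vals[5] = {1, 4, 16, 64, 256}; // made-up stride values
        for(unsigned i = 0; i < s.size(); ++i)
            s[i] = vals[i]; // the non-const operator[] is host-only, which is fine here
        printStrideArray("s", s); // prints: s = [1,4,16,64,256,]
        return 0;
    }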
diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp
index 5c468768fa..86a8a4161e 100644
--- a/src/solver/conv_direct_naive_conv.cpp
+++ b/src/solver/conv_direct_naive_conv.cpp
@@ -24,6 +24,7 @@
  *
  *******************************************************************************/
 
+#include "miopen/env.hpp"
 #include
 #include
 #include
@@ -105,10 +106,20 @@ bool IsOutputInt32(const ProblemDescription& problem)
            problem.GetOutDataType() == miopenInt32;
 }
 
+MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS);
+
 std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem)
 {
     std::ostringstream kernel_name;
-    kernel_name << "naive_conv_";
+    if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS()))
+    {
+        kernel_name << "naive_conv_packed_";
+    }
+    else
+    {
+        kernel_name << "naive_conv_nonpacked_";
+    }
+
     if(problem.direction.IsForward())
         kernel_name << "fwd_";
     else if(problem.direction.IsBackwardData())
@@ -244,5 +255,49 @@ bool ConvDirectNaiveConvIsApplicableByKernelType(const ExecutionContext& ctx,
     return true;
 }
 
+/// Figure out the index of the C (channel) stride so we can expand it into
+/// (G, C_per_group). The return value G_stride_idx is the position of the G
+/// stride in the stride vector, such that (G_stride_idx - 1) is the index
+/// whose stride contains C as a multiplying factor.
+int conv_internal::GetGroupStrideIndex(const ProblemDescription& problem)
+{
+    int G_stride_idx = -1;
+    if(problem.IsLayoutDefault())
+    {
+        G_stride_idx = 1;
+    }
+    else
+    {
+        assert(problem.IsLayoutNHWC());
+        assert(problem.Is2d() || problem.Is3d());
+        //
+        // G_stride_idx = problem.Is2d() ? 3 : 4; // (wrong, see below)
+        // For NHWC, MIOpen stores strides in NCHW order, so we are interested
+        // in the index one past W's stride; that is the value of G_stride_idx.
+        G_stride_idx = problem.Is2d() ? 4 : 5;
+    }
+    assert(G_stride_idx != -1);
+    return G_stride_idx;
+}
+
+void conv_internal::DebugPrintTensorStrides(const TensorDescriptor& inDesc,
+                                            const TensorDescriptor& wDesc,
+                                            const TensorDescriptor& outDesc)
+{
+    auto printOneStrideVec = [](const char* name, const auto& vec) {
+        MIOPEN_LOG_I(name << " = [");
+        for(const size_t v : vec)
+        {
+            MIOPEN_LOG_I(v << ",");
+        }
+        MIOPEN_LOG_I("]\n");
+    };
+
+    // The lambda already appends " = [", so pass the bare descriptor names.
+    printOneStrideVec("inDesc", inDesc.GetStrides());
+    printOneStrideVec("wDesc", wDesc.GetStrides());
+    printOneStrideVec("outDesc", outDesc.GetStrides());
+}
+
 } // namespace solver
 } // namespace miopen

diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp
index f8af0ec2d1..1a28f8aae6 100644
--- a/src/solver/conv_direct_naive_conv_bwd.cpp
+++ b/src/solver/conv_direct_naive_conv_bwd.cpp
@@ -142,14 +142,27 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
     }();
     kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem);
 
+    int G_stride_idx = conv_internal::GetGroupStrideIndex(problem);
+
+    if(problem.Is2d())
+    {
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         const auto kern = kernels[0];
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
             decltype(auto) data_ctx = primitive_parameters.CastTo<conv::DataInvokeParams>();
             const auto& tensors     = data_ctx.tensors;
             float elapsed           = 0;
+            auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC(
+                group, tensors.inDesc.GetStrides(), G_stride_idx));
+            // For weights, we split K to (G, K_per_group), which is always index 0
+            auto wei_strides = conv_internal::MakeStrideArray<5>(
+                conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides()));
+            auto out_strides =
+                conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC(
+                    group, tensors.outDesc.GetStrides(), G_stride_idx));
+            /// \ref backward_tensors_reversed_why
             if(is_f8)
+            {
                 handle.Run(kern)(tensors.out,
                                  tensors.w,
                                  tensors.in,
@@ -172,10 +185,15 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
                                  problem.GetConv().attribute.fp8rounding_mode.Get() ==
                                      miopenF8RoundingModeStochastic,
                                  problem.GetConv().attribute.fp8rounding_mode.GetSeed());
+            }
             else
+            {
                 handle.Run(kern)(tensors.out,
                                  tensors.w,
                                  tensors.in,
+                                 out_strides,
+                                 wei_strides,
+                                 in_strides,
                                  hi,
                                  wi,
                                  n,
@@ -192,6 +210,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
                                  fy,
                                  fx,
                                  group);
+            }
 
             if(handle.IsProfilingEnabled())
                 elapsed += handle.GetKernelTime();
@@ -202,7 +221,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
             }
         };
     };
+    }
     else
+    {
     result.invoker_factory = [=](const std::vector<Kernel>& kernels) {
         const auto kern = kernels[0];
         return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) {
@@ -210,9 +231,26 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx,
             const auto& tensors = data_ctx.tensors;
             float elapsed       = 0;
 
+            auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC(
+                group, tensors.inDesc.GetStrides(), G_stride_idx));
+            // For weights, we split K to (G, K_per_group), which is always index 0
+            auto wei_strides = conv_internal::MakeStrideArray<6>(
+                conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides()));
+            auto out_strides
= + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); + + /// \anchor backward_tensors_reversed_why + /// \todo Someone made the silly decision of swapping in and + /// out pointers in ConvTensors for backward pass, so now I have to + /// pass out in place of in, out_strides in place of in_strides and + /// vice-versa --amberhassaan handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, di, hi, wi, @@ -245,6 +283,7 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 90d8feee31..a4656d929a 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -27,7 +27,6 @@ #include #include #include -#include MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_CONV_FWD) @@ -142,13 +141,26 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); + if(is_f8) { handle.Run(kern)(tensors.in, @@ -179,6 +191,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -206,7 +221,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -214,9 +231,20 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.inDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.wDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.outDesc.GetStrides(), G_stride_idx)); handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -249,6 +277,7 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; 
} diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index 6fcf2f71d0..dfe1c342b0 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -129,14 +129,28 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, return false; }(); + int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); + if(problem.Is2d()) + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { decltype(auto) data_ctx = primitive_parameters.CastTo(); const auto& tensors = data_ctx.tensors; float elapsed = 0; + + auto in_strides = conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<5>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<5>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); + if(is_f8) + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, @@ -159,10 +173,15 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, problem.GetConv().attribute.fp8rounding_mode.Get() == miopenF8RoundingModeStochastic, problem.GetConv().attribute.fp8rounding_mode.GetSeed()); + } else + { handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, @@ -179,6 +198,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, fy, fx, group); + } if(handle.IsProfilingEnabled()) elapsed += handle.GetKernelTime(); @@ -189,7 +209,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, } }; }; + } else + { result.invoker_factory = [=](const std::vector& kernels) { const auto kern = kernels[0]; return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) { @@ -197,9 +219,21 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, const auto& tensors = data_ctx.tensors; float elapsed = 0; + auto in_strides = conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.xDesc.GetStrides(), G_stride_idx)); + // For weights, we split K to (G, K_per_group), which is always index 0 + auto wei_strides = conv_internal::MakeStrideArray<6>( + conv_internal::SplitWeiStrideKtoGK(k_per_group, tensors.dwDesc.GetStrides())); + auto out_strides = + conv_internal::MakeStrideArray<6>(conv_internal::SplitStrideCtoGC( + group, tensors.dyDesc.GetStrides(), G_stride_idx)); + handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, di, hi, wi, @@ -232,6 +266,7 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, } }; }; + } result.construction_params.push_back(kernel); return result; } diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index c3b26a80a9..aa3dda788d 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -95,7 +95,8 @@ struct gpu_reference_kernel_base static std::vector get_image_size() { return {9, 14}; } - static std::vector get_channel_size() { return {3, 8}; } + // Warning: Channel size must be multiple of group size + static std::vector get_channel_size() { return {4, 8}; } static std::vector 
get_filter_depth() { return {1, 3}; } diff --git a/test/gtest/conv3d_test_case.hpp b/test/gtest/conv3d_test_case.hpp new file mode 100644 index 0000000000..242615077f --- /dev/null +++ b/test/gtest/conv3d_test_case.hpp @@ -0,0 +1,112 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include + +#include "get_handle.hpp" +#include + +#include "../driver/tensor_driver.hpp" +#include "conv_common.hpp" + +template +miopenDataType_t GetDataType(); + +template <> +miopenDataType_t GetDataType() +{ + return miopenFloat; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenHalf; +} + +template <> +miopenDataType_t GetDataType() +{ + return miopenInt8; +} + +struct Conv3DTestCase +{ + size_t G; + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + size_t k; + size_t z; + size_t y; + size_t x; + size_t pad_x; + size_t pad_y; + size_t pad_z; + size_t stride_x; + size_t stride_y; + size_t stride_z; + size_t dilation_x; + size_t dilation_y; + size_t dilation_z; + miopenConvolutionMode_t conv_mode; + friend std::ostream& operator<<(std::ostream& os, const Conv3DTestCase& tc) + { + return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D + << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z + << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z + << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z + << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x + << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y + << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; + } + + std::vector GetInput() { return {N, C, D, H, W}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {k, C / G, z, y, x}; + } + + miopen::ConvolutionDescriptor GetConv() + { + return miopen::ConvolutionDescriptor{ + 3, + miopenConvolution, + miopenPaddingDefault, + {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, + {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, + {static_cast(dilation_z), + static_cast(dilation_y), + static_cast(dilation_x)}, + {0, 0, 0}, + static_cast(G), + 1.0}; + } +}; diff --git a/test/gtest/group_conv3d_bwd.cpp b/test/gtest/group_conv3d_bwd.cpp index 
e53a690021..a9bffceff1 100644 --- a/test/gtest/group_conv3d_bwd.cpp +++ b/test/gtest/group_conv3d_bwd.cpp @@ -44,7 +44,7 @@ void SolverBwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_bwd.hpp b/test/gtest/group_conv3d_bwd.hpp index 410d71e6d0..71702c5808 100644 --- a/test/gtest/group_conv3d_bwd.hpp +++ b/test/gtest/group_conv3d_bwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -133,7 +53,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvBwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -188,7 +108,7 @@ struct ConvBwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_fwd.cpp b/test/gtest/group_conv3d_fwd.cpp index 2b52a1b43a..18d54355e8 100644 --- 
a/test/gtest/group_conv3d_fwd.cpp +++ b/test/gtest/group_conv3d_fwd.cpp @@ -44,7 +44,7 @@ void SolverFwd(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, Data_t output, const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { auto&& handle = get_handle(); diff --git a/test/gtest/group_conv3d_fwd.hpp b/test/gtest/group_conv3d_fwd.hpp index 983f897d78..c8767399a7 100644 --- a/test/gtest/group_conv3d_fwd.hpp +++ b/test/gtest/group_conv3d_fwd.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -139,7 +59,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvFwdSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -195,7 +115,7 @@ struct ConvFwdSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_conv3d_wrw.cpp b/test/gtest/group_conv3d_wrw.cpp index 13e88da5ad..977a06220a 100644 --- a/test/gtest/group_conv3d_wrw.cpp +++ 
b/test/gtest/group_conv3d_wrw.cpp @@ -44,7 +44,7 @@ void SolverWrw(const miopen::TensorDescriptor& inputDesc, const miopen::TensorDescriptor& outputDesc, ConstData_t output, // dy const miopen::ConvolutionDescriptor& convDesc, - const ConvTestCase& conv_config, + const Conv3DTestCase& conv_config, bool& test_skipped) { diff --git a/test/gtest/group_conv3d_wrw.hpp b/test/gtest/group_conv3d_wrw.hpp index 76d8ae5d90..bf5824b4fa 100644 --- a/test/gtest/group_conv3d_wrw.hpp +++ b/test/gtest/group_conv3d_wrw.hpp @@ -25,89 +25,9 @@ *******************************************************************************/ #pragma once -#include +#include "conv3d_test_case.hpp" -#include "get_handle.hpp" -#include - -#include "../driver/tensor_driver.hpp" -#include "conv_common.hpp" - -template -miopenDataType_t GetDataType(); - -template <> -miopenDataType_t GetDataType() -{ - return miopenFloat; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenHalf; -} - -template <> -miopenDataType_t GetDataType() -{ - return miopenInt8; -} - -struct ConvTestCase -{ - size_t G; - size_t N; - size_t C; - size_t D; - size_t H; - size_t W; - size_t k; - size_t z; - size_t y; - size_t x; - size_t pad_x; - size_t pad_y; - size_t pad_z; - size_t stride_x; - size_t stride_y; - size_t stride_z; - size_t dilation_x; - size_t dilation_y; - size_t dilation_z; - miopenConvolutionMode_t conv_mode; - friend std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) - { - return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D - << " H:" << tc.H << " W:" << tc.W << " k:" << tc.k << " z:" << tc.z - << " y:" << tc.y << " x:" << tc.x << " pad_z:" << tc.pad_z - << " pad_y:" << tc.pad_y << " pad_x:" << tc.pad_x << " stride_z:" << tc.stride_z - << " stride_y:" << tc.stride_y << " stride_x:" << tc.stride_x - << " dilation_z:" << tc.dilation_z << " dilation_y:" << tc.dilation_y - << " dilation_x:" << tc.dilation_x << " conv_mode:" << tc.conv_mode; - } - - std::vector GetInput() { return {N, C, D, H, W}; } - std::vector GetWeights() { return {k, C, z, y, x}; } - - miopen::ConvolutionDescriptor GetConv() - { - return miopen::ConvolutionDescriptor{ - 3, - miopenConvolution, - miopenPaddingDefault, - {static_cast(pad_z), static_cast(pad_y), static_cast(pad_x)}, - {static_cast(stride_z), static_cast(stride_y), static_cast(stride_x)}, - {static_cast(dilation_z), - static_cast(dilation_y), - static_cast(dilation_x)}, - {0, 0, 0}, - static_cast(G), - 1.0}; - } -}; - -std::vector ConvTestConfigs() +std::vector ConvTestConfigs() { // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z return {{1, 128, 64, 14, 28, 28, 64, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, {1, 64, 32, 28, 28, 28, 32, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, @@ -135,7 +55,7 @@ inline int SetTensorLayout(miopen::TensorDescriptor& desc) template struct ConvWrwSolverTest : public ::testing::TestWithParam< - std::tuple> + std::tuple> { protected: void SetUp() override @@ -191,7 +111,7 @@ struct ConvWrwSolverTest EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; } - ConvTestCase conv_config; + Conv3DTestCase conv_config; miopen::ConvolutionDescriptor conv_desc; tensor input; tensor weights; diff --git a/test/gtest/group_solver.hpp b/test/gtest/group_solver.hpp index 6fe02e00da..3d9ebddca3 100644 --- a/test/gtest/group_solver.hpp +++ b/test/gtest/group_solver.hpp @@ -80,7 +80,11 @@ struct ConvTestCase } std::vector GetInput() { 
return {N, C, D, H, W}; }
-    std::vector<size_t> GetWeights() { return {k, C, y, x}; }
+    std::vector<size_t> GetWeights()
+    {
+        EXPECT_EQUAL(C % G, 0);
+        return {k, C / G, y, x};
+    }
 
     miopen::ConvolutionDescriptor GetConv()
     {
         return

From 9c713bbc633a1d80f59e6475a2f21a6279c55367 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 6 Oct 2023 14:45:19 -0700
Subject: [PATCH 25/36] [Doc] Bump rocm-docs-core from 0.24.2 to 0.25.0 in
 /docs/.sphinx (#2434)

Bumps [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) from 0.24.2 to 0.25.0.
- [Release notes](https://github.com/RadeonOpenCompute/rocm-docs-core/releases)
- [Changelog](https://github.com/RadeonOpenCompute/rocm-docs-core/blob/develop/CHANGELOG.md)
- [Commits](https://github.com/RadeonOpenCompute/rocm-docs-core/compare/v0.24.2...v0.25.0)

---
updated-dependencies:
- dependency-name: rocm-docs-core
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 docs/.sphinx/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt
index 6f10fcce12..f358d19826 100644
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -92,7 +92,7 @@ requests==2.31.0
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.24.2
+rocm-docs-core==0.25.0
     # via -r requirements.in
 smmap==5.0.0
     # via gitdb

From 05970fab9f9f6df456723ad6e1d6e33489eae234 Mon Sep 17 00:00:00 2001
From: xinlipn
Date: Sat, 7 Oct 2023 21:11:02 -0700
Subject: [PATCH 26/36] Fix weight tensor initialization to replace old PR1950
 (#2436)

---
 test/gtest/solver_f8.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/gtest/solver_f8.hpp b/test/gtest/solver_f8.hpp
index 34a10a4cfe..bf77347ef8 100644
--- a/test/gtest/solver_f8.hpp
+++ b/test/gtest/solver_f8.hpp
@@ -160,7 +160,7 @@ struct ConvFwdSolverTest
         test_skipped = false;
         std::tie(algo, conv_config) = GetParam();
         input = tensor{conv_config.N, conv_config.C, conv_config.H, conv_config.W};
-        weights = tensor{conv_config.k, conv_config.C, conv_config.x, conv_config.y};
+        weights = tensor{conv_config.k, conv_config.C, conv_config.y, conv_config.x};
         auto gen_fp8_value = [=](auto...) {
             const auto tmp = float8(scalar_gen_random_float{-0.5, 0.5}());

From 6f5cb6851235f1943d6ec44768f5b185e2bf4e56 Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Mon, 9 Oct 2023 18:21:40 -0500
Subject: [PATCH 27/36] Add typecast to config key (#2413)

* add typecast value to config key, as optional arg to fdb_key
* fix clang-format issue
* Save space in db key and optimize code. Do not print casting value when
  casting is not actually necessary.
Co-authored-by: Artem Tamazov
* do not print casting to confkey when unnecessary, code cleanup, datatype rename
* move GetDataTypeName to problem_description_base.hpp, organize includes
* fix missing header

---------

Co-authored-by: Jun Liu
Co-authored-by: Artem Tamazov
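In effect, the conf key gains at most one optional segment (a sketch of the BuildConfKey hunk below; in_cast, wei_cast, out_cast and ss are stand-in names, not the real members):

    std::ostringstream optional;
    if(in_cast)  optional << "ci" << GetDataTypeName(*in_cast);  // e.g. "ciFP8"
    if(wei_cast) optional << "cw" << GetDataTypeName(*wei_cast); // e.g. "cwFP8"
    if(out_cast) optional << "co" << GetDataTypeName(*out_cast);
    if(!optional.str().empty())
        ss << 'x' << optional.str(); // keys without cast types are unchanged

so existing keys stay valid, and a key only grows when tensor casting is actually in play.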
---
 src/activ/problem_description.cpp             |  2 ++
 src/conv/problem_description.cpp              | 24 +++++++++++++--
 .../miopen/activ/problem_description.hpp      |  2 +-
 .../miopen/conv/problem_description.hpp       | 29 ++++---------------
 .../miopen/pooling/problem_description.hpp    |  1 +
 .../miopen/problem_description_base.hpp       | 20 +++++++++++++
 src/pooling/problem_description.cpp           |  2 ++
 7 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/src/activ/problem_description.cpp b/src/activ/problem_description.cpp
index b1e21a76e8..9bc484259d 100644
--- a/src/activ/problem_description.cpp
+++ b/src/activ/problem_description.cpp
@@ -68,6 +68,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
     ss << ((packed) ? "11" : "10"); // + lite bit
     ss << xDesc.GetType();
+    if(const auto ct = xDesc.GetCastType())
+        ss << GetDataTypeName(*ct);
     ss << activDesc.GetMode();
     ss << read_unit;
     ss << MAP_RD;

diff --git a/src/conv/problem_description.cpp b/src/conv/problem_description.cpp
index 76c47cbcd9..40fdd2f4c9 100644
--- a/src/conv/problem_description.cpp
+++ b/src/conv/problem_description.cpp
@@ -127,6 +127,19 @@ void ProblemDescription::BuildConfKey(std::string& conf_key) const
         ss << 'x' << GetOutLayout();
     }
     ss << 'x' << EncodeDataTypesForKey(GetInDataType(), GetWeightsDataType(), GetOutDataType());
+
+    std::ostringstream optional;
+    if(const auto ct = GetInCastType())
+        optional << "ci" << GetDataTypeName(*ct);
+    if(const auto ct = GetWeightsCastType())
+        optional << "cw" << GetDataTypeName(*ct);
+    if(const auto ct = GetOutCastType())
+        optional << "co" << GetDataTypeName(*ct);
+    if(!optional.str().empty())
+    {
+        ss << 'x' << optional.str();
+    }
+
     ss << 'x' << PrintDHW('x', GetSpatialDims(), GetPadD(), GetPadH(), GetPadW());
     ss << 'x'
        << PrintDHW(
@@ -175,11 +188,18 @@ void ProblemDescription::Serialize(std::ostream& stream) const
     {
         // Group count > 1 identifies Group/Depthwise modes.
         if(GetGroupCount() != 1)
-            optional << 'g' << GetGroupCount();
+            optional << "_g" << GetGroupCount();
+
+        if(const auto ct = GetInCastType())
+            optional << "_ci" << GetDataTypeName(*ct);
+        if(const auto ct = GetWeightsCastType())
+            optional << "_cw" << GetDataTypeName(*ct);
+        if(const auto ct = GetOutCastType())
+            optional << "_co" << GetDataTypeName(*ct);
     }
     if(!optional.str().empty())
     {
-        stream << '_' << optional.str();
+        stream << optional.str();
     }
 }

diff --git a/src/include/miopen/activ/problem_description.hpp b/src/include/miopen/activ/problem_description.hpp
index a22a1ba0d8..d5c09850c3 100644
--- a/src/include/miopen/activ/problem_description.hpp
+++ b/src/include/miopen/activ/problem_description.hpp
@@ -27,8 +27,8 @@
 #pragma once
 
 #include
-#include
 #include
+#include
 
 #include

diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index 438e1d306a..d6b735291e 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -26,16 +26,17 @@
 #pragma once
 
+#include
 #include
-#include
 #include
+
+#include
+#include
+#include
+
 #if MIOPEN_ENABLE_SQLITE
 #include
 #endif
-#include
-#include
-
-#include
 
 namespace miopen {
 
@@ -44,24 +45,6 @@ struct ExecutionContext;
 std::string EncodeDataTypesForKey(miopenDataType_t in, miopenDataType_t weights, miopenDataType_t out);
 
-inline std::string GetDataTypeName(miopenDataType_t data_type)
-{
-    switch(data_type)
-    {
-    case miopenFloat: return "FP32";
-    case miopenHalf: return "FP16";
-    case miopenInt8: return "INT8";
-    case miopenInt8x4: return "INT8x4";
-    case miopenInt32: return "INT32";
-    case miopenBFloat16: return "BF16";
-    case miopenDouble: return "FP64";
-    case miopenFloat8: return "FP8";
-    case miopenBFloat8: return "BFP8";
-    }
-
-    return "Unknown(" + std::to_string(data_type) + ")";
-}
-
 template <class T>
 constexpr auto GetDHW(unsigned spatial_dims, const std::vector<T>& data)
 {

diff --git a/src/include/miopen/pooling/problem_description.hpp b/src/include/miopen/pooling/problem_description.hpp
index 962c73c7dc..dc3e1c25b6 100644
--- a/src/include/miopen/pooling/problem_description.hpp
+++ b/src/include/miopen/pooling/problem_description.hpp
@@ -26,6 +26,7 @@
 #pragma once
 
+#include
 #include
 #include

diff --git a/src/include/miopen/problem_description_base.hpp b/src/include/miopen/problem_description_base.hpp
index 3ca8a178b1..a3d51f120e 100644
--- a/src/include/miopen/problem_description_base.hpp
+++ b/src/include/miopen/problem_description_base.hpp
@@ -26,8 +26,28 @@
 #pragma once
 
+#include
+
 namespace miopen {
 
+inline std::string GetDataTypeName(miopenDataType_t data_type)
+{
+    switch(data_type)
+    {
+    case miopenFloat: return "FP32";
+    case miopenHalf: return "FP16";
+    case miopenInt8: return "INT8";
+    case miopenInt8x4: return "INT8x4";
+    case miopenInt32: return "INT32";
+    case miopenBFloat16: return "BF16";
+    case miopenDouble: return "FP64";
+    case miopenFloat8: return "FP8";
+    case miopenBFloat8: return "BF8";
+    }
+
+    return "Unknown(" + std::to_string(data_type) + ")";
+}
+
 struct ProblemDescriptionBase
 {
 };

diff --git a/src/pooling/problem_description.cpp b/src/pooling/problem_description.cpp
index 0ee5469684..8e171a4ac0 100644
--- a/src/pooling/problem_description.cpp
+++ b/src/pooling/problem_description.cpp
@@ -61,6 +61,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const
     ss << "m" + std::to_string(pooling_method);
     ss << "_dt" << xDesc.GetType();
+    if(const auto ct = xDesc.GetCastType())
+        ss << "_dct" << GetDataTypeName(*ct);
     ss << "_ker" << get_vect_config(pooling.lens);
     ss << "_str" << get_vect_config(pooling.strides);
     ss << "_pad" << get_vect_config(pooling.pads);

From 8c4239d4b06361729ba9359d2095cc639841ee9c Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Mon, 9 Oct 2023 18:22:16 -0500
Subject: [PATCH 28/36] [Bugfix] Add cast swapping for swapped gemm inputs. (#2443)

* add swapping for cast types when swapping A+B for gemm
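The invariant being restored is easy to state in isolation: when a row-major GEMM is re-expressed as a column-major GEMM by exchanging A and B, every per-matrix attribute must travel with its matrix. A minimal sketch (the Desc struct and SwapAB helper here are hypothetical stand-ins for the GemmDescriptor logic in the hunks below):

    #include <utility>

    struct Desc // hypothetical, for illustration only
    {
        bool isColMajor, transA, transB;
        int m, n, lda, ldb;
        int a_cast_type, b_cast_type; // per-matrix cast attributes
    };

    void SwapAB(Desc& d, const void*& A, const void*& B)
    {
        d.isColMajor = !d.isColMajor;
        std::swap(A, B);
        std::swap(d.transA, d.transB);
        std::swap(d.m, d.n);
        std::swap(d.lda, d.ldb);
        std::swap(d.a_cast_type, d.b_cast_type); // previously missing; the fix below
    }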
---
 src/gemm_v2.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index 19e302f166..1750f625da 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -413,6 +413,7 @@ miopenStatus_t CallGemm(const Handle& handle,
         gemm_desc.isColMajor = !gemm_desc.isColMajor;
         std::swap(A, B);
         std::swap(a_offset, b_offset);
+        std::swap(gemm_desc.a_cast_type, gemm_desc.b_cast_type);
         std::swap(gemm_desc.transA, gemm_desc.transB);
         std::swap(gemm_desc.m, gemm_desc.n);
         std::swap(gemm_desc.lda, gemm_desc.ldb);
@@ -665,6 +666,7 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
         gemm_desc.isColMajor = !gemm_desc.isColMajor;
         std::swap(A, B);
         std::swap(a_offset, b_offset);
+        std::swap(gemm_desc.a_cast_type, gemm_desc.b_cast_type);
         std::swap(gemm_desc.transA, gemm_desc.transB);
         std::swap(gemm_desc.m, gemm_desc.n);
         std::swap(gemm_desc.lda, gemm_desc.ldb);
@@ -938,6 +940,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
         gemm_desc.isColMajor = !gemm_desc.isColMajor;
         std::swap(A, B);
         std::swap(a_offset, b_offset);
+        std::swap(gemm_desc.a_cast_type, gemm_desc.b_cast_type);
         std::swap(gemm_desc.transA, gemm_desc.transB);
         std::swap(gemm_desc.m, gemm_desc.n);
         std::swap(gemm_desc.lda, gemm_desc.ldb);

From 1d6db535f44c33624417fef65ea3718d25ae4c48 Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Wed, 11 Oct 2023 01:10:44 -0500
Subject: [PATCH 29/36] [Bugfix] Kernel name fix, compilation err fix (#2446)

---
 src/kernels/gpu_reference_kernel/fp8_kern_types.h   | 6 +++---
 src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/kernels/gpu_reference_kernel/fp8_kern_types.h b/src/kernels/gpu_reference_kernel/fp8_kern_types.h
index b14302e0c2..d714a0514c 100644
--- a/src/kernels/gpu_reference_kernel/fp8_kern_types.h
+++ b/src/kernels/gpu_reference_kernel/fp8_kern_types.h
@@ -58,6 +58,6 @@
 
 #define KERNEL_NAME_SUFFIX CAT(CAT(INPUT_TYPE, _), CAT(CAT(WEIGHTS_TYPE, _), OUTPUT_TYPE))
 
-#define FWD_KERNEL_NAME CAT(naive_conv_packed_fwd_nchw_, KERNEL_NAME_SUFFIX)
-#define BWD_KERNEL_NAME CAT(naive_conv_packed_bwd_nchw_, KERNEL_NAME_SUFFIX)
-#define WRW_KERNEL_NAME CAT(naive_conv_packed_wrw_nchw_, KERNEL_NAME_SUFFIX)
+#define FWD_KERNEL_NAME CAT(naive_conv_nonpacked_fwd_nchw_, KERNEL_NAME_SUFFIX)
+#define BWD_KERNEL_NAME CAT(naive_conv_nonpacked_bwd_nchw_, KERNEL_NAME_SUFFIX)
+#define WRW_KERNEL_NAME CAT(naive_conv_nonpacked_wrw_nchw_, KERNEL_NAME_SUFFIX)

diff --git a/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp
index e6b2945beb..3b4eabecfb 100644
--- a/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp
+++ b/src/kernels/gpu_reference_kernel/fp8_naive_conv.cpp
@@ -26,7 +26,7 @@
 #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include
 #include
-#include
+// #include
 #include
 #endif

From b20e20f72a15b6f16efbd4699726fbed212888e6 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 10 Oct 2023 23:13:05 -0700
Subject: [PATCH 30/36] Bump gitpython from 3.1.35 to 3.1.37 in
/docs/.sphinx (#2445) Bumps [gitpython](https://github.com/gitpython-developers/GitPython) from 3.1.35 to 3.1.37. - [Release notes](https://github.com/gitpython-developers/GitPython/releases) - [Changelog](https://github.com/gitpython-developers/GitPython/blob/main/CHANGES) - [Commits](https://github.com/gitpython-developers/GitPython/compare/3.1.35...3.1.37) --- updated-dependencies: - dependency-name: gitpython dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- docs/.sphinx/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt index f358d19826..eb52b0503e 100644 --- a/docs/.sphinx/requirements.txt +++ b/docs/.sphinx/requirements.txt @@ -40,7 +40,7 @@ fastjsonschema==2.16.3 # via rocm-docs-core gitdb==4.0.10 # via gitpython -gitpython==3.1.35 +gitpython==3.1.37 # via rocm-docs-core idna==3.4 # via requests From b438fd95f0f978a6b9aca551dfc8cf974b19d3b4 Mon Sep 17 00:00:00 2001 From: Chris Erb Date: Wed, 11 Oct 2023 01:20:17 -0500 Subject: [PATCH 31/36] Add MIOPEN_BETA_API defines around f8 (#2430) --------- Co-authored-by: JD --- CMakeLists.txt | 3 +++ driver/layernorm_driver.hpp | 6 +++--- fin | 2 +- include/miopen/miopen.h | 17 +++++++++++++++-- src/include/miopen/layernorm.hpp | 4 ++-- src/layer_norm.cpp | 4 ++-- src/solver/batchnorm/backward_ck.cpp | 6 +++--- 7 files changed, 29 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ac8484172b..aebd984989 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,9 @@ include(ROCMCreatePackage) include(CheckCXXCompilerFlag) include(ROCMHeaderWrapper) +# Build library with Beta APIs +add_definitions("-DMIOPEN_BETA_API=1") + set(MIOPEN_ENABLE_AI_IMMED_MODE_FALLBACK On CACHE BOOL "Enable AI-based fallback for Immediate Mode") set(MIOPEN_ENABLE_AI_KERNEL_TUNING On CACHE BOOL "Enable AI heuristic for kernel tuning") set(MIOPEN_ENABLE_SQLITE On CACHE BOOL "") diff --git a/driver/layernorm_driver.hpp b/driver/layernorm_driver.hpp index 8251472625..e43f3f2f37 100644 --- a/driver/layernorm_driver.hpp +++ b/driver/layernorm_driver.hpp @@ -255,19 +255,19 @@ int LayerNormDriver::AllocateBuffersAndCopy() for(int i = 0; i < in_sz; i++) { - in[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); } status = in_dev->ToGPU(q, in.data()); for(int i = 0; i < weight_sz; i++) { - weight[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + weight[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); } status = weight_dev->ToGPU(q, weight.data()); for(int i = 0; i < bias_sz; i++) { - bias[i] = RAN_GEN(static_cast(0.0), static_cast(1.0)); + bias[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); } status = bias_dev->ToGPU(q, bias.data()); diff --git a/fin b/fin index b2f3f4db3c..26b5c32864 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit b2f3f4db3c3d7dd757e6d9e68719a780d8114dfa +Subproject commit 26b5c328642a6af5041539ceae36b9340829384b diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a59181acf3..abbec2599c 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -112,11 +112,13 @@ typedef enum miopenStatusVersionMismatch = 10, /*!< Version mismatch of the supplied binary data argment. 
*/ } miopenStatus_t; +#ifdef MIOPEN_BETA_API typedef enum { miopenF8RoundingModeStandard = 0, miopenF8RoundingModeStochastic = 1, } miopenF8RoundingMode_t; +#endif /*! @brief Get character string for an error code. * @@ -354,9 +356,14 @@ typedef enum 4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */ miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction) (Partially supported) */ - miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ + miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ +#ifdef MIOPEN_BETA_API miopenFloat8 = 7, - miopenBFloat8 = 8 + miopenBFloat8 = 8, +#else +// miopenReserved1 = 7, +// miopenReserved2 = 8, +#endif } miopenDataType_t; /*! @ingroup tensor @@ -601,11 +608,15 @@ typedef enum MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC = 1, /*!< Restrict MIOpen convolutions to kernels which produce numerically deterministic results. 0 - disabled (default), 1 - enabled >*/ +#ifdef MIOPEN_BETA_API MIOPEN_CONVOLUTION_ATTRIB_FP8_ROUNDING_MODE = 2, /*!*/ +#else +// miopenReserved1 = 2, +#endif } miopenConvolutionAttrib_t; /** @addtogroup tensor @@ -723,6 +734,7 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t const int* dimsA, const int* stridesA); +#ifdef MIOPEN_BETA_API /*! @brief Set the tensor cast type * * For tensors where the cast_type attribute is set, the tensor elements would be converted to the @@ -734,6 +746,7 @@ MIOPEN_EXPORT miopenStatus_t miopenSetTensorDescriptor(miopenTensorDescriptor_t */ MIOPEN_EXPORT miopenStatus_t miopenSetTensorCastType(miopenTensorDescriptor_t tensorDesc, miopenDataType_t cast_type); +#endif /*! @brief Set shape of N-dimensional tensor * diff --git a/src/include/miopen/layernorm.hpp b/src/include/miopen/layernorm.hpp index 8ec2d96055..f897e79eea 100644 --- a/src/include/miopen/layernorm.hpp +++ b/src/include/miopen/layernorm.hpp @@ -49,8 +49,8 @@ miopenStatus_t LayerNormForward(const Handle& handle, const TensorDescriptor& rstdDesc, Data_t rstd, miopenLayerNormMode_t mode, - const float epsilon, - const int32_t normalized_dim); + float epsilon, + int32_t normalized_dim); } // namespace miopen #endif // _MIOPEN_LAYERNORM_HPP_ diff --git a/src/layer_norm.cpp b/src/layer_norm.cpp index 3d52bc771f..33030887ee 100644 --- a/src/layer_norm.cpp +++ b/src/layer_norm.cpp @@ -48,8 +48,8 @@ miopenStatus_t LayerNormForward(const Handle& handle, const TensorDescriptor& rstdDesc, Data_t rstd, miopenLayerNormMode_t mode, - const float epsilon, - const int32_t normalized_dim) + float epsilon, + int32_t normalized_dim) { if(x == nullptr || y == nullptr) { diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index fba8724990..6fecc24049 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -156,11 +156,11 @@ static bool CheckCKApplicability(const miopen::batchnorm::ProblemDescription& pr #endif -bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, +bool BnCKBwdBackward::IsApplicable(const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& bn_problem) const { #if !MIOPEN_BACKEND_HIP || !MIOPEN_USE_COMPOSABLEKERNEL - std::ignore = ctx; + std::ignore = context; std::ignore = fdesc_problem; return false; #else @@ -168,7 +168,7 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx, return false; if(!bn_problem.IsLayoutNHWC()) return false; - if(!ck_utility::is_ck_supported_hardware(ctx.GetStream())) + 
if(!ck_utility::is_ck_supported_hardware(context.GetStream())) return false; if(bn_problem.GetXDesc().GetType() != bn_problem.GetScaleBiasDiffDesc().GetType()) return false; From b45e54d6c6737376d2820cce093dda41244b9380 Mon Sep 17 00:00:00 2001 From: Artem Tamazov Date: Wed, 11 Oct 2023 09:56:45 +0300 Subject: [PATCH 32/36] Remove INT8x4 support (#2441) --- docs/datatypes.md | 3 +- include/miopen/miopen.h | 11 ++-- src/check_numerics.cpp | 2 +- src/convolution.cpp | 2 +- src/gemm_v2.cpp | 52 ++++++------------- src/hip/batched_transpose_sol.cpp | 6 ++- src/include/miopen/datatype.hpp | 12 ++--- src/include/miopen/tensor.hpp | 4 +- src/include/miopen/visit_float.hpp | 2 +- src/kernels/MIOpenIm2d2Col.cl | 6 --- src/kernels/MIOpenIm3d2Col.cl | 6 --- .../MIOpenSubTensorOpWithScalarKernel.cl | 6 +-- .../MIOpenSubTensorOpWithSubTensorKernel.cl | 6 +-- .../MIOpenSubTensorOpWithTransformKernel.cl | 6 +-- src/kernels/MIOpenUtilKernels4.cl | 6 --- src/ocl/convolutionocl.cpp | 8 +-- src/ocl/tensorocl.cpp | 42 ++++++--------- src/ocl/utilocl.cpp | 33 +++--------- src/pooling_api.cpp | 2 +- src/reducetensor.cpp | 7 ++- src/reducetensor_api.cpp | 2 +- src/solver/batchnorm/forward_inference_ck.cpp | 4 +- .../conv_ck_igemm_fwd_bias_activ_fused.cpp | 8 +-- src/solver/conv_direct_naive_conv.cpp | 2 +- ...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 8 +-- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 8 +-- ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 8 +-- ...conv_hip_implicit_gemm_bwd_data_xdlops.cpp | 8 +-- .../conv_hip_implicit_gemm_fwd_xdlops.cpp | 8 +-- ...v_hip_implicit_gemm_grouped_fwd_xdlops.cpp | 8 +-- src/solver/gemm.cpp | 50 ++++++------------ src/solver/mlir_common.cpp | 2 +- src/tensor.cpp | 2 +- test/conv_common.hpp | 33 +++++------- test/driver.hpp | 4 +- test/gpu_reference_kernel.cpp | 2 - test/gtest/conv_embed_db.cpp | 4 +- test/gtest/conv_hip_igemm_xdlops.cpp | 4 +- test/gtest/conv_igemm_dynamic.cpp | 4 +- test/gtest/conv_igemm_mlir.cpp | 4 +- test/gtest/conv_igemm_mlir_xdlops.cpp | 4 +- test/gtest/conv_trans.cpp | 4 +- test/gtest/db_sync.cpp | 2 +- test/tensor_holder.hpp | 7 +-- test/tensor_transform.cpp | 2 +- 45 files changed, 151 insertions(+), 263 deletions(-) diff --git a/docs/datatypes.md b/docs/datatypes.md index 1a46e5fd62..f5d8f28931 100644 --- a/docs/datatypes.md +++ b/docs/datatypes.md @@ -10,7 +10,7 @@ typedef enum { miopenFloat = 1, miopenInt32 = 2, miopenInt8 = 3, - miopenInt8x4 = 4, + /* Value 4 is reserved. */ miopenBFloat16 = 5, } miopenDataType_t; ``` @@ -22,7 +22,6 @@ Type descriptions: * `miopenFloat` - 32-bit floating point * `miopenInt32` - 32-bit integer, used primarily for `int8` convolution outputs * `miopenInt8` - 8-bit integer, currently only supported by `int8` convolution forward path, tensor set, tensor copy, tensor cast, tensor transform, tensor transpose, and im2col. - * `miopenInt8x4` - 8-bit 4 element vector type used primarily with `int8` convolutions forward path. * `miopenBFloat16` - brain float fp-16 (8-bit exponent, 7-bit fraction), currently only supported by convolutions, tensor set, and tensor copy. 
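The datatypes.md hunk above summarizes the user-visible effect of this patch: `miopenInt8` is now the only 8-bit integer descriptor type, and enum value 4 is reserved. As a minimal migration sketch (illustrative only, not part of the patch itself; it assumes the long-standing `miopenCreateTensorDescriptor` / `miopenSet4dTensorDescriptor` / `miopenDestroyTensorDescriptor` entry points of miopen.h, and the NCHW shape is made up):

```c
#include <miopen/miopen.h>

/* Sketch: code that previously described vectorized int8 data with
 * miopenInt8x4 now describes plain int8 data with miopenInt8.
 * Status checks are omitted for brevity. */
void describe_int8_input(void)
{
    miopenTensorDescriptor_t xDesc;
    miopenCreateTensorDescriptor(&xDesc);
    /* N=16, C=64, H=28, W=28 -- illustrative shape */
    miopenSet4dTensorDescriptor(xDesc, miopenInt8, 16, 64, 28, 28);
    /* ... use xDesc on the int8 convolution forward path ... */
    miopenDestroyTensorDescriptor(xDesc);
}
```

Note that the int8 GEMM path keeps its `assert(gemm_desc.k % 4 == 0)` check (see the gemm_v2.cpp hunks below), so that alignment requirement survives even though the vectorized type is gone.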
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index abbec2599c..0c55b3becd 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -348,12 +348,11 @@ MIOPEN_DECLARE_OBJECT(miopenReduceTensorDescriptor); */ typedef enum { - miopenHalf = 0, /*!< 16-bit floating point (Fully supported) */ - miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */ - miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */ - miopenInt8 = 3, /*!< 8-bit int point (Partially supported) */ - miopenInt8x4 = - 4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */ + miopenHalf = 0, /*!< 16-bit floating point (Fully supported) */ + miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */ + miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */ + miopenInt8 = 3, /*!< 8-bit int point (Partially supported) */ + miopenInt8x4 = 4, /*!< Pack of four Int8 in NCHW_VECT_C format (Support discontinued) */ miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction) (Partially supported) */ miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */ diff --git a/src/check_numerics.cpp b/src/check_numerics.cpp index b0882995e9..50cb0af4af 100644 --- a/src/check_numerics.cpp +++ b/src/check_numerics.cpp @@ -64,7 +64,7 @@ std::string GetKernelName(miopenDataType_t data_type) case miopenBFloat8: return {"check_numerics_bf8"}; case miopenInt32: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenDouble: default: return {""}; } diff --git a/src/convolution.cpp b/src/convolution.cpp index 403ff777cd..ac7c28fdc4 100644 --- a/src/convolution.cpp +++ b/src/convolution.cpp @@ -358,7 +358,7 @@ ConvolutionDescriptor::GetForwardOutputTensorWithLayout(const TensorDescriptor& std::vector out_strides; tensor_layout_to_strides( out_lens, default_layout, yLayout, xDesc.GetVectorLength(), out_strides); - return {(xDesc.GetType() == miopenInt8 || xDesc.GetType() == miopenInt8x4 + return {(xDesc.GetType() == miopenInt8 ? (yType) : xDesc.GetType()), // TODO: This function overrides the output type with // essentially the input which is incorrect. diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index 1750f625da..fad06870ce 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -63,10 +63,6 @@ /// "disabled expansion of recursive macro" injected by rocblas headers. 
#define AVOID_ROCBLAS_WRAPPERS_204 (MIOPEN_ROCBLAS_VERSION_FLAT >= 2004000) -/// Maintain API compatibility with various rocBLAS version -#define USE_GEMM_FLAGS_PACK_INT8X4 \ - ((MIOPEN_ROCBLAS_VERSION_FLAT >= 2038000) && (MIOPEN_ROCBLAS_VERSION_FLAT < 4000000)) - /// Maintain API compatibility for versions not supporting FP16 alternate implementations #define USE_GEMM_FLAGS_FP16_ALT_IMPL (MIOPEN_ROCBLAS_VERSION_FLAT >= 2043000) /// Some 2.42 versions have rocblas_gemm_flags_fp16_alt_impl, but @@ -110,7 +106,7 @@ static inline rocblas_datatype rocBlasComputeType(const miopen::GemmDescriptor& { // Complex compute types are only supported in newer version of the API assert(desc.dataType == desc.a_cast_type && desc.dataType == desc.b_cast_type); - if(desc.dataType == miopenInt8 || desc.dataType == miopenInt8x4) + if(desc.dataType == miopenInt8) return rocblas_datatype::rocblas_datatype_i32_r; else return rocblas_datatype::rocblas_datatype_f32_r; @@ -441,7 +437,6 @@ miopenStatus_t CallGemm(const Handle& handle, switch(gemm_desc.dataType) { - case miopenInt8x4: case miopenInt8: { assert(gemm_desc.k % 4 == 0); @@ -473,12 +468,7 @@ miopenStatus_t CallGemm(const Handle& handle, rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, -#if USE_GEMM_FLAGS_PACK_INT8X4 - rocblas_gemm_flags_pack_int8x4 -#else - 0 -#endif - ); + 0); } break; case miopenInt32: break; @@ -622,9 +612,9 @@ miopenStatus_t CallGemm(const Handle& handle, }; break; + case miopenInt8x4: case miopenDouble: { - MIOPEN_THROW(miopenStatusBadParm, - "miopenDouble data type not supported by MIOpenGEMM."); + MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type."); }; break; } @@ -695,7 +685,6 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, switch(gemm_desc.dataType) { - case miopenInt8x4: case miopenInt8: { assert(gemm_desc.k % 4 == 0); @@ -731,12 +720,7 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, -#if USE_GEMM_FLAGS_PACK_INT8X4 - rocblas_gemm_flags_pack_int8x4 -#else - 0 -#endif - ); + 0); } break; case miopenInt32: break; @@ -895,10 +879,10 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle, break; } + case miopenInt8x4: case miopenDouble: { - MIOPEN_THROW(miopenStatusBadParm, - "miopenDouble data type not supported by MIOpenGEMM."); - } + MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type."); + }; break; } @@ -971,7 +955,6 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, switch(gemm_desc.dataType) { - case miopenInt8x4: case miopenInt8: { assert(gemm_desc.k % 4 == 0); @@ -1005,12 +988,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r, rocblas_gemm_algo::rocblas_gemm_algo_standard, 0, -#if USE_GEMM_FLAGS_PACK_INT8X4 - rocblas_gemm_flags_pack_int8x4 -#else - 0 -#endif - ); + 0); } } break; @@ -1166,10 +1144,10 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle, break; } + case miopenInt8x4: case miopenDouble: { - MIOPEN_THROW(miopenStatusBadParm, - "miopenDouble data type not supported by MIOpenGEMM."); - } + MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type."); + }; break; } @@ -1199,7 +1177,7 @@ GemmDescriptor CreateGemmDescriptorConvFwd(const TensorDescriptor& wDesc, { #ifndef NDEBUG assert(wDesc.GetType() == xDesc.GetType()); - 
if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4) + if(wDesc.GetType() != miopenInt8) assert(wDesc.GetType() == yDesc.GetType()); #endif @@ -1354,7 +1332,7 @@ GemmDescriptor CreateGemmDescriptorConvCNHWFwd(const TensorDescriptor& wDesc, { #ifndef NDEBUG assert(wDesc.GetType() == xDesc.GetType()); - if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4) + if(wDesc.GetType() != miopenInt8) assert(wDesc.GetType() == yDesc.GetType()); #endif @@ -1458,7 +1436,7 @@ GemmDescriptor CreateGemmStridedBatchedDescriptorConv1x1Fwd(const TensorDescript { #ifndef NDEBUG assert(wDesc.GetType() == xDesc.GetType()); - if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4) + if(wDesc.GetType() != miopenInt8) assert(wDesc.GetType() == yDesc.GetType()); #else (void)yDesc; diff --git a/src/hip/batched_transpose_sol.cpp b/src/hip/batched_transpose_sol.cpp index 4ffcf34b77..01349775ca 100644 --- a/src/hip/batched_transpose_sol.cpp +++ b/src/hip/batched_transpose_sol.cpp @@ -322,7 +322,11 @@ BatchedTransposeSolution::BatchedTransposeSolution(const ExecutionContext& ctx, uint32_t width_) : data_type(data_type_), batch(batch_), height(height_), width(width_) { - if(data_type == miopenInt8x4 || data_type == miopenDouble) + if(!(data_type == miopenHalf // + || data_type == miopenFloat // + || data_type == miopenInt32 // + || data_type == miopenInt8 // + || data_type == miopenBFloat16)) MIOPEN_THROW("These data types are not supported"); num_cu = ctx.GetStream().GetMaxComputeUnits(); std::size_t data_size = miopen::GetTypeSize(data_type); diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp index 485bdb3d67..29057e3de9 100644 --- a/src/include/miopen/datatype.hpp +++ b/src/include/miopen/datatype.hpp @@ -53,7 +53,10 @@ inline std::string GetDataType(miopenDataType_t type) type_str = "bfloat16"; } break; - case miopenInt8x4: + case miopenInt8x4: { + type_str = "UNSUPPORTED_TYPE"; + } + break; case miopenInt8: { type_str = "int8_t"; } break; @@ -137,7 +140,6 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type) int use_fp16x8 = 0; int use_fp32 = 0; int use_int8 = 0; - int use_int8x4 = 0; int use_int32 = 0; int use_bfp16 = 0; int use_fp64 = 0; @@ -150,15 +152,14 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type) case miopenHalf: use_fp16 = 1; break; case miopenFloat: use_fp32 = 1; break; case miopenInt8: use_int8 = 1; break; - case miopenInt8x4: use_int8x4 = 1; break; case miopenBFloat16: use_bfp16 = 1; break; case miopenInt32: use_int32 = 1; break; case miopenDouble: use_fp64 = 1; break; case miopenFloat8: use_fp8 = 1; break; case miopenBFloat8: use_bfp8 = 1; break; + case miopenInt8x4: // fallthrough default: - MIOPEN_THROW( - "Only float, half, bfloat16, int8, int8x4, float8, bfloat8 data type is supported."); + MIOPEN_THROW("Only float, half, bfloat16, int8, float8, bfloat8 data types are supported."); break; } @@ -168,7 +169,6 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type) {"MIOPEN_USE_FP16x8", use_fp16x8}, {"MIOPEN_USE_FP32", use_fp32}, {"MIOPEN_USE_INT8", use_int8}, - {"MIOPEN_USE_INT8x4", use_int8x4}, {"MIOPEN_USE_BFP16", use_bfp16}, {"MIOPEN_USE_INT32", use_int32}, {"MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16}, diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp index b8d72df67c..a133ab4adc 100644 --- a/src/include/miopen/tensor.hpp +++ b/src/include/miopen/tensor.hpp @@ -101,13 +101,13 @@ inline std::size_t GetTypeSize(miopenDataType_t d) case miopenFloat: return 4;
case miopenHalf: case miopenBFloat16: return 2; - case miopenInt8x4: + case miopenInt8x4: break; case miopenInt8: case miopenFloat8: case miopenBFloat8: return 1; case miopenDouble: return 8; } - MIOPEN_THROW("Unknown data type"); + MIOPEN_THROW("Unknown or unsupported data type"); } template diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp index d26afba9dd..35e1ae6ae7 100644 --- a/src/include/miopen/visit_float.hpp +++ b/src/include/miopen/visit_float.hpp @@ -79,7 +79,6 @@ void visit_float(miopenDataType_t t, F f) } case miopenFloat8: case miopenBFloat8: - case miopenInt8x4: case miopenInt8: { f(as_float{}); break; @@ -92,6 +91,7 @@ void visit_float(miopenDataType_t t, F f) f(as_float{}); break; } + case miopenInt8x4: MIOPEN_THROW("miopenInt8x4: Support discontinued."); } } diff --git a/src/kernels/MIOpenIm2d2Col.cl b/src/kernels/MIOpenIm2d2Col.cl index 7b1522db6f..852ccff955 100644 --- a/src/kernels/MIOpenIm2d2Col.cl +++ b/src/kernels/MIOpenIm2d2Col.cl @@ -40,10 +40,6 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif @@ -58,8 +54,6 @@ #if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; -#elif MIOPEN_USE_INT8x4 -typedef uint data_t; #elif MIOPEN_USE_INT32 typedef int data_t; #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) diff --git a/src/kernels/MIOpenIm3d2Col.cl b/src/kernels/MIOpenIm3d2Col.cl index 7ccb8363b7..5ee437a068 100644 --- a/src/kernels/MIOpenIm3d2Col.cl +++ b/src/kernels/MIOpenIm3d2Col.cl @@ -40,18 +40,12 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif #if MIOPEN_USE_INT8 typedef char data_t; -#elif MIOPEN_USE_INT8x4 -typedef uint data_t; #elif MIOPEN_USE_INT32 typedef int data_t; #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) diff --git a/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl b/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl index 87fd82a7fd..dc8c96fc60 100644 --- a/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl +++ b/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl @@ -46,17 +46,13 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif #include "float_types.h" -#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1 +#if MIOPEN_USE_INT8 == 1 #define _FLOAT char #endif diff --git a/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl b/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl index d0d1762a10..1de4843712 100644 --- a/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl +++ b/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl @@ -46,11 +46,7 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - -#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1 +#if MIOPEN_USE_INT8 == 1 #define _FLOAT char #ifndef FLT_MAX #define MAX_VAL 127 /* max value */ diff --git a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl index bc17f2ecf4..bd06f89626 100644 --- a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl +++ b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl @@ -43,11 +43,7 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - -#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1 +#if MIOPEN_USE_INT8 == 1 #define _FLOAT char #ifndef FLT_MAX 
#define MAX_VAL 127 /* max value */ diff --git a/src/kernels/MIOpenUtilKernels4.cl b/src/kernels/MIOpenUtilKernels4.cl index 861e563012..09e6b73cff 100644 --- a/src/kernels/MIOpenUtilKernels4.cl +++ b/src/kernels/MIOpenUtilKernels4.cl @@ -40,10 +40,6 @@ #define MIOPEN_USE_INT8 0 #endif -#ifndef MIOPEN_USE_INT8x4 -#define MIOPEN_USE_INT8x4 0 -#endif - #ifndef MIOPEN_USE_INT32 #define MIOPEN_USE_INT32 0 #endif @@ -58,8 +54,6 @@ #if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8 typedef char data_t; -#elif MIOPEN_USE_INT8x4 -typedef uint data_t; #elif MIOPEN_USE_INT32 typedef int data_t; #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 8c042e3e7b..94b083577d 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -296,8 +296,7 @@ void ValidateConvTensors(const ConvTensors& tensors) tensors.xDesc.GetSize() != tensors.wDesc.GetSize(); const auto trivial_tensor_types_not_matched = - tensors.xDesc.GetType() != tensors.yDesc.GetType() && - tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4; + tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8; // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { // MIOPEN_THROW(miopenStatusBadParm); @@ -426,11 +425,6 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, ValidateConvTensors(tensors); ValidateAlphaBeta(alpha, beta); - if(algo != miopenConvolutionFwdAlgoGEMM && xDesc.GetType() == miopenInt8x4) - { - MIOPEN_THROW(miopenStatusBadParm); - } - ConvForwardCheckNumerics(handle, tensors, [&]() { ValidateGroupCount(xDesc, wDesc, *this); diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp index 9c7bff6992..1e1f8b1df9 100644 --- a/src/ocl/tensorocl.cpp +++ b/src/ocl/tensorocl.cpp @@ -1597,10 +1597,13 @@ void ScaleTensor(const Handle& handle, assert(yDim_flat > 0 && yDim_flat <= 5); const miopenDataType_t dataType = yDesc_flat.GetType(); - if(dataType == miopenInt8 || dataType == miopenInt8x4 || dataType == miopenBFloat16) + + if(!(dataType == miopenHalf // + || dataType == miopenFloat // + || dataType == miopenInt32 // + || dataType == miopenDouble)) { - MIOPEN_THROW(miopenStatusBadParm, - "Tensor scale operation is not supported for int8, int8x4, and bfloat16."); + MIOPEN_THROW(miopenStatusBadParm, "ScaleTensor: unsupported data type."); } std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d"; @@ -1941,8 +1944,7 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio case miopenDouble: // TODO MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor."); - case miopenInt8x4: - MIOPEN_THROW(miopenStatusBadParm, "miopenInt8x4 data type not supported in cast tensor."); + case miopenInt8x4: // fallthrough default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc."); } } @@ -2237,24 +2239,6 @@ void TransformTensor(const Handle& handle, } } } - else if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8x4 && x_len.size() >= 3) - { - if(x_len[1] <= (y_len[1] - 4) || y_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid y channel size"); - } - - transpose_NCHW2Vec(handle, x_len, x, y, 4, false, true, alpha, beta); - } - else if(xDesc.GetType() == miopenInt8x4 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3) - { - if(y_len[1] <= (x_len[1] - 4) || x_len[1] % 4 != 0) - { - MIOPEN_THROW("Invalid x channel size"); - } - - transpose_NCHW2Vec(handle, y_len, x, y, 4, 
false, false, alpha, beta); - } else { auto x_y_len = boost::combine(x_len, y_len); @@ -2294,12 +2278,20 @@ void TransformTensor(const Handle& handle, const miopenDataType_t dataTypex = xDesc_flat.GetType(); const miopenDataType_t dataTypey = yDesc_flat.GetType(); - if(dataTypex == miopenInt8 || dataTypex == miopenInt8x4) + if(!(dataTypex == miopenHalf // + || dataTypex == miopenFloat // + || dataTypex == miopenInt32 // + || dataTypex == miopenBFloat16 // + || dataTypex == miopenDouble)) { MIOPEN_THROW("Tensor x is an unsupported data type"); } - if(dataTypey == miopenInt8 || dataTypey == miopenInt8x4) + if(!(dataTypey == miopenHalf // + || dataTypey == miopenFloat // + || dataTypey == miopenInt32 // + || dataTypey == miopenBFloat16 // + || dataTypey == miopenDouble)) { MIOPEN_THROW("Tensor y is an unsupported data type"); } diff --git a/src/ocl/utilocl.cpp b/src/ocl/utilocl.cpp index 0a88efb2f1..d536e819e6 100644 --- a/src/ocl/utilocl.cpp +++ b/src/ocl/utilocl.cpp @@ -80,8 +80,8 @@ float Im2d2ColGPU(const Handle& handle, int data_size_bound = c * in_h * in_w; - int data_size_bound_pack = type == miopenInt8x4 ? data_size_bound * 4 : data_size_bound; - int im_offset_pack = type == miopenInt8x4 ? im_offset / 4 : im_offset; + int data_size_bound_pack = data_size_bound; + int im_offset_pack = im_offset; if(!kernels.empty()) { @@ -105,7 +105,7 @@ float Im2d2ColGPU(const Handle& handle, } else { - const int c_pack = type == miopenInt8x4 ? c / 4 : c; + const int c_pack = c; std::string params; int num_ch_per_wg; @@ -331,9 +331,8 @@ float Im3d2ColGPU(const Handle& handle, auto&& kernels = handle.GetKernels("miopenIm3d2Col", network_config); - // int8x4 vectorize-c format - int im_offset_pack = type == miopenInt8x4 ? im_offset / 4 : im_offset; - int im_c_pack = type == miopenInt8x4 ? im_c / 4 : im_c; + int im_offset_pack = im_offset; + int im_c_pack = im_c; if(!kernels.empty()) { @@ -772,13 +771,6 @@ float transpose_NCHW2CNHW(const Handle& handle, std::string params = GetDataTypeKernelParams(type); - if(type == miopenInt8x4) - { - c /= 4; - in_offset /= 4; - out_offset /= 4; - } - if(h_stride == 1 && w_stride == 1 && type == miopenFloat) { kernel_name += "_V1"; @@ -910,13 +902,6 @@ float transpose_CNHW2NCHW(const Handle& handle, std::string params = GetDataTypeKernelParams(type); - if(type == miopenInt8x4) - { - c /= 4; - in_offset /= 4; - out_offset /= 4; - } - if(h_stride == 1 && w_stride == 1 && type == miopenFloat) { kernel_name += "_V1"; @@ -1170,14 +1155,8 @@ float transpose_packed_MN2NM(const Handle& handle, auto&& kernels = handle.GetKernels(kernel_name, network_config); std::string params = GetDataTypeKernelParams(type); - if(type == miopenInt8x4) - { - m /= 4; - in_offset /= 4; - out_offset /= 4; - } - if(!(type == miopenInt8x4 || type == miopenInt8)) + if(type != miopenInt8) { MIOPEN_THROW("transpose_packed_MN2NM is only meant for int8 variants."); } diff --git a/src/pooling_api.cpp b/src/pooling_api.cpp index ef526804cf..bf318f7b78 100644 --- a/src/pooling_api.cpp +++ b/src/pooling_api.cpp @@ -50,7 +50,7 @@ inline void Pooling_logging_cmd(const miopenPoolingDescriptor_t poolDesc, case miopenFloat: ss << "pool"; break; case miopenInt32: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued.
case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp index 6cf29c2f64..9cabbc03f6 100644 --- a/src/reducetensor.cpp +++ b/src/reducetensor.cpp @@ -211,11 +211,10 @@ inline int GetDataTypeSize(miopenDataType_t t) case miopenFloat8: case miopenBFloat8: case miopenInt8: return (1); - case miopenInt8x4: return (4); + case miopenInt8x4: return (4); // Support discontinued. case miopenBFloat16: return (2); case miopenInt32: return (4); - default: - MIOPEN_THROW("Only float, half, double, bfloat16, int8, int8x4 data type is supported."); + default: MIOPEN_THROW("Only float, half, double, bfloat16, int8 data types are supported."); }; }; @@ -269,7 +268,7 @@ inline int GetDataTypeId(miopenDataType_t t) case miopenBFloat16: return (static_cast('B')); case miopenDouble: return (static_cast('D')); case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: case miopenInt32: return (static_cast('O')); diff --git a/src/reducetensor_api.cpp b/src/reducetensor_api.cpp index 685b2fb430..10b1ff73bf 100644 --- a/src/reducetensor_api.cpp +++ b/src/reducetensor_api.cpp @@ -43,7 +43,7 @@ static void LogCmdRedux(const miopen::ReduceTensorDescriptor reduceTensorDesc, ss << "reducefp16"; else if(aDesc.GetType() == miopenBFloat16) ss << "reducebfp16"; - else if(aDesc.GetType() == miopenInt8 || aDesc.GetType() == miopenInt8x4) + else if(aDesc.GetType() == miopenInt8) ss << "reduceint8"; else if(aDesc.GetType() == miopenDouble) ss << "reducefp64"; diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp index 5a7918cc64..ff17432a62 100644 --- a/src/solver/batchnorm/forward_inference_ck.cpp +++ b/src/solver/batchnorm/forward_inference_ck.cpp @@ -200,7 +200,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context, return (CheckCKApplicability(bn_problem) != -1); case miopenInt32: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); @@ -244,7 +244,7 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context, break; case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: default: MIOPEN_THROW("Unsupported datatype"); diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp index 9520a79a90..e2df6f8097 100644 --- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp +++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp @@ -287,7 +287,7 @@ void PerformanceConfigConvCKIgemmFwdBiasActivFused::HeuristicInit( case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); @@ -342,7 +342,7 @@ bool PerformanceConfigConvCKIgemmFwdBiasActivFused::IsValid( case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); @@ -435,7 +435,7 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx, case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); @@ -469,7 +469,7 @@ ConvSolution ConvCKIgemmFwdBiasActivFused::GetSolution( case miopenInt8: case miopenFloat: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: default: MIOPEN_THROW("Unsupported datatype"); diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 86a8a4161e..992b196b45 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -49,7 +49,7 @@ bool ConvDirectNaiveConvIsAssemblyKernel(const ExecutionContext& ctx, { const auto device_name = ctx.GetStream().GetDeviceName(); return (device_name == "gfx906" || device_name == "gfx908") && ctx.rmv.IsV3() && - problem.IsLayoutDefault() && (!problem.IsInt8()); + problem.IsLayoutDefault() && (problem.IsFp16() || problem.IsFp32() || problem.IsBfp16()); } // Check tensor data type respectively diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp index 2602c54320..94983c7f0e 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp @@ -210,7 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -253,7 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -324,7 +324,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -352,7 +352,7 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index f0623c642d..a21c9ba300 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -210,7 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: case miopenBFloat16: @@ -253,7 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenFloat8: case miopenBFloat8: case miopenBFloat16: @@ -322,7 +322,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenFloat8: case miopenBFloat8: case miopenBFloat16: @@ -350,7 +350,7 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 3c94374b4e..6225410599 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -206,7 +206,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -249,7 +249,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -320,7 +320,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -348,7 +348,7 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::WrWInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp index 57c63267d2..5aa17e75bf 100644 --- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp @@ -172,7 +172,7 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit( case miopenBFloat8: case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -214,7 +214,7 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid( case miopenBFloat8: case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -294,7 +294,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable( case miopenBFloat8: case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -318,7 +318,7 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution( problem, config.kernel_id); case miopenInt8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp index 73907eb788..7a9c322bc7 100644 --- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp @@ -172,7 +172,7 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit( case miopenFloat8: case miopenBFloat8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -215,7 +215,7 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid( case miopenFloat8: case miopenBFloat8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -294,7 +294,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable( case miopenFloat8: case miopenBFloat8: case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: break; } @@ -320,7 +320,7 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp index c2d0b83141..0a8b6eeb03 100644 --- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp @@ -198,7 +198,7 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::HeuristicInit( case miopenFloat: Init(problem); break; case miopenInt8: Init(problem); break; case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -240,7 +240,7 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::IsValid( case miopenFloat: return CheckIsSupportCKArgs(problem); case miopenInt8: return CheckIsSupportCKArgs(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -311,7 +311,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable( case miopenFloat: return CheckCKApplicability(problem); case miopenInt8: return CheckCKApplicability(problem); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenFloat8: case miopenBFloat8: @@ -339,7 +339,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution( return InitInvokerFactory, CKArgs, conv::DataInvokeParams>( problem, config.kernel_id); case miopenInt32: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. 
case miopenBFloat16: case miopenDouble: case miopenFloat8: diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp index 7ad238e7fa..6ae4f5dde7 100644 --- a/src/solver/gemm.cpp +++ b/src/solver/gemm.cpp @@ -87,13 +87,15 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx, const auto& xDesc = problem.GetIn(); const auto& wDesc = problem.GetWeights(); const auto& yDesc = problem.GetOut(); - if(xDesc.GetType() == miopenInt8x4 || xDesc.GetType() == miopenInt8) - { - // rocBlas needs the output to be int32 always - if(yDesc.GetType() != miopenFloat && yDesc.GetType() != miopenInt32 && - yDesc.GetType() != miopenInt8x4) - return false; - } + + // rocBlas needs the output to be 32-bit always + if(xDesc.GetType() == miopenInt8x4) + return false; + if(xDesc.GetType() == miopenInt8 // + && (yDesc.GetType() != miopenFloat // + && yDesc.GetType() != miopenInt32)) + return false; + const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94"); if(problem.IsTensorsCasted()) { @@ -185,8 +187,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio n_transpose_packed_MN2NM = 1; n_gemm_strided_batched = conv.group_count; n_transpose_CNHW2NCHW = 1; - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) n_CastTensor = 1; } // 1x1_stride=1 with GEMM and zero workspace @@ -205,8 +206,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio n_gemm_strided_batched = conv.group_count; n_gemm_runs = in_n; } - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) n_CastTensor = 1; } else // not 1x1 @@ -216,8 +216,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio n_transpose_packed_MN2NM = in_n; n_gemm_strided_batched = conv.group_count; n_gemm_runs = in_n; - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) n_CastTensor = 1; } @@ -410,7 +409,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, x_t_size *= 2; } - if(wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) + if(wDesc.GetType() == miopenInt8) { const auto xts = GetTypeSize(xDesc.GetType()); if(xts > 0) @@ -494,8 +493,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context, if(handle.IsProfilingEnabled()) time_gemm += handle.GetKernelTime(); - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) { TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); @@ -782,7 +780,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, auto solution = ConvSolution{miopenStatusSuccess}; const auto group_count = conv.group_count; - const auto lowp_quant = conv.lowp_quant; if(group_count > 1) { @@ -881,14 +878,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, } } - if(wDesc.GetType() == miopenInt8x4 && yDesc.GetType() != miopenInt32) - { - TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); - CastTensor(handle, &lowp_quant, ygemmDesc, y, yDesc, y, 0, 0); - if(handle.IsProfilingEnabled()) 
- time_gemm += handle.GetKernelTime(); - } - if(handle.IsProfilingEnabled()) { handle.ResetKernelTime(); @@ -967,14 +956,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context, if(handle.IsProfilingEnabled()) time += handle.GetKernelTime(); - if(wDesc.GetType() == miopenInt8x4 && yDesc.GetType() != miopenInt32) - { - TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); - CastTensor(handle, &lowp_quant, ygemmDesc, y, yDesc, y, 0, 0); - if(handle.IsProfilingEnabled()) - time += handle.GetKernelTime(); - } - if(handle.IsProfilingEnabled()) { handle.ResetKernelTime(); @@ -1268,8 +1249,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context, } } - if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) && - yDesc.GetType() != miopenInt32) + if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32) { TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides()); diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp index 4101db86f9..4e41b8944a 100644 --- a/src/solver/mlir_common.cpp +++ b/src/solver/mlir_common.cpp @@ -57,7 +57,7 @@ static const char* DTypeName(miopenDataType_t ty) case miopenBFloat16: return "bf16"; case miopenInt32: return "i32"; case miopenInt8: return "i8"; - case miopenInt8x4: return "i8x4"; + case miopenInt8x4: return "i8x4"; // Support discontinued. case miopenFloat8: return "fp8"; case miopenBFloat8: return "bfp8"; } diff --git a/src/tensor.cpp b/src/tensor.cpp index ca4f1afc7a..df0d7c2819 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -50,7 +50,7 @@ bool IsDataTypeSupported(miopenDataType_t t) case miopenFloat8: case miopenBFloat8: case miopenInt8: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenBFloat16: case miopenDouble: return true; } diff --git a/test/conv_common.hpp b/test/conv_common.hpp index 99f964a0f6..e3fa9766e6 100644 --- a/test/conv_common.hpp +++ b/test/conv_common.hpp @@ -217,12 +217,7 @@ tensor get_output_tensor(const miopen::ConvolutionDescriptor& filter, ? input.desc.GetLayout(miopen::tensor_layout_get_default(input.desc.GetSize())) : out_layout; return tensor{filter.GetForwardOutputTensorWithLayout( - input.desc, - weights.desc, - yLayout, - weights.desc.GetType() == miopenInt8x4 - ? (std::is_same{} ? miopenInt32 : miopenFloat) - : miopen_type{})}; + input.desc, weights.desc, yLayout, miopen_type{})}; } enum class ConvApi @@ -572,8 +567,7 @@ struct verify_forward_conv : conv_base auto rout = ref_conv_fwd(input, weights, out, filter); if(filter.mode != miopenTranspose) { - bool is_int8 = - weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4; + bool is_int8 = weights.desc.GetType() == miopenInt8; bool is_vect_c = weights.desc.GetVectorLength() > 1; rout.par_for_each([&](auto... is) { if(is_int8 && !is_vect_c) @@ -774,7 +768,7 @@ struct verify_forward_conv : conv_base break; case ConvApi::Find_1_0: case ConvApi::Find_2_0: - if(weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4) + if(weights.desc.GetType() == miopenInt8) { bool is_transform = (input.desc.GetLengths()[1] % 4 != 0 || is_vect); @@ -786,10 +780,8 @@ struct verify_forward_conv : conv_base in_len[1] = ((in_len[1] + 3) / 4) * 4; wei_len[1] = ((wei_len[1] + 3) / 4) * 4; - miopen::TensorDescriptor input_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, - in_len); - miopen::TensorDescriptor weight_vpad_desc(is_vect ? 
miopenInt8x4 : miopenInt8, - wei_len); + miopen::TensorDescriptor input_vpad_desc(miopenInt8, in_len); + miopen::TensorDescriptor weight_vpad_desc(miopenInt8, wei_len); auto input_vpad = tensor{in_len}; auto weights_vpad = tensor{wei_len}; @@ -1738,8 +1730,8 @@ struct verify_forward_conv_int8 : conv_base in_len[1] = ((in_len[1] + 3) / 4) * 4; wei_len[1] = ((wei_len[1] + 3) / 4) * 4; - miopen::TensorDescriptor input_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, in_len); - miopen::TensorDescriptor weight_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, wei_len); + miopen::TensorDescriptor input_vpad_desc(miopenInt8, in_len); + miopen::TensorDescriptor weight_vpad_desc(miopenInt8, wei_len); auto input_vpad = tensor{in_len}; auto weights_vpad = tensor{wei_len}; @@ -2046,7 +2038,7 @@ struct conv_driver : test_driver filter.spatialDim = get_spatial_dim(); else filter.spatialDim = filter_dims.size(); - bool is_int8 = (input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4); + bool is_int8 = (input.desc.GetType() == miopenInt8); filter.mode = cmode_lookup[miopen::ToUpper(conv_mode)]; filter.paddingMode = pmode_lookup[miopen::ToUpper(pad_mode)]; @@ -2360,8 +2352,7 @@ struct conv_driver : test_driver bool skip_backward_weights = is_int8; #if TEST_DIRECT_SUPPORTED_CONFIG_ONLY - if(input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4 || - input.desc.GetType() == miopenBFloat16) + if(input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenBFloat16) { show_command(); std::cout << "Direct path doesn't support Int8 or BFloat16 type." << std::endl; @@ -2405,7 +2396,8 @@ struct conv_driver : test_driver size_t total_mem; if(is_int8) { - // TODO: Tout here was float which should have been int32 + /// \todo Properly construct the `output` tensor descriptor + /// and get rid of this special "int8" stuff. auto output_int8 = get_output_tensor(filter, input, weights, out_layout); const auto problem = ConvProblemDescription{input.desc, @@ -2422,6 +2414,9 @@ struct conv_driver : test_driver } else { + /// \todo Take into account `skip_forward`, `skip_backward_data`, + /// `skip_backward_weights` and use this path to compute `total_mem` for int8 + /// variations. 
const auto fwd_problem = miopen::conv::ProblemDescription{ input.desc, weights.desc, diff --git a/test/driver.hpp b/test/driver.hpp index 0a8e2d3080..fd83dd1fc5 100644 --- a/test/driver.hpp +++ b/test/driver.hpp @@ -274,7 +274,7 @@ struct test_driver { case miopenHalf: ss << "--half "; break; case miopenBFloat16: ss << "--bfloat16 "; break; - case miopenInt8x4: + case miopenInt8x4: ss << "--UNSUPPORTED_TYPE "; break; case miopenInt8: ss << "--int8 "; break; case miopenInt32: ss << "--int32 "; break; case miopenFloat: ss << "--float "; break; @@ -303,7 +303,7 @@ struct test_driver { case miopenHalf: ret.emplace_back("--half"); break; case miopenBFloat16: ret.emplace_back("--bf16"); break; - case miopenInt8x4: + case miopenInt8x4: ret.emplace_back("--UNSUPPORTED_TYPE"); break; case miopenInt8: ret.emplace_back("--int8"); break; case miopenInt32: ret.emplace_back("--int32"); break; case miopenFloat: ret.emplace_back("--float"); break; diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index aa3dda788d..be8f3f8430 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -303,8 +303,6 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "int32"; if(type == miopenInt8) return "int8"; - if(type == miopenInt8x4) - return "int8x4"; if(type == miopenBFloat16) return "bf16"; return "n/a"; diff --git a/test/gtest/conv_embed_db.cpp b/test/gtest/conv_embed_db.cpp index b69fde1b5e..4672bb0404 100644 --- a/test/gtest/conv_embed_db.cpp +++ b/test/gtest/conv_embed_db.cpp @@ -73,12 +73,12 @@ void Run2dDriver(miopenDataType_t prec) case miopenHalf: params = ConfigWithHalf::GetParam(); break; case miopenInt8: params = ConfigWithInt8::GetParam(); break; case miopenBFloat16: params = ConfigWithBFloat16::GetParam(); break; - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenFloat8: case miopenBFloat8: case miopenDouble: - FAIL() << "miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type " + FAIL() << "miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type " "not supported by conv_embed_db test"; default: params = ConfigWithFloat::GetParam(); diff --git a/test/gtest/conv_hip_igemm_xdlops.cpp b/test/gtest/conv_hip_igemm_xdlops.cpp index 508624c847..3407446557 100644 --- a/test/gtest/conv_hip_igemm_xdlops.cpp +++ b/test/gtest/conv_hip_igemm_xdlops.cpp @@ -65,10 +65,10 @@ void Run2dDriver(miopenDataType_t prec) case miopenHalf: case miopenBFloat16: case miopenFloat: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenDouble: - FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, " + FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt32, " "miopenDouble data " "type not supported by " "test_conv_hip_igemm_xdlops test"; diff --git a/test/gtest/conv_igemm_dynamic.cpp b/test/gtest/conv_igemm_dynamic.cpp index 25a4e179c5..59341ac8a0 100644 --- a/test/gtest/conv_igemm_dynamic.cpp +++ b/test/gtest/conv_igemm_dynamic.cpp @@ -68,12 +68,12 @@ void Run2dDriver(miopenDataType_t prec) case miopenHalf: case miopenInt8: case miopenBFloat16: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued.
case miopenInt32: case miopenDouble: case miopenFloat8: case miopenBFloat8: - FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, " + FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, " "miopenDouble, miopenFloat8, miopenBFloat8 " "data type not supported by conv_igemm_dynamic test"; diff --git a/test/gtest/conv_igemm_mlir.cpp b/test/gtest/conv_igemm_mlir.cpp index d5fc0d426c..29d240645c 100644 --- a/test/gtest/conv_igemm_mlir.cpp +++ b/test/gtest/conv_igemm_mlir.cpp @@ -81,13 +81,13 @@ void Run2dDriver(miopenDataType_t prec) case miopenInt8: params = ConfigWithInt8::GetParam(); break; case miopenFloat: params = ConfigWithFloat::GetParam(); break; case miopenBFloat16: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenFloat8: case miopenBFloat8: case miopenDouble: MIOPEN_THROW(miopenStatusBadParm, - "miopenBFloat16, miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, " + "miopenBFloat16, miopenInt32, miopenFloat8, miopenBFloat8, " "miopenDouble data type not supported by conv_igemm_mlir test"); default: params = ConfigWithFloat::GetParam(); diff --git a/test/gtest/conv_igemm_mlir_xdlops.cpp b/test/gtest/conv_igemm_mlir_xdlops.cpp index 19913093c0..e2c5a3ed8f 100644 --- a/test/gtest/conv_igemm_mlir_xdlops.cpp +++ b/test/gtest/conv_igemm_mlir_xdlops.cpp @@ -53,13 +53,13 @@ void Run2dDriver(miopenDataType_t prec) case miopenInt8: params = ConfigWithInt8::GetParam(); break; case miopenBFloat16: case miopenFloat: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenDouble: case miopenFloat8: case miopenBFloat8: MIOPEN_THROW(miopenStatusBadParm, - "miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, miopenDouble data " + "miopenBFloat16, miopenFloat, miopenInt32, miopenDouble data " "type not supported by " "conv_igemm_mlir_xdlops test"); diff --git a/test/gtest/conv_trans.cpp b/test/gtest/conv_trans.cpp index 20015336a0..185a459947 100644 --- a/test/gtest/conv_trans.cpp +++ b/test/gtest/conv_trans.cpp @@ -55,10 +55,10 @@ void Run2dDriver(miopenDataType_t prec) case miopenBFloat8: case miopenInt8: case miopenBFloat16: - case miopenInt8x4: + case miopenInt8x4: // Support discontinued. case miopenInt32: case miopenDouble: - FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, miopenDouble " + FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, miopenDouble " "data type not supported by " "conv_trans test"; diff --git a/test/gtest/db_sync.cpp b/test/gtest/db_sync.cpp index d7e59468a2..13b647ceac 100644 --- a/test/gtest/db_sync.cpp +++ b/test/gtest/db_sync.cpp @@ -73,7 +73,7 @@ miopenDataType_t GetDataTypeFromString(const std::string& data_type) else if(data_type == "INT8") return miopenInt8; else if(data_type == "INT8x4") - return miopenInt8x4; + return miopenInt8x4; // Support discontinued. Maintain compatibility with old databases. 
else if(data_type == "INT32") return miopenInt32; else if(data_type == "BF16") diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp index 0b05a1e5e8..3fda3b5cf7 100644 --- a/test/tensor_holder.hpp +++ b/test/tensor_holder.hpp @@ -195,9 +195,10 @@ struct tensor tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs)) { - assert(desc.GetType() == miopen_type{} || - ((miopen_type{} == miopenInt8 || miopen_type{} == miopenInt8x4) && - (desc.GetType() == miopenFloat || desc.GetType() == miopenInt32))); + assert(desc.GetType() == miopen_type{} // + || (miopen_type{} == miopenInt8 // + && (desc.GetType() == miopenFloat // + || desc.GetType() == miopenInt32))); data.resize(desc.GetElementSpace()); } diff --git a/test/tensor_transform.cpp b/test/tensor_transform.cpp index a0585853a0..e87420bb7a 100644 --- a/test/tensor_transform.cpp +++ b/test/tensor_transform.cpp @@ -422,7 +422,7 @@ struct tensor_transform_driver : test_driver } // Test tensor scale addition - if(miopen_type{} == miopenInt8 || miopen_type{} == miopenInt8x4) + if(miopen_type{} == miopenInt8) return; super_src = tensor{superLens_src}.generate(tensor_elem_gen_integer{max_value}); From defb1b04fc74945227188c5de5d243215ade6a22 Mon Sep 17 00:00:00 2001 From: amberhassaan Date: Thu, 12 Oct 2023 00:05:34 -0400 Subject: [PATCH 33/36] Test non-packed inputs with naive reference convolution kernels (#2394) --- driver/random.hpp | 1 + fin | 2 +- src/include/miopen/convolution.hpp | 4 + .../gpu_reference_kernel/naive_conv.cpp | 6 +- src/ocl/convolutionocl.cpp | 123 ++++-- src/solver/conv_direct_naive_conv.cpp | 4 + src/solver/conv_direct_naive_conv_bwd.cpp | 8 +- src/solver/conv_direct_naive_conv_fwd.cpp | 8 +- src/solver/conv_direct_naive_conv_wrw.cpp | 8 +- test/gpu_reference_kernel.cpp | 381 +++++------------- test/gtest/conv_api_strided_tensors.cpp | 8 +- 11 files changed, 221 insertions(+), 332 deletions(-) diff --git a/driver/random.hpp b/driver/random.hpp index 6398048dde..b3be81f56e 100644 --- a/driver/random.hpp +++ b/driver/random.hpp @@ -91,6 +91,7 @@ inline T gen_0_to_B(T B) template inline T gen_A_to_B(T A, T B) { + assert(B > A); return gen_0_to_B(B - A) + A; } diff --git a/fin b/fin index 26b5c32864..afc1a8d87e 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 26b5c328642a6af5041539ceae36b9340829384b +Subproject commit afc1a8d87e6d00c82903942007bb370ee1f6c760 diff --git a/src/include/miopen/convolution.hpp b/src/include/miopen/convolution.hpp index bac0133106..35c494eab2 100644 --- a/src/include/miopen/convolution.hpp +++ b/src/include/miopen/convolution.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -404,6 +405,9 @@ struct ConvolutionDescriptor : miopenConvolutionDescriptor friend void to_json(nlohmann::json& json, const ConvolutionDescriptor& conv); friend void from_json(const nlohmann::json& json, ConvolutionDescriptor& conv); + +private: + void ValidateTensors(const ConvTensors& conv_tensors) const; }; void ConvolutionBackwardBias(const Handle& handle, diff --git a/src/kernels/gpu_reference_kernel/naive_conv.cpp b/src/kernels/gpu_reference_kernel/naive_conv.cpp index b243b1234a..125eff94f3 100644 --- a/src/kernels/gpu_reference_kernel/naive_conv.cpp +++ b/src/kernels/gpu_reference_kernel/naive_conv.cpp @@ -126,9 +126,9 @@ inline __device__ __host__ int8_t cast_to(const int32_t& val) /// composable_kernel (CK) treats G dimension. Which is why nchw should be ngchw, /// and nhwc should be nhwgc. Same follows for the 3D case. 
/// -/// - strides here are in the little-endian order, i.e., for NHWC, stride for N is -/// at index 3 while stride for C is at index 0. This is reverse of how strides are -/// stored in tensor descriptors, which are big-endian. +/// - strides here are stored right to left, i.e., for NHWC, stride for N is +/// at index 3 while stride for C is at index 0. This is different from how the +/// tensor descriptors store strides, which is always NCHW order, left-to-right. template inline __device__ void naive_conv_fwd_nchw(const src_data_t* __restrict__ p_in, diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp index 94b083577d..d66186577c 100644 --- a/src/ocl/convolutionocl.cpp +++ b/src/ocl/convolutionocl.cpp @@ -287,30 +287,6 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle, namespace { -void ValidateConvTensors(const ConvTensors& tensors) -{ - const auto invalid_buffers = - tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr; - - const auto tensor_sizes_not_matched = tensors.xDesc.GetSize() != tensors.yDesc.GetSize() || - tensors.xDesc.GetSize() != tensors.wDesc.GetSize(); - - const auto trivial_tensor_types_not_matched = - tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8; - - // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { - // MIOPEN_THROW(miopenStatusBadParm); - //} - - const auto x_tensor_invalid = tensors.xDesc.GetSize() < 3; - - const auto bad_parameters = invalid_buffers || tensor_sizes_not_matched || - trivial_tensor_types_not_matched || x_tensor_invalid; - - if(bad_parameters) - MIOPEN_THROW(miopenStatusBadParm); -} - void ValidateAlphaBeta(const void* alpha, const void* beta) { if(!float_equal(*(static_cast(alpha)), 1.0) || @@ -401,6 +377,88 @@ static void ConvForwardCheckNumerics(const Handle& handle, } } +void ConvolutionDescriptor::ValidateTensors(const ConvTensors& tensors) const +{ + + // Group stride in current TensorDescriptor is implicit. When invoking kernels, + // we need to add the group dimension G and compute its stride. We want the stride + // left of C to be a multiple of group count G. e.g. for NCHW, the stride for N + // should be a multiple of G so that we can compute the strides for NGCHW + auto bad_group_stride = [this](const TensorDescriptor& td) { + auto l = td.GetLayout_t(); + int g_stride_index = -1; + if(l == miopenTensorNCHW || l == miopenTensorNCDHW) + { + g_stride_index = 0; // stride index for N; + } + else if(l == miopenTensorNHWC || l == miopenTensorNDHWC) + { + // stride index for W. Normally this would be 2nd-last stride but we store + // strides in NCHW order for some weird reason. 
+ g_stride_index = td.GetStrides().size() - 1; + } + else + { + MIOPEN_THROW(miopenStatusInternalError, "Layout not supported for grouped convolution"); + } + + if(g_stride_index != -1) + { + return (td.GetStrides()[g_stride_index] % this->group_count) != 0; + } + + return false; + }; + + // invalid_buffers + if(tensors.x == nullptr || tensors.w == nullptr || tensors.y == nullptr) + { + MIOPEN_THROW(miopenStatusBadParm, "One of the convolution tensors is null"); + } + + // x_tensor_invalid = + if(tensors.xDesc.GetSize() < 3) + { + MIOPEN_THROW(miopenStatusBadParm, "input tensor's number of dimensions is wrong"); + } + + // tensor_sizes_not_matched = + if(tensors.xDesc.GetSize() != tensors.yDesc.GetSize() || + tensors.xDesc.GetSize() != tensors.wDesc.GetSize()) + { + MIOPEN_THROW(miopenStatusBadParm, + "number of dimensions mismatch between input, output and weights tensors"); + } + + // trivial_tensor_types_not_matched = + if(tensors.xDesc.GetType() != tensors.yDesc.GetType() && + tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4) + { + MIOPEN_THROW(miopenStatusBadParm, "input/output tensor data types do not match"); + } + + // check for bad_group_stride. This applies for input and output only. There + // is no check for weight tensor currently. + // no need to check for group_count == 1 + + if((this->group_count > 1) && bad_group_stride(tensors.xDesc)) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Invalid input tensor strides. Channel stride must be a multiple of group count"); + } + if((this->group_count > 1) && bad_group_stride(tensors.yDesc)) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Invalid output tensor strides. Channel stride must be a multiple of group count"); + } + + // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) { + // MIOPEN_THROW(miopenStatusBadParm); + //} +} + void ConvolutionDescriptor::ConvolutionForward(Handle& handle, const void* alpha, const TensorDescriptor& xDesc, @@ -416,13 +474,8 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle, { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); - if(!(xDesc.IsPacked() && wDesc.IsPacked() && yDesc.IsPacked())) - { - MIOPEN_THROW(miopenStatusNotImplemented, "Only fully packed tensors are supported"); - } - const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); ValidateAlphaBeta(alpha, beta); ConvForwardCheckNumerics(handle, tensors, [&]() { @@ -735,7 +788,7 @@ void ConvolutionDescriptor::ConvolutionForwardImmediate(Handle& handle, MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); const auto tensors = ConvFwdTensors{xDesc, x, wDesc, w, yDesc, y}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); if(!solver_id.IsValid()) MIOPEN_THROW(miopenStatusBadParm); @@ -871,7 +924,7 @@ void ConvolutionDescriptor::ConvolutionBackwardData(Handle& handle, auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); ValidateAlphaBeta(alpha, beta); ConvBwdCheckNumerics(handle, tensors, beta, [&]() { @@ -937,7 +990,7 @@ void ConvolutionDescriptor::ConvolutionBackwardImmediate(Handle& handle, MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); auto tensors = ConvBwdTensors{dyDesc, dy, wDesc, w, dxDesc, dx}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); static const float beta = 0.0f; ConvBwdCheckNumerics(handle, tensors, &beta, [&]() { @@ 
-1071,7 +1124,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(const Handle& handle, { MIOPEN_LOG_I("algo = " << algo << ", workspace = " << workSpaceSize); decltype(auto) tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); ValidateAlphaBeta(alpha, beta); if(xDesc.GetType() == miopenInt8) @@ -1134,7 +1187,7 @@ void ConvolutionDescriptor::ConvolutionWrwImmediate(Handle& handle, { MIOPEN_LOG_I("solver_id = " << solver_id.ToString() << ", workspace = " << workSpaceSize); auto tensors = ConvWrwTensors{dyDesc, dy, xDesc, x, dwDesc, dw}; - ValidateConvTensors(tensors); + ValidateTensors(tensors); if(xDesc.GetType() == miopenInt8) MIOPEN_THROW(miopenStatusBadParm); diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp index 992b196b45..f87511f911 100644 --- a/src/solver/conv_direct_naive_conv.cpp +++ b/src/solver/conv_direct_naive_conv.cpp @@ -111,11 +111,15 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS); std::string ConvDirectNaiveConvKernelName(const ProblemDescription& problem) { std::ostringstream kernel_name; + + /// \todo remove packed reference convolution kernels --amberhassaan +#ifndef NDEBUG // enable in debug mode only if(miopen::IsEnabled(MIOPEN_DEBUG_CONV_DIRECT_NAIVE_USE_PACKED_KERNELS())) { kernel_name << "naive_conv_packed_"; } else +#endif { kernel_name << "naive_conv_nonpacked_"; } diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index 1a28f8aae6..dea91c9ecf 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -134,12 +134,8 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index a4656d929a..5bc25a2367 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -122,12 +122,6 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, KernelInfo kernel; kernel.kernel_file = ConvDirectNaiveConvKernelFile(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); kernel.kernel_name = ConvDirectNaiveConvKernelName(problem); kernel.g_wk.clear(); @@ -139,6 +133,8 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); + const auto is_f8 = (kernel.kernel_file == "fp8_naive_conv.cpp"); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index dfe1c342b0..a8c4d40e0b 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -121,13 +121,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, kernel.l_wk.push_back(1); kernel.l_wk.push_back(1); + const auto is_f8 = 
(kernel.kernel_file == "fp8_naive_conv.cpp"); + kernel.comp_options = ConvDirectNaiveConvCompileOption(ctx, problem); - const auto is_f8 = [&]() { - if(kernel.kernel_file == "fp8_naive_conv.cpp") - return true; - else - return false; - }(); int G_stride_idx = conv_internal::GetGroupStrideIndex(problem); diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index be8f3f8430..e166781b9b 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -24,6 +24,8 @@ * *******************************************************************************/ +#include +#include #include #include #include @@ -73,17 +75,9 @@ std::string tensor_layout_to_string(tensor_layout_t layout) struct gpu_reference_kernel_base { miopenHandle_t handle{}; -#if MIOPEN_BACKEND_OPENCL - cl_command_queue q{}; -#endif - gpu_reference_kernel_base() - { - miopenCreate(&handle); -#if MIOPEN_BACKEND_OPENCL - miopenGetStream(handle, &q); -#endif - } + gpu_reference_kernel_base() { miopenCreate(&handle); } + ~gpu_reference_kernel_base() { miopenDestroy(handle); } static int conv_out_size(int in_size, int pad, int dilation, int ksize, int stride) @@ -308,6 +302,21 @@ static std::string miopen_type_to_string(miopenDataType_t type) return "n/a"; } +/// input: a vector of lengths of dims in a tensor +/// multiply each element with a random constant integer +void pad_tensor_strides(std::vector& strides) +{ + constexpr int min_stride_multiplier = 1; + constexpr int max_stride_multiplier = 5; + + auto c = prng::gen_A_to_B(min_stride_multiplier, max_stride_multiplier); + for(auto& v : strides) + { + // cppcheck-suppress useStlAlgorithm + v = v * c; + } +} + template in_len({n, c, hi, wi}); std::vector wei_len({k, c_per_group, fy, fx}); @@ -360,28 +364,25 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + void* in_dev; void* wei_dev; void* out_dev; EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 2, @@ -417,27 +418,9 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(in); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= 
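Note that pad_tensor_strides receives the stride vector (not the dimension lengths, as its doxygen line suggests) and scales every stride by a single random factor, so each test tensor occupies more memory than it has elements. A self-contained sketch of the arithmetic, with hand-picked numbers rather than the test's random multiplier:

```cpp
#include <cstddef>
#include <vector>

// Element space = 1 + sum((len[i] - 1) * stride[i]). Once the strides are
// padded, this exceeds the element count, which is exactly what makes the
// tensor non-packed (cf. desc.GetElementSpace() in tensor_holder.hpp).
std::size_t element_space(const std::vector<std::size_t>& lens,
                          const std::vector<std::size_t>& strides)
{
    std::size_t last = 0;
    for(std::size_t i = 0; i < lens.size(); ++i)
        last += (lens[i] - 1) * strides[i];
    return last + 1;
}

// lens {2, 3, 4}: packed strides {12, 4, 1} give 24 (== element count);
// a multiplier of 2 gives strides {24, 8, 2} and an element space of 47.
```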
clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(out); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -445,7 +428,19 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + /// \anchor copy_non_packed_output_before_convolution + /// \note Output is a non-packed tensor, which means there are + /// elements that convolution will not update. In order to verify + /// the convolution result, the GPU buffer should have the same + /// data as the CPU in both update and not-updated elements. + /// Therefore, we copy the output to the GPU buffer after + /// initializing it with random values. + /// + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); + cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -470,23 +465,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif + // we expect excact match, since use integer valid_result = verify_tensor(out_host, out); } @@ -495,36 +478,22 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(out); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(in); + /// \ref copy_non_packed_output_before_convolution + + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -549,23 +518,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(in_host, in); @@ -574,35 +531,22 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base { rand_tensor_integer(in); rand_tensor_integer(out); -#if MIOPEN_BACKEND_OPENCL - status |= 
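The \anchor note above is the key to exact verification with non-packed buffers: the gap elements are never written by the convolution, so host and device must start from identical data. A toy, host-only analogy of the scheme, where a stride-2 update stands in for the convolution:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    std::vector<int> host = {7, -3, 5, 9, -1, 2}; // randomly seeded output
    std::vector<int> gpu  = host;                 // hipMemcpy HostToDevice

    for(std::size_t i = 0; i < gpu.size(); i += 2)
        gpu[i] *= 2; // device "kernel" touches only the logical elements
    for(std::size_t i = 0; i < host.size(); i += 2)
        host[i] *= 2; // CPU reference touches the same elements

    assert(host == gpu); // gaps were seeded identically, so exact match holds
    return 0;
}
```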
clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(wei); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -627,23 +571,11 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(wei_host, wei); @@ -665,15 +597,10 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(inDesc); miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP + hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_2d(run_conv_2d); @@ -717,11 +644,6 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base int wo = conv_out_size(wi, px, dx, fx, sx); int do_ = conv_out_size(di, pz, dz, fz, sz); int c_per_group = c / g; - int k_per_group = k / g; - - int in_sz = g * n * c_per_group * di * hi * wi; - int wei_sz = g * k_per_group * c_per_group * fz * fy * fx; - int out_sz = g * n * k_per_group * do_ * ho * wo; std::vector in_len({n, c, di, hi, wi}); std::vector wei_len({k, c_per_group, fz, fy, fx}); @@ -738,28 +660,26 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopen::tensor_layout_to_strides(wei_len, layout_default, layout_string, wei_strides); miopen::tensor_layout_to_strides(out_len, layout_default, layout_string, out_strides); + pad_tensor_strides(in_strides); + pad_tensor_strides(wei_strides); + pad_tensor_strides(out_strides); + tensor in(in_len, in_strides); tensor wei(wei_len, wei_strides); tensor out(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - cl_context ctx; - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); - cl_int status = CL_SUCCESS; - cl_mem in_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); - cl_mem wei_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); - cl_mem out_dev = - clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + + auto in_sz = in.data.size(); + auto wei_sz = wei.data.size(); + auto out_sz = out.data.size(); + void* in_dev; void* wei_dev; void* out_dev; + EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) 
== hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); -#endif + EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 3, @@ -795,35 +715,21 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(in); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(out); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy(out_dev, + out.data.data(), + sizeof(Tout) * out_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, @@ -849,23 +755,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor out_host(out_len, out_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(Tout) * out_sz, - out_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(out_host.data.data(), out_dev, sizeof(Tout) * out_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(out_host, out); @@ -875,36 +769,22 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base // initialize data with integer rand_tensor_integer(out); rand_tensor_integer(wei); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(in); + + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy( + in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == + hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(wei_dev, wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -929,23 +809,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor in_host(in_len, in_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(in_host.data.data(), in_dev, sizeof(TRef) * in_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = 
verify_tensor(in_host, in); @@ -954,35 +822,22 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base { rand_tensor_integer(in, 3, -2); rand_tensor_integer(out, 3, -2); -#if MIOPEN_BACKEND_OPENCL - status |= clEnqueueWriteBuffer(q, - in_dev, - CL_TRUE, - 0, - sizeof(TRef) * in_sz, - in.data.data(), - 0, - nullptr, - nullptr); - status |= clEnqueueWriteBuffer(q, - out_dev, - CL_TRUE, - 0, - sizeof(TRef) * out_sz, - out.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + /// \ref copy_non_packed_output_before_convolution + rand_tensor_integer(wei); + EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); + /// \ref copy_non_packed_output_before_convolution + EXPECT(hipMemcpy(wei_dev, + wei.data.data(), + sizeof(TRef) * wei_sz, + hipMemcpyHostToDevice) == hipSuccess); EXPECT(hipMemcpy(out_dev, out.data.data(), - sizeof(TRef) * out_sz, + sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); -#endif + cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -1007,23 +862,11 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenStatusSuccess); tensor wei_host(wei_len, wei_strides); -#if MIOPEN_BACKEND_OPENCL - status = clEnqueueReadBuffer(q, - wei_dev, - CL_TRUE, - 0, - sizeof(TRef) * wei_sz, - wei_host.data.data(), - 0, - nullptr, - nullptr); - EXPECT(status == CL_SUCCESS); -#elif MIOPEN_BACKEND_HIP + EXPECT(hipMemcpy(wei_host.data.data(), wei_dev, sizeof(TRef) * wei_sz, hipMemcpyDeviceToHost) == hipSuccess); -#endif // we expect excact match, since use integer valid_result = verify_tensor(wei_host, wei, 8.0); // max possible int @@ -1049,15 +892,9 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base miopenDestroyTensorDescriptor(weiDesc); miopenDestroyTensorDescriptor(outDesc); -#if MIOPEN_BACKEND_OPENCL - clReleaseMemObject(in_dev); - clReleaseMemObject(wei_dev); - clReleaseMemObject(out_dev); -#elif MIOPEN_BACKEND_HIP hipFree(in_dev); hipFree(wei_dev); hipFree(out_dev); -#endif }; iterate_conv_3d(run_conv_3d); diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 2a59dcd696..04d56ec908 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,7 +139,9 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; -// This test should be replaced when strided tensors are fully implemented +/// \todo re-enable this test after NCDHW grouped convolution lands (PR 2429) +/// \todo add cpu reference convolution for verification --amberhassaan +#if 0 TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -178,9 +180,8 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) const float alpha = 1.f; const float beta = 0.f; - // miopenConvolutionForward() must return error if the format is not supported ASSERT_TRUE(device.Synchronize()); - ASSERT_NE(miopenConvolutionForward(handle, + ASSERT_EQ(miopenConvolutionForward(handle, &alpha, input_descr, d_input.Data(), @@ -196,3 +197,4 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) miopenStatusSuccess); ASSERT_TRUE(device.Synchronize()); } +#endif From 4855e6eb9fd9a63977bb3cad6e3f74a97a046ce6 Mon Sep 17 00:00:00 2001 From: Daming Feng Date: Thu, 12 Oct 2023 03:08:31 -0500 Subject: [PATCH 34/36] 3D forward convolution solver with non-packed input tensors (#2418) --- .../miopen/solver/ck_utility_common.hpp | 10 + 
src/solver/conv_direct_naive_conv_bwd.cpp | 3 + src/solver/conv_direct_naive_conv_fwd.cpp | 3 + src/solver/conv_direct_naive_conv_wrw.cpp | 3 + ...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp | 18 +- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 18 +- ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp | 20 +- test/gpu_reference_kernel.cpp | 172 ++++++++++++++++-- test/gtest/conv_api_strided_tensors.cpp | 4 - test/gtest/nonpack_conv3d_fwd.cpp | 98 ++++++++++ test/gtest/nonpack_conv3d_fwd.hpp | 157 ++++++++++++++++ 11 files changed, 468 insertions(+), 38 deletions(-) create mode 100644 test/gtest/nonpack_conv3d_fwd.cpp create mode 100644 test/gtest/nonpack_conv3d_fwd.hpp diff --git a/src/include/miopen/solver/ck_utility_common.hpp b/src/include/miopen/solver/ck_utility_common.hpp index f4f91fa228..003b067e50 100644 --- a/src/include/miopen/solver/ck_utility_common.hpp +++ b/src/include/miopen/solver/ck_utility_common.hpp @@ -63,6 +63,16 @@ static inline bool is_ck_supported_hardware(const Handle& handle) StartsWith(handle.GetDeviceName(), "gfx1102"); } +static inline bool is_conv_ck_supported_hardware(const std::string& device_name, bool is_wrw) +{ + auto res_wrw = StartsWith(device_name, "gfx908") || StartsWith(device_name, "gfx90a") || + StartsWith(device_name, "gfx940") || StartsWith(device_name, "gfx941") || + StartsWith(device_name, "gfx942"); + return is_wrw ? res_wrw + : (res_wrw || StartsWith(device_name, "gfx900") || + StartsWith(device_name, "gfx906")); +} + static inline bool is_support_amd_buffer_atomic_fadd(const std::string& device_name) { return StartsWith(device_name, "gfx908"); diff --git a/src/solver/conv_direct_naive_conv_bwd.cpp b/src/solver/conv_direct_naive_conv_bwd.cpp index dea91c9ecf..1e8f006ef0 100644 --- a/src/solver/conv_direct_naive_conv_bwd.cpp +++ b/src/solver/conv_direct_naive_conv_bwd.cpp @@ -162,6 +162,9 @@ ConvSolution ConvDirectNaiveConvBwd::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.out, tensors.w, tensors.in, + out_strides, + wei_strides, + in_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_fwd.cpp b/src/solver/conv_direct_naive_conv_fwd.cpp index 5bc25a2367..f1ed2f5b10 100644 --- a/src/solver/conv_direct_naive_conv_fwd.cpp +++ b/src/solver/conv_direct_naive_conv_fwd.cpp @@ -162,6 +162,9 @@ ConvSolution ConvDirectNaiveConvFwd::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.in, tensors.w, tensors.out, + in_strides, + wei_strides, + out_strides, hi, wi, n, diff --git a/src/solver/conv_direct_naive_conv_wrw.cpp b/src/solver/conv_direct_naive_conv_wrw.cpp index a8c4d40e0b..b83b334faa 100644 --- a/src/solver/conv_direct_naive_conv_wrw.cpp +++ b/src/solver/conv_direct_naive_conv_wrw.cpp @@ -150,6 +150,9 @@ ConvSolution ConvDirectNaiveConvWrw::GetSolution(const ExecutionContext& ctx, handle.Run(kern)(tensors.x, tensors.dw, tensors.dy, + in_strides, + wei_strides, + out_strides, hi, wi, n, diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp index 94983c7f0e..58efe498ff 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp @@ -32,6 +32,7 @@ #include #include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include #include #endif #include @@ -86,10 +87,16 @@ struct CKArgs output = {G, N, K, Do, Ho, Wo}; weight = {G, K, C, Z, Y, X}; - // strides from NHWGC to GNCHW laout - in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G 
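With the stride vectors now passed down to handle.Run(kern), element addressing inside the naive kernels reduces to a dot product of indices and strides. A host-side sketch of the idea (the kernels' real signatures and stride ordering differ, as noted in naive_conv.cpp):

```cpp
#include <cstddef>

// Illustrative host-side analogue: one body serves packed and non-packed
// tensors alike, since only the stride values change.
template <typename T>
const T& at_nchw(const T* p,
                 std::size_t n, std::size_t c, std::size_t h, std::size_t w,
                 std::size_t sn, std::size_t sc, std::size_t sh, std::size_t sw)
{
    return p[n * sn + c * sc + h * sh + w * sw];
}

// Packed NCHW with C=3, H=W=8: at_nchw(p, n, c, h, w, 192, 64, 8, 1);
// the padded-stride tensors from the tests just supply different s* values.
```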
* C, G * C}; - out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; - wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + // miopen strides to CK strides + auto miopen_in_strides = problem.GetIn().GetStrides(); + auto miopen_out_strides = problem.GetOut().GetStrides(); + auto miopen_wei_strides = problem.GetWeights().GetStrides(); + miopen_in_strides.insert(miopen_in_strides.begin(), C); + miopen_out_strides.insert(miopen_out_strides.begin(), K); + miopen_wei_strides.insert(miopen_wei_strides.begin(), K * miopen_wei_strides[0]); + std::copy(miopen_in_strides.begin(), miopen_in_strides.end(), in_strides.begin()); + std::copy(miopen_out_strides.begin(), miopen_out_strides.end(), out_strides.begin()); + std::copy(miopen_wei_strides.begin(), miopen_wei_strides.end(), wei_strides.begin()); strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), @@ -315,8 +322,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; - const std::string& arch = ctx.GetStream().GetDeviceName(); - if(miopen::StartsWith(arch, "gfx11") || miopen::StartsWith(arch, "gfx10")) + if(!ck_utility::is_conv_ck_supported_hardware(ctx.GetStream().GetDeviceName(), false)) return false; switch(problem.GetInDataType()) { diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index a21c9ba300..e7a44456b9 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -32,6 +32,7 @@ #include #include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include #include #endif #include @@ -86,10 +87,16 @@ struct CKArgs output = {G, N, K, Do, Ho, Wo}; weight = {G, K, C, Z, Y, X}; - // strides from NHWGC to GNCHW laout - in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C}; - out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; - wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + // miopen strides to CK strides + auto miopen_in_strides = problem.GetIn().GetStrides(); + auto miopen_out_strides = problem.GetOut().GetStrides(); + auto miopen_wei_strides = problem.GetWeights().GetStrides(); + miopen_in_strides.insert(miopen_in_strides.begin(), C); + miopen_out_strides.insert(miopen_out_strides.begin(), K); + miopen_wei_strides.insert(miopen_wei_strides.begin(), K * miopen_wei_strides[0]); + std::copy(miopen_in_strides.begin(), miopen_in_strides.end(), in_strides.begin()); + std::copy(miopen_out_strides.begin(), miopen_out_strides.end(), out_strides.begin()); + std::copy(miopen_wei_strides.begin(), miopen_wei_strides.end(), wei_strides.begin()); strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), @@ -313,8 +320,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; - const std::string& arch = ctx.GetStream().GetDeviceName(); - if(!(arch == "gfx908" || arch == "gfx90a")) + if(!ck_utility::is_conv_ck_supported_hardware(ctx.GetStream().GetDeviceName(), false)) return false; switch(problem.GetInDataType()) { diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp index 6225410599..d0236e4f42 100644 --- 
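The conversion above is mechanical: MIOpen descriptors keep strides in NCDHW order, and CK expects a G-led array, so the only new information is the group stride. For NDHWGC memory the G dimension sits immediately left of the per-group channels, hence its stride is simply C. A sketch with assumed packed values:

```cpp
#include <cstddef>
#include <vector>

// Illustrative only: build CK's {G, N, C, D, H, W} stride array from MIOpen's
// NCDHW-ordered strides {sN, sC, sD, sH, sW} of an NDHWC tensor with C
// channels per group. (For weights, the patch inserts K * sK instead, since
// the groups are laid out as G consecutive blocks of K filters.)
std::vector<std::size_t> to_ck_input_strides(std::vector<std::size_t> s, std::size_t C)
{
    s.insert(s.begin(), C); // -> {sG, sN, sC, sD, sH, sW}
    return s;
}

// Packed NDHWC with N=2, G=2, C=8 (per group), D=H=W=4:
// descriptor strides {1024, 1, 256, 64, 16} -> CK strides {8, 1024, 1, 256, 64, 16}.
```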
a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp @@ -32,6 +32,7 @@ #include #include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include #include #endif #include @@ -84,10 +85,16 @@ struct CKArgs output = {G, N, K, Do, Ho, Wo}; weight = {G, K, C, Z, Y, X}; - // strides from NHWGC to GNCHW laout - in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C}; - out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; - wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + // miopen strides to CK strides + auto miopen_in_strides = problem.GetIn().GetStrides(); + auto miopen_out_strides = problem.GetOut().GetStrides(); + auto miopen_wei_strides = problem.GetWeights().GetStrides(); + miopen_in_strides.insert(miopen_in_strides.begin(), C); + miopen_out_strides.insert(miopen_out_strides.begin(), K); + miopen_wei_strides.insert(miopen_wei_strides.begin(), K * miopen_wei_strides[0]); + std::copy(miopen_in_strides.begin(), miopen_in_strides.end(), in_strides.begin()); + std::copy(miopen_out_strides.begin(), miopen_out_strides.end(), out_strides.begin()); + std::copy(miopen_wei_strides.begin(), miopen_wei_strides.end(), wei_strides.begin()); strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), @@ -309,10 +316,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable( return false; if(!problem.IsLayoutNHWC()) return false; - const std::string& arch = ctx.GetStream().GetDeviceName(); - if(miopen::StartsWith(arch, "gfx11") || miopen::StartsWith(arch, "gfx10")) - return false; - if(arch == "gfx906" || arch == "gfx900") + if(!ck_utility::is_conv_ck_supported_hardware(ctx.GetStream().GetDeviceName(), true)) return false; switch(problem.GetInDataType()) { diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp index e166781b9b..a6a0b1e2bc 100644 --- a/test/gpu_reference_kernel.cpp +++ b/test/gpu_reference_kernel.cpp @@ -376,13 +376,25 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); +#if MIOPEN_BACKEND_OPENCL + cl_context ctx; + clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); + cl_int status = CL_SUCCESS; + cl_mem in_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); + cl_mem wei_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); + cl_mem out_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); - +#endif EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 2, @@ -420,7 +432,27 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(out); - +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + 
wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -440,7 +472,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -480,8 +512,28 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP /// \ref copy_non_packed_output_before_convolution - EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -493,7 +545,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -533,7 +585,27 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base rand_tensor_integer(out); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); - +#if MIOPEN_BACKEND_OPENCL + status |= clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -546,7 +618,7 @@ struct gpu_reference_conv_2d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -672,6 +744,18 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base auto wei_sz = wei.data.size(); auto out_sz = out.data.size(); +#if MIOPEN_BACKEND_OPENCL + cl_context ctx; + clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); + cl_int status = CL_SUCCESS; + cl_mem in_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * in_sz, nullptr, &status); + cl_mem wei_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(TRef) * wei_sz, nullptr, nullptr); + cl_mem out_dev = + clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(Tout) * out_sz, nullptr, nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP void* in_dev; void* wei_dev; void* out_dev; @@ -679,7 +763,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base EXPECT(hipMalloc(&in_dev, sizeof(TRef) * in_sz) == hipSuccess); EXPECT(hipMalloc(&wei_dev, sizeof(TRef) * wei_sz) == hipSuccess); EXPECT(hipMalloc(&out_dev, sizeof(Tout) * out_sz) == hipSuccess); - +#endif EXPECT(miopenCreateConvolutionDescriptor(&convDesc) == miopenStatusSuccess); EXPECT(miopenInitConvolutionNdDescriptor(convDesc, 3, @@ -717,7 +801,27 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref 
copy_non_packed_output_before_convolution rand_tensor_integer(out); - +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -730,7 +834,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_forward(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -771,7 +875,27 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(wei); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(in); - +#if MIOPEN_BACKEND_OPENCL + status = clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + wei_dev, + CL_TRUE, + 0, + sizeof(TRef) * wei_sz, + wei.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP /// \ref copy_non_packed_output_before_convolution EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == @@ -784,7 +908,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base wei.data.data(), sizeof(TRef) * wei_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_data(miopen::deref(convDesc).GetSpatialDimension(), in, wei, @@ -824,7 +948,27 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base rand_tensor_integer(out, 3, -2); /// \ref copy_non_packed_output_before_convolution rand_tensor_integer(wei); - +#if MIOPEN_BACKEND_OPENCL + status |= clEnqueueWriteBuffer(q, + in_dev, + CL_TRUE, + 0, + sizeof(TRef) * in_sz, + in.data.data(), + 0, + nullptr, + nullptr); + status |= clEnqueueWriteBuffer(q, + out_dev, + CL_TRUE, + 0, + sizeof(TRef) * out_sz, + out.data.data(), + 0, + nullptr, + nullptr); + EXPECT(status == CL_SUCCESS); +#elif MIOPEN_BACKEND_HIP EXPECT(hipMemcpy( in_dev, in.data.data(), sizeof(TRef) * in_sz, hipMemcpyHostToDevice) == hipSuccess); @@ -837,7 +981,7 @@ struct gpu_reference_conv_3d : gpu_reference_kernel_base out.data.data(), sizeof(Tout) * out_sz, hipMemcpyHostToDevice) == hipSuccess); - +#endif cpu_convolution_backward_weight(miopen::deref(convDesc).GetSpatialDimension(), in, wei, diff --git a/test/gtest/conv_api_strided_tensors.cpp b/test/gtest/conv_api_strided_tensors.cpp index 04d56ec908..9a2876b3f0 100644 --- a/test/gtest/conv_api_strided_tensors.cpp +++ b/test/gtest/conv_api_strided_tensors.cpp @@ -139,9 +139,6 @@ class ConvStridedTensors : public ::testing::Test std::vector h_output; }; -/// \todo re-enable this test after NCDHW grouped convolution lands (PR 2429) -/// \todo add cpu reference convolution for verification --amberhassaan -#if 0 TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) { auto device = Device(handle); @@ -197,4 +194,3 @@ TEST_F(ConvStridedTensors, ConvStridedTensorsNotImplemented) miopenStatusSuccess); ASSERT_TRUE(device.Synchronize()); } -#endif diff --git a/test/gtest/nonpack_conv3d_fwd.cpp b/test/gtest/nonpack_conv3d_fwd.cpp new file mode 100644 index 0000000000..35cc492c74 --- /dev/null +++ b/test/gtest/nonpack_conv3d_fwd.cpp @@ -0,0 +1,98 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include + +#include "tensor_util.hpp" +#include "get_handle.hpp" +#include "nonpack_conv3d_fwd.hpp" + +struct ConvFwdSolverTest3D : ConvFwdSolverTest +{ +}; + +template +void SolverFwd(const miopen::TensorDescriptor& inputDesc, + ConstData_t input, + const miopen::TensorDescriptor& wDesc, + ConstData_t weight, + const miopen::TensorDescriptor& outputDesc, + Data_t output, + const miopen::ConvolutionDescriptor& convDesc, + const NonPackTestCase& conv_config, + bool& test_skipped) +{ + auto&& handle = get_handle(); + + Solver solv{}; + + const auto tensors = + miopen::ConvFwdTensors{inputDesc, input, wDesc, weight, outputDesc, output}; + + const auto problem = miopen::conv::ProblemDescription{ + inputDesc, wDesc, outputDesc, convDesc, miopen::conv::Direction::Forward}; + auto ctx = miopen::ExecutionContext{}; + + ctx.SetStream(&handle); + + if(!solv.IsApplicable(ctx, problem)) + { + test_skipped = true; + GTEST_SKIP() << solv.SolverDbId() + << "ConvHipImplicitGemm3DGroupFwdXdlops Not Applicable for this problem" + << conv_config; + } + const auto invoke_params = miopen::conv::DataInvokeParams{tensors, nullptr, 0, false}; + + ASSERT_TRUE(solv.IsApplicable(ctx, problem)); + auto sol = solv.GetSolution(ctx, problem, solv.GetDefaultPerformanceConfig(ctx, problem)); + ASSERT_TRUE(sol.Succeeded()); + ASSERT_TRUE(sol.invoker_factory); + const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); + (invoker)(handle, invoke_params); + handle.Finish(); +} + +TEST_P(ConvFwdSolverTest3D, CKNonPackConvFwd3D) +{ + SolverFwd(input.desc, + in_dev.get(), + weights.desc, + wei_dev.get(), + output.desc, + out_dev.get(), + conv_desc, + conv_config, + test_skipped); +} + +INSTANTIATE_TEST_SUITE_P(ConvFwdTest, + ConvFwdSolverTest3D, + testing::Combine(testing::Values(miopenConvolutionFwdAlgoImplicitGEMM), + testing::ValuesIn(ConvTestConfigs()), + testing::Values(miopenTensorNDHWC))); diff --git a/test/gtest/nonpack_conv3d_fwd.hpp b/test/gtest/nonpack_conv3d_fwd.hpp new file mode 100644 index 0000000000..5b3677cbb4 --- /dev/null +++ b/test/gtest/nonpack_conv3d_fwd.hpp @@ -0,0 +1,157 @@ 
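For reference, SolverFwd above is the direct-invocation pattern for exercising a single solver without going through the Find or Immediate APIs. Stripped of the test plumbing, the sequence condenses to the sketch below (error handling and tensor setup omitted):

```cpp
// Condensed from SolverFwd above:
//
//     Solver solv{};                            // e.g. the CK 3D grouped fwd solver
//     auto ctx = miopen::ExecutionContext{};
//     ctx.SetStream(&handle);                   // bind the handle's stream
//     if(!solv.IsApplicable(ctx, problem))      // skip unsupported configs
//         return;
//     auto sol = solv.GetSolution(ctx, problem,
//                                 solv.GetDefaultPerformanceConfig(ctx, problem));
//     const auto invoker =
//         handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params);
//     invoker(handle, invoke_params);           // enqueue the kernel(s)
//     handle.Finish();                          // block until completion
```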
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "conv3d_test_case.hpp" + +struct NonPackTestCase : Conv3DTestCase +{ + size_t i0; + size_t i1; + size_t i2; + size_t i3; + size_t i4; + size_t w0; + size_t w1; + size_t w2; + size_t w3; + size_t w4; + size_t o0; + size_t o1; + size_t o2; + size_t o3; + size_t o4; + std::vector GetInputStrides() { return {i0, i1, i2, i3, i4}; } + std::vector GetWeightStrides() { return {w0, w1, w2, w3, w4}; } + std::vector GetOutputStrides() { return {o0, o1, o2, o3, o4}; } +}; + +std::vector ConvTestConfigs() +{ // g n c d h w k z y x pad_x pad_y pad_z stri_x stri_y stri_z dia_x dia_y dia_z + return {{{1, 4, 16, 4, 9, 16, 16, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, + 10240, + 1, + 2560, + 160, + 16, + 432, + 1, + 144, + 48, + 16, + 9216, + 1, + 2304, + 256, + 16}, + {{1, 1, 64, 3, 16, 16, 128, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, miopenConvolution}, + 65536, + 1, + 24000, + 2048, + 64, + 1728, + 1, + 576, + 192, + 64, + 98304, + 1, + 32768, + 2048, + 128}}; +} + +template +struct ConvFwdSolverTest + : public ::testing::TestWithParam< + std::tuple> +{ +protected: + void SetUp() override + { + test_skipped = false; + + std::tie(algo, conv_config, tensor_layout) = GetParam(); + input = tensor{ + miopen_type{}, tensor_layout, conv_config.GetInput(), conv_config.GetInputStrides()}; + weights = tensor{miopen_type{}, tensor_layout, conv_config.GetWeights()}; + std::random_device rd{}; + std::mt19937 gen{rd()}; + std::uniform_real_distribution<> d{-3, 3}; + auto gen_value = [&](auto...) 
{ return d(gen); };
+        input.generate(gen_value);
+        weights.generate(gen_value);
+        conv_desc = conv_config.GetConv();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+        output = tensor{miopen_type{}, tensor_layout, output_desc.GetLengths()};
+        std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN());
+        auto&& handle = get_handle();
+        in_dev        = handle.Write(input.data);
+        wei_dev       = handle.Write(weights.data);
+        out_dev       = handle.Write(output.data);
+    }
+    void TearDown() override
+    {
+        if(test_skipped)
+            return;
+
+        auto&& handle = get_handle();
+
+        miopen::TensorDescriptor output_desc =
+            conv_desc.GetForwardOutputTensor(input.desc, weights.desc, GetDataType());
+        ref_out = tensor{miopen_type{}, tensor_layout, output_desc.GetLengths()};
+        ref_out = ref_conv_fwd(input, weights, output, conv_desc);
+        output.data = handle.Read(out_dev, output.data.size());
+        EXPECT_FALSE(miopen::range_zero(ref_out)) << "Cpu data is all zeros";
+        EXPECT_FALSE(miopen::range_zero(output)) << "Gpu data is all zeros";
+        EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output));
+
+        const double tolerance = 80;
+        double threshold       = std::numeric_limits::epsilon() * tolerance;
+        auto error             = miopen::rms_range(ref_out, output);
+
+        EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0)
+            << "Non finite number found in the CPU data";
+
+        EXPECT_TRUE(error < threshold)
+            << "Error beyond tolerance. Error: " << error << ", Threshold: " << threshold;
+    }
+    NonPackTestCase conv_config;
+    miopen::ConvolutionDescriptor conv_desc;
+    tensor input;
+    tensor weights;
+    tensor output;
+    tensor ref_out;
+    miopen::Allocator::ManageDataPtr in_dev;
+    miopen::Allocator::ManageDataPtr wei_dev;
+    miopen::Allocator::ManageDataPtr out_dev;
+    miopenConvFwdAlgorithm_t algo = miopenConvolutionFwdAlgoImplicitGEMM;
+    bool test_skipped             = false;
+    miopenTensorLayout_t tensor_layout;
+};

From 251c1d18718c83db9ad42adcf5a497e63c2a3eba Mon Sep 17 00:00:00 2001
From: Jun Liu
Date: Thu, 12 Oct 2023 01:09:25 -0700
Subject: [PATCH 35/36] Bump CK commit for ROCm 6.0 (#2439)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bd4a9cb992..9d0d711550 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,4 @@ nlohmann/json@v3.9.1 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off
 ROCmSoftwarePlatform/FunctionalPlus@v0.2.18-p0
 ROCmSoftwarePlatform/eigen@3.4.0
 ROCmSoftwarePlatform/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50
-ROCmSoftwarePlatform/composable_kernel@114c2646df9c45c531befc9eff9315b405098f56 -DDTYPES="fp16;fp32;bf16" -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON
\ No newline at end of file
+ROCmSoftwarePlatform/composable_kernel@503204136541d6d58194025f4220b603693e391c -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON
\ No newline at end of file

From 300045c73b2ef71887abbc60bc8f96488a5c999a Mon Sep 17 00:00:00 2001
From: Chris Erb
Date: Mon, 16 Oct 2023 21:55:35 -0500
Subject: [PATCH 36/36] [MI100][MI200] Kernel db updates (#2454)

---
 src/kernels/gfx908.kdb.bz2 | 4 ++--
 src/kernels/gfx90a.kdb.bz2 | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/kernels/gfx908.kdb.bz2 b/src/kernels/gfx908.kdb.bz2
index d19d4e35c2..014466d9d3 100644
--- a/src/kernels/gfx908.kdb.bz2
+++ b/src/kernels/gfx908.kdb.bz2
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:888797fb8848d87096fa447f82e96bbe61ecfca0f754667638fd6efa8da3004e
-size 336743541
+oid sha256:7f5b1925fbd2f58ab7011913867fcd59222e3cc09dee040f08acb6e7e22e0fda
+size 250110225
diff --git a/src/kernels/gfx90a.kdb.bz2 b/src/kernels/gfx90a.kdb.bz2
index 613df18f7a..b811a6b688 100644
--- a/src/kernels/gfx90a.kdb.bz2
+++ b/src/kernels/gfx90a.kdb.bz2
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d76d7c53648f4864a5cfe9267e8cb9171abab81de9d1732a9f94bafb0816b61
-size 250548882
+oid sha256:d22333c785c9081f293e56bc3f5baccc5410d578d2be1dc5e5d523d02de8b5ed
+size 250698825