diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02496548e8..43bc4d798a 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -33,3 +33,4 @@ The MIOpen API library is structured as follows: * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`SGD <../doxygen/html/group___s_g_d>` (experimental) * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) + * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index de1a0edd2d..afd10b6232 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -43,6 +43,7 @@ add_executable(MIOpenDriver dm_dropout.cpp dm_fusion.cpp dm_gemm.cpp + dm_getitem.cpp dm_groupnorm.cpp dm_layernorm.cpp dm_lrn.cpp diff --git a/driver/InputFlags.cpp b/driver/InputFlags.cpp index 41f872b0e8..12df05cfb5 100644 --- a/driver/InputFlags.cpp +++ b/driver/InputFlags.cpp @@ -292,6 +292,165 @@ TensorParameters InputFlags::GetValueTensor(const std::string& long_name) const MIOPEN_THROW("Too many tensor descriptor parameters."); } + +TensorParametersUint64 InputFlags::GetValueTensorUint64(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + const auto components = miopen::SplitDelim(input.value.c_str(), ','); + + if(components.size() < 1) + return {}; + + auto parse = [](auto line) { + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(line, 'x'); + for(auto&& str : strs) + { + auto elem = uint64_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + line + "."); + + ret.push_back(elem); + } + return ret; + }; + + auto lens = parse(components[0]); + + if(components.size() == 1) + return {lens}; + + auto layout = std::string{}; + auto strides = std::vector{}; + + if(std::isdigit(components[1][0])) + strides = parse(components[1]); + else + layout = 
components[1]; + + if(components.size() == 2) + return {lens, strides, layout}; + + MIOPEN_THROW("Too many tensor descriptor parameters."); +} + +std::vector InputFlags::GetValueVectorInt(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(input.value.c_str(), ','); + + for(auto&& str : strs) + { + auto elem = int32_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + input.value.c_str() + "."); + + ret.push_back(elem); + } + + return ret; +} + +std::vector InputFlags::GetValueVectorUint64(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(input.value.c_str(), ','); + + for(auto&& str : strs) + { + auto elem = uint64_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + input.value.c_str() + "."); + + ret.push_back(elem); + } + + return ret; +} + +std::vector> +InputFlags::GetValue2dVectorInt(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + const auto components = miopen::SplitDelim(input.value.c_str(), ','); + auto output = std::vector>{}; + + if(components.size() < 1) + return {}; + + auto parse = [](auto line) { + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(line, 'x'); + for(auto&& str : strs) + { + auto elem = int32_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + line + "."); + + ret.push_back(elem); + } + return ret; + }; + + for(auto&& component : components) + { + output.push_back(parse(component)); + } + + return output; +} + +std::vector> 
+InputFlags::GetValue2dVectorUint64(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + const auto components = miopen::SplitDelim(input.value.c_str(), ','); + auto output = std::vector>{}; + + if(components.size() < 1) + return {}; + + auto parse = [](auto line) { + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(line, 'x'); + for(auto&& str : strs) + { + auto elem = uint64_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + line + "."); + + ret.push_back(elem); + } + return ret; + }; + + for(auto&& component : components) + { + output.push_back(parse(component)); + } + + return output; +} + void InputFlags::SetValue(const std::string& long_name, const std::string& new_value) { char short_name = FindShortName(long_name); diff --git a/driver/InputFlags.hpp b/driver/InputFlags.hpp index 557a895b11..43f7c3a206 100644 --- a/driver/InputFlags.hpp +++ b/driver/InputFlags.hpp @@ -63,6 +63,25 @@ struct TensorParameters void CalculateStrides(); }; +struct TensorParametersUint64 +{ + std::vector lengths = {}; + std::vector strides = {}; + std::string layout = ""; + + TensorParametersUint64 FillMissing(const TensorParametersUint64& other) const + { + return { + (lengths.empty() ? other.lengths : lengths), + (strides.empty() ? other.strides : strides), + (layout.empty() ? 
other.layout : layout), + }; + } + + uint64_t SetTensordDescriptor(miopenTensorDescriptor_t result, miopenDataType_t data_type); + void CalculateStrides(); +}; + class InputFlags { std::map MapInputs; @@ -90,6 +109,11 @@ class InputFlags uint64_t GetValueUint64(const std::string& _long_name) const; double GetValueDouble(const std::string& _long_name) const; TensorParameters GetValueTensor(const std::string& long_name) const; + TensorParametersUint64 GetValueTensorUint64(const std::string& long_name) const; + std::vector GetValueVectorInt(const std::string& long_name) const; + std::vector GetValueVectorUint64(const std::string& long_name) const; + std::vector> GetValue2dVectorInt(const std::string& long_name) const; + std::vector> GetValue2dVectorUint64(const std::string& long_name) const; void SetValue(const std::string& long_name, const std::string& new_value); void StoreOptionalFlagValue(char short_name, const std::string& input_value); diff --git a/driver/addlayernorm_driver.hpp b/driver/addlayernorm_driver.hpp index 4741d2d820..e74a1548e6 100644 --- a/driver/addlayernorm_driver.hpp +++ b/driver/addlayernorm_driver.hpp @@ -176,8 +176,8 @@ class AddLayerNormDriver : public Driver std::vector weight; std::vector bias; std::vector out; - std::vector mean; - std::vector rstd; + std::vector mean; + std::vector rstd; std::vector outhost; std::vector meanhost; std::vector rstdhost; @@ -259,7 +259,7 @@ int AddLayerNormDriver::AddCmdLineArgs() inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); inflags.AddInputFlag("normalized_dim", 'o', "3", "Nomalized Dim (Default=3)", "int"); inflags.AddInputFlag( - "mode", 'm', "0", "elemwise affine mode (0), weight and bias mode (1) (Default=0)", "int"); + "mode", 'm', "2", "elemwise affine mode (2), weight and bias mode (3) (Default=2)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", 
"int"); @@ -291,16 +291,16 @@ int AddLayerNormDriver::AllocateBuffersAndCopy() weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); bias_dev = std::unique_ptr(new GPUMem(ctx, bias_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); - rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); + mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tgpu))); + rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tgpu))); in = std::vector(in_sz, Tgpu0val); in2 = std::vector(in2_sz, Tgpu0val); weight = std::vector(weight_sz, Tgpu0val); bias = std::vector(bias_sz, Tgpu0val); out = std::vector(out_sz, Tgpu0val); - mean = std::vector(mean_sz, Tref0val); - rstd = std::vector(rstd_sz, Tref0val); + mean = std::vector(mean_sz, Tgpu0val); + rstd = std::vector(rstd_sz, Tgpu0val); outhost = std::vector(out_sz, Tref0val); meanhost = std::vector(mean_sz, Tref0val); rstdhost = std::vector(rstd_sz, Tref0val); diff --git a/driver/cat_driver.hpp b/driver/cat_driver.hpp index 51eb16b1c7..3254b5f3bc 100644 --- a/driver/cat_driver.hpp +++ b/driver/cat_driver.hpp @@ -183,8 +183,8 @@ template int CatDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Cat (Default=1)", "int"); - inflags.AddTensorFlag("input1", '1', "", "input1 tensor descriptor"); - inflags.AddTensorFlag("input2", '2', "", "input2 tensor descriptor"); + inflags.AddTensorFlag("input1", '1', "2x32x128x128x128", "input1 tensor descriptor"); + inflags.AddTensorFlag("input2", '2', "2x32x128x128x128", "input2 tensor descriptor"); inflags.AddTensorFlag("input3", '3', "", "input3 tensor descriptor"); inflags.AddTensorFlag("input4", '4', "", "input4 tensor descriptor"); inflags.AddTensorFlag("input5", '5', "", "input5 tensor descriptor"); diff --git a/driver/dm_getitem.cpp b/driver/dm_getitem.cpp new file mode 100644 index 0000000000..bfb72be96a --- /dev/null 
+++ b/driver/dm_getitem.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "getitem_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "getitem") + return new GetitemDriver(); + if(base_arg == "getitemfp16") + return new GetitemDriver(); + if(base_arg == "getitembfp16") + return new GetitemDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index bf951a54ad..a36121f676 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,13 +169,13 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " + printf("Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw\n"); + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " + "getitem[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -190,22 +190,22 @@ inline std::string ParseBaseArg(int argc, char* argv[]) std::string arg = argv[1]; if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && - arg != "convfp8" && arg != "convbfp8" && arg != "CBAInfer" && arg != "CBAInferfp16" && arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && arg != "softmax" && arg != 
"softmaxfp16" && arg != "bnorm" && arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" && - arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && - arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && - arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && - arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && - arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && - arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" && - arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" && - arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && + arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" && + arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" && + arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" && + arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && + arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" && + arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" && + arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" && + arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && - arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "--version") + arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && + arg != "getitemfp16" && arg != "getitembfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff 
--git a/driver/getitem_driver.hpp b/driver/getitem_driver.hpp new file mode 100644 index 0000000000..c48c9a0520 --- /dev/null +++ b/driver/getitem_driver.hpp @@ -0,0 +1,545 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_GETITEM_DRIVER_HPP +#define GUARD_MIOPEN_GETITEM_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include "random.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +template +int32_t mloGetitemBackwardRunHost(miopenTensorDescriptor_t dyDesc, + uint32_t indexCount, + miopenTensorDescriptor_t* indexDescs, + miopenTensorDescriptor_t dxDesc, + miopenTensorDescriptor_t errorDesc, + Tgpu* dy, + int32_t** indexs, + Tcheck* dxhost, + int32_t* errorhost, + uint32_t dimCount, + int32_t* dims, + uint32_t sliceCount, + int32_t* slices, + uint32_t offset) +{ + auto dy_dims = miopen::deref(dyDesc).GetLengths(); + auto dy_numel = std::accumulate(dy_dims.begin(), dy_dims.end(), 1L, std::multiplies()); + auto dx_dims = miopen::deref(dxDesc).GetLengths(); + auto index_dims = miopen::deref(indexDescs[0]).GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + auto element_index = std::vector(indexCount * index_numel + indexCount); + + std::vector output_dims; + for(int32_t i = 0; i < dimCount; i++) + { + output_dims.push_back(dx_dims[dims[i]]); + } + + auto dim_info_offset = indexCount > 0 ? 
indexCount * index_dims[0] : 0; + auto start_dim = dims[0]; + + auto dy_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dyDesc)); + auto dxhost_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dxDesc)); + miopen::slice_tv<5>(dxhost_tv, sliceCount, slices); + + int32_t ret = 0; + + // Get element index form indexs + for(size_t j = 0; j < indexCount; j++) + { + const auto& index_dim = dims[j]; + const auto& dim_size = output_dims[j]; + + for(size_t o = 0; o < index_numel; o++) + { + int32_t getitem_index = indexs[j][o]; + + if(getitem_index >= 0 && getitem_index < dim_size) + { + element_index[(o * indexCount) + j] = getitem_index; + } + else if(getitem_index >= -dim_size && getitem_index < 0) + { + element_index[(o * indexCount) + j] = getitem_index + dim_size; + } + else + { + errorhost[j] = -1; + } + + if(o == 0) + { + element_index[dim_info_offset + j] = index_dim; + } + } + } + + // GetItem + for(size_t o = 0; o < dy_numel; o++) + { + tensor_layout_t<5> ncdhw(dy_tv, o); + tensor_layout_t<5> idx(ncdhw); + + if(indexCount > 0) + { + size_t dim_cursor = ncdhw.layout[start_dim]; + size_t i = start_dim; + size_t j = 0; + + for(; i < start_dim + indexCount; ++i, ++j) + { + size_t dim_idx = element_index[dim_info_offset + j]; + idx.layout[dim_idx] = element_index[(dim_cursor * indexCount) + j]; + } + + i = element_index[dim_info_offset + indexCount - 1] + 1; + dim_cursor = start_dim + 1; + for(; i < 5; ++i, ++dim_cursor) + { + idx.layout[i] = ncdhw.layout[dim_cursor]; + } + } + + dxhost[dxhost_tv.get_tensor_view_idx(idx)] += dy[dy_tv.get_tensor_view_idx(ncdhw)]; + } + + return ret; +} + +template +class GetitemDriver : public Driver +{ +public: + GetitemDriver() : Driver() + { + miopenCreateTensorDescriptor(&dyDesc); + miopenCreateTensorDescriptor(&dxDesc); + miopenCreateTensorDescriptor(&errorDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() 
override { return inflags; } + + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + + int VerifyBackward() override; + int VerifyForward() override; + ~GetitemDriver() override + { + miopenDestroyTensorDescriptor(dyDesc); + for(auto indexDesc : indexDescs) + { + miopenDestroyTensorDescriptor(indexDesc); + } + miopenDestroyTensorDescriptor(dxDesc); + miopenDestroyTensorDescriptor(errorDesc); + } + +private: + InputFlags inflags; + + miopenTensorDescriptor_t dyDesc; + std::vector indexDescs; + miopenTensorDescriptor_t dxDesc; + miopenTensorDescriptor_t errorDesc; + + std::unique_ptr dy_dev; + std::vector> index_devs; + std::unique_ptr dx_dev; + std::unique_ptr error_dev; + std::unique_ptr workspace_dev; + + std::vector dy; + std::vector> indexs; + std::vector dx; + std::vector error; + std::vector workspace; + std::vector dxhost; + std::vector errorhost; + + size_t ws_sizeInBytes; + + std::vector dims; + std::vector> slices; + std::vector slices_flat; + uint32_t offset; + + std::vector output_dims; + std::vector index_devs_ptr; + std::vector indexs_ptr; +}; + +template +int GetitemDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + if(inflags.GetValueInt("indexcount") < 0) + MIOPEN_THROW("Index count is negative: " + inflags.GetValueStr("indexcount") + "."); + + if(inflags.GetValueInt("dimcount") < 0) + MIOPEN_THROW("Dim count is negative: " + inflags.GetValueStr("dimcount") + "."); + + if(inflags.GetValueInt("slicecount") < 0) + MIOPEN_THROW("Slice count is negative: " + inflags.GetValueStr("slicecount") + "."); + + return miopenStatusSuccess; +} + +template +int GetitemDriver::GetandSetData() +{ + auto dyTensorParam = inflags.GetValueTensorUint64("doutput"); + auto dxTensorParam = 
inflags.GetValueTensorUint64("dinput"); + auto indexCountParam = inflags.GetValueInt("indexcount"); + auto dimCountParam = inflags.GetValueInt("dimcount"); + auto sliceCountParam = inflags.GetValueInt("slicecount"); + offset = inflags.GetValueInt("offset"); + + auto indexTensorLengths = inflags.GetValue2dVectorInt("indexs"); + if(indexTensorLengths.size() != indexCountParam) + MIOPEN_THROW("Error parsing indexs tensor: " + inflags.GetValueStr("indexs") + "."); + + dims = inflags.GetValueVectorInt("dims"); + if(dims.size() != dimCountParam) + MIOPEN_THROW("Error parsing dims tensor: " + inflags.GetValueStr("dims") + "."); + + for(auto dim : dims) + { + output_dims.push_back(dxTensorParam.lengths[dim]); + } + + slices = inflags.GetValue2dVectorInt("slices"); + if(slices.size() != sliceCountParam) + MIOPEN_THROW("Error parsing slices: " + inflags.GetValueStr("slices") + "."); + + for(auto slice : slices) + { + for(int32_t i = 0; i < 4; i++) + { + slices_flat.push_back(slice[i]); + } + } + + if(SetTensorNd(dyDesc, dyTensorParam.lengths, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing doutput tensor: " + inflags.GetValueStr("doutput") + "."); + + for(auto indexTensorLength : indexTensorLengths) + { + miopenTensorDescriptor_t indexDesc; + miopenCreateTensorDescriptor(&indexDesc); + if(SetTensorNd(indexDesc, indexTensorLength, miopenInt32) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing indexs tensor: " + inflags.GetValueStr("indexs") + "."); + indexDescs.push_back(indexDesc); + } + + if(SetTensorNd(dxDesc, dxTensorParam.lengths, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing dinput tensor: " + inflags.GetValueStr("dinput") + "."); + + std::vector error_length; + error_length.push_back(indexCountParam); + if(SetTensorNd(errorDesc, error_length, miopen_type{}) != miopenStatusSuccess) + MIOPEN_THROW("Error making error tensor: " + inflags.GetValueStr("indexcount") + "."); + + return 0; +} + +template +int 
GetitemDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "0", "Run only Forward Getitem (Default=0)", "int"); + inflags.AddTensorFlag("doutput", 'O', "128x128", "doutput tensor descriptor"); + inflags.AddTensorFlag("indexs", 'D', "128", "indexs tensor descriptor"); + inflags.AddTensorFlag("dinput", 'N', "128x128", "dinput tensor descriptor"); + + inflags.AddInputFlag("indexcount", '1', "1", "the number of indexs tensor(Default=1)", "int"); + inflags.AddInputFlag("dimcount", '2', "1", "The dimensions(Default=1)", "int"); + inflags.AddInputFlag("dims", '3', "0", "The dimensions(Default=0)", "vector"); + inflags.AddInputFlag("slicecount", '4', "0", "The number of slices(Default=0)", "int"); + inflags.AddInputFlag("slices", + '5', + "", + "The slices(Default=\'\'" + ")", + "vector>"); + inflags.AddInputFlag("offset", '6', "0", "The offset of output(Default=0)", "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int GetitemDriver::AllocateBuffersAndCopy() +{ + size_t dy_sz = GetTensorSize(dyDesc); + size_t dx_sz = GetTensorSize(dxDesc); + size_t error_sz = GetTensorSize(errorDesc); + + miopenGetGetitemWorkspaceSize( + GetHandle(), indexDescs.size(), indexDescs.data(), &ws_sizeInBytes); + + uint32_t ctx = 0; + + dy_dev = std::unique_ptr(new GPUMem(ctx, dy_sz, sizeof(Tgpu))); + dx_dev = std::unique_ptr(new GPUMem(ctx, dx_sz, sizeof(Tgpu))); + error_dev = std::unique_ptr(new GPUMem(ctx, error_sz, sizeof(int32_t))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + dy = std::vector(dy_sz, static_cast(0)); + dx = std::vector(dx_sz, static_cast(0)); + error 
= std::vector(error_sz, static_cast(0)); + workspace = std::vector(ws_sizeInBytes / sizeof(int32_t), static_cast(0)); + dxhost = std::vector(dx_sz, static_cast(0)); + errorhost = std::vector(error_sz, static_cast(0)); + + for(int32_t i = 0; i < dy_sz; i++) + { + dy[i] = prng::gen_A_to_B(static_cast(-1), static_cast(1)); + } + + for(int32_t i = 0; i < indexDescs.size(); i++) + { + size_t index_sz = GetTensorSize(indexDescs[i]); + index_devs.push_back(std::unique_ptr(new GPUMem(ctx, index_sz, sizeof(int32_t)))); + indexs.push_back(std::vector(index_sz, static_cast(0))); + auto& index = indexs.back(); + auto index_dev = index_devs.back().get(); + + for(int j = 0; j < index_sz; j++) + { + index[j] = prng::gen_A_to_B(static_cast(0), + static_cast(output_dims[i])); + } + if(index_dev->ToGPU(GetStream(), index.data()) != 0) + std::cerr << "Error copying (index) to GPU, size: " << index_dev->GetSize() + << std::endl; + index_devs_ptr.push_back(index_dev->GetMem()); + indexs_ptr.push_back(index.data()); + } + + if(dy_dev->ToGPU(GetStream(), dy.data()) != 0) + std::cerr << "Error copying (dy) to GPU, size: " << dy_dev->GetSize() << std::endl; + + if(workspace_dev->ToGPU(GetStream(), workspace.data()) != 0) + std::cerr << "Error copying (workspace) to GPU, size: " << workspace_dev->GetSize() + << std::endl; + + if(dx_dev->ToGPU(GetStream(), dx.data()) != 0) + std::cerr << "Error copying (dx) to GPU, size: " << dx_dev->GetSize() << std::endl; + + if(error_dev->ToGPU(GetStream(), error.data()) != 0) + std::cerr << "Error copying (error) to GPU, size: " << error_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int GetitemDriver::RunForwardGPU() +{ + return miopenStatusSuccess; +} + +template +int GetitemDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int32_t i = 0; i < inflags.GetValueInt("iter"); i++) + { + + if(dx_dev->ToGPU(GetStream(), dx.data()) != 0) + std::cerr << 
"Error copying (dx) to GPU, size: " << dx_dev->GetSize() << std::endl; + + miopenGetitemBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + dyDesc, + dy_dev->GetMem(), + indexDescs.size(), + indexDescs.data(), + index_devs_ptr.data(), + dxDesc, + dx_dev->GetMem(), + errorDesc, + error_dev->GetMem(), + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + float time = 0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int32_t iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward Getitem Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward Getitem Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dx_dev->FromGPU(GetStream(), dx.data()) != 0) + std::cerr << "Error copying (dx_dev) from GPU, size: " << dx_dev->GetSize() << std::endl; + + if(error_dev->FromGPU(GetStream(), error.data()) != 0) + std::cerr << "Error copying (error_dev) from GPU, size: " << error_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int GetitemDriver::RunBackwardCPU() +{ + mloGetitemBackwardRunHost(dyDesc, + indexDescs.size(), + indexDescs.data(), + dxDesc, + errorDesc, + dy.data(), + indexs_ptr.data(), + dxhost.data(), + errorhost.data(), + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + return miopenStatusSuccess; +} + +template +Tref GetitemDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. 
+ // In the case of getitem, there is a cumulative sum operation, and in the case of + // floating point operation, the result value can change if the order of the summed values + // is changed. So apply a threshold that is 10 times larger than other operations. + auto tolerance = std::is_same::value ? 1.5e-4 : 8.2e-1; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + // If there is an atomic operation on the GPU kernel, a large error occurs depending on the + // calculation order, so a much larger factor is applied. + if(std::is_same::value) + tolerance *= 8000.0; + return tolerance; +} + +template +int GetitemDriver::VerifyForward() +{ + return miopenStatusSuccess; +} + +template +int GetitemDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + + auto error_dx = miopen::rms_range(dxhost, dx); + + if(!std::isfinite(error_dx) || error_dx > tolerance) + { + std::cout << "Backward Getitem FAILED: " << error_dx << " > " << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward Getitem Verifies OK on CPU reference (" << error_dx << " < " + << tolerance << ')' << std::endl; + } + + auto error_error = miopen::rms_range(errorhost, error); + + if(!std::isfinite(error_error) || std::abs(static_cast(error_error)) != 0.0f) + { + std::cout << "Backward Getitem FAILED: Result does not equal" << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward Getitem Verifies OK on CPU and GPU" << std::endl; + } + + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_GETITEM_DRIVER_HPP diff --git a/driver/groupnorm_driver.hpp b/driver/groupnorm_driver.hpp index c143496cdd..1e97f541a0 100644 --- a/driver/groupnorm_driver.hpp +++ b/driver/groupnorm_driver.hpp @@ -110,8 +110,8 @@ class GroupNormDriver : public Driver std::vector weight; std::vector bias; std::vector out; - std::vector mean; - std::vector rstd; + std::vector mean; + std::vector rstd; std::vector outhost; std::vector 
meanhost; std::vector rstdhost; @@ -158,14 +158,14 @@ template int GroupNormDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward GroupNorm (Default=1)", "int"); - inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int"); - inflags.AddInputFlag("in_channels", 'c', "6", "Number of Input Channels (Default=6)", "int"); - inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); - inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); - inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); + inflags.AddInputFlag("batchsize", 'n', "32", "Mini-batch size (Default=32)", "int"); + inflags.AddInputFlag("in_channels", 'c', "32", "Number of Input Channels (Default=32)", "int"); + inflags.AddInputFlag("in_d", 'D', "14", "Input Depth (Default=14)", "int"); + inflags.AddInputFlag("in_h", 'H', "14", "Input Height (Default=14)", "int"); + inflags.AddInputFlag("in_w", 'W', "14", "Input Width (Default=14)", "int"); inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); - inflags.AddInputFlag("num_groups", 'g', "3", "num_groups", "int"); + inflags.AddInputFlag("num_groups", 'g', "4", "num_groups", "int"); inflags.AddInputFlag( "mode", 'm', "0", "elemwise affine mode (0), weight and bias mode (1) (Default=0)", "int"); @@ -224,15 +224,15 @@ int GroupNormDriver::AllocateBuffersAndCopy() weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); bias_dev = std::unique_ptr(new GPUMem(ctx, bias_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); - rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); + mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tgpu))); + rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tgpu))); in = std::vector(in_sz, static_cast(0)); weight = std::vector(weight_sz, 
static_cast(0)); bias = std::vector(bias_sz, static_cast(0)); out = std::vector(out_sz, static_cast(0)); - mean = std::vector(mean_sz, static_cast(0)); - rstd = std::vector(rstd_sz, static_cast(0)); + mean = std::vector(mean_sz, static_cast(0)); + rstd = std::vector(rstd_sz, static_cast(0)); outhost = std::vector(out_sz, static_cast(0)); meanhost = std::vector(mean_sz, static_cast(0)); rstdhost = std::vector(rstd_sz, static_cast(0)); @@ -347,23 +347,14 @@ int GroupNormDriver::RunBackwardGPU() template Tref GroupNormDriver::GetTolerance() { - if(data_type == miopenHalf) - { - return 1e-3; - } - else if(data_type == miopenFloat) - { - return 5e-5; - } - else if(data_type == miopenDouble) - { - return 1e-10; - } - else if(data_type == miopenBFloat16) - { - return 5e-3; - } - return 0; + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; } template diff --git a/driver/layernorm_driver.hpp b/driver/layernorm_driver.hpp index ea5b841c08..5bdf82ce85 100644 --- a/driver/layernorm_driver.hpp +++ b/driver/layernorm_driver.hpp @@ -166,8 +166,8 @@ class LayerNormDriver : public Driver std::vector weight; std::vector bias; std::vector out; - std::vector mean; - std::vector rstd; + std::vector mean; + std::vector rstd; std::vector outhost; std::vector meanhost; std::vector rstdhost; @@ -276,15 +276,15 @@ int LayerNormDriver::AllocateBuffersAndCopy() weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); bias_dev = std::unique_ptr(new GPUMem(ctx, bias_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); - rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); + mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tgpu))); + rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tgpu))); in = std::vector(in_sz, Tgpu0val); weight = std::vector(weight_sz, Tgpu0val); bias = std::vector(bias_sz, Tgpu0val); out = std::vector(out_sz, Tgpu0val); - mean = std::vector(mean_sz, Tref0ref); - rstd = std::vector(rstd_sz, Tref0ref); + mean = std::vector(mean_sz, Tgpu0val); + rstd = std::vector(rstd_sz, Tgpu0val); outhost = std::vector(out_sz, Tref0ref); meanhost = std::vector(mean_sz, Tref0ref); rstdhost = std::vector(rstd_sz, Tref0ref); diff --git a/driver/t5layernorm_driver.hpp b/driver/t5layernorm_driver.hpp index 94a4f6b934..3d02a2c3f4 100644 --- a/driver/t5layernorm_driver.hpp +++ b/driver/t5layernorm_driver.hpp @@ -318,12 +318,12 @@ int T5LayerNormDriver::GetandSetData() template int T5LayerNormDriver::AddCmdLineArgs() { - inflags.AddInputFlag("forw", 'F', "1", "Run only Forward T5LayerNorm (Default=1)", "int"); + inflags.AddInputFlag("forw", 'F', "0", "Run only Forward T5LayerNorm (Default=1)", "int"); 
inflags.AddTensorFlag("input", 'X', "100x3x32x32", "input tensor descriptor"); inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); inflags.AddInputFlag( - "mode", 'm', "0", "elemwise affine mode (0), weight mode (1) (Default=0)", "int"); + "mode", 'm', "5", "elemwise affine mode (5), weight mode (6) (Default=5)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); diff --git a/driver/tensor_driver.hpp b/driver/tensor_driver.hpp index f6868fab98..c353a6ee11 100644 --- a/driver/tensor_driver.hpp +++ b/driver/tensor_driver.hpp @@ -173,6 +173,13 @@ inline int SetTensorNd(miopenTensorDescriptor_t t, return miopenSetTensorDescriptor(t, data_type, len.size(), len.data(), nullptr); } +inline int SetTensorNd(miopenTensorDescriptor_t t, + std::vector& len, + miopenDataType_t data_type = miopenFloat) +{ + return miopenSetTensorDescriptorV2(t, data_type, len.size(), len.data(), nullptr); +} + inline int SetTensorNd(miopenTensorDescriptor_t t, std::vector& len, std::vector& strides, diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 0f2c2a5cb0..6b205fc99e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -69,6 +69,7 @@ * @defgroup groupnorm * @defgroup cat * @defgroup SGD + * @defgroup getitem * */ @@ -5922,6 +5923,8 @@ typedef enum 3, /*!< the operation is getting the minimum value and index of the reduced elements */ MIOPEN_REDUCE_EXTREME_MAX = 4, /*!< the operation is getting the maximum value and index of the reduced elements */ + MIOPEN_REDUCE_CALCULATION_SUM = + 5, /*!< the operation is multiplying the values of the reduced elements */ } miopenReduceExtremeOp_t; // ReduceExtreme APIs @@ -7538,6 +7541,72 @@ miopenTransformersAdamWWithOutput(miopenHandle_t handle, // CLOSEOUT SGD DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// GetItem APIs +/** @addtogroup getitem + 
* + * @{ + */ +/*! @brief Helper function to query the minimum workspace size required by the getitem call + * + * @param [in] handle MIOpen Handle + * @param [in] indexCount Number of input tensor indexs + * @param [in] indexDescs Tensor descriptor of input tensor indexs + * @param [out] sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetGetitemWorkspaceSize(miopenHandle_t handle, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + size_t* sizeInBytes); + +/*! @brief Execute a getitem backward layer + * + * Backward of getitem for tensor indexing, slicing, masking. + * + * @param [in] handle MIOpen handle + * @param [in] workspace Address of the allocated workspace data + * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data + * @param [in] dyDesc Tensor descriptor of input tensor dy + * @param [in] dy Source data tensor dy + * @param [in] indexCount Number of input tensor indexs + * @param [in] indexDescs Tensor descriptor of input tensor indexs(All indexs same + * size) + * @param [in] indexs Source data tensor indexs + * @param [in] dxDesc Tensor descriptor of output tensor dx + * @param [out] dx Data tensor dx(It must be initialized to 0) + * @param [in] errorDesc Tensor descriptor of output tensor error + * @param [out] error Data tensor error(It must be initialized to 0) + * @param [in] dimCount Number of dimensions + * @param [in] dims Dimensions + * @param [in] sliceCount Number of slices + * @param [in] slices Slices + * @param [in] offset Offset of output tensor dx + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + const void* const* indexs, + const miopenTensorDescriptor_t dxDesc, + void* dx, 
+ const miopenTensorDescriptor_t errorDesc, + void* error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset); + +/** @} */ +// CLOSEOUT GETITEM DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14e7c954b1..ea6bbc77f0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -126,6 +126,7 @@ set( MIOpen_Source fusion.cpp fusion/problem_description.cpp generic_search.cpp + getitem_api.cpp graphapi/convolution.cpp graphapi/engine.cpp graphapi/enginecfg.cpp @@ -145,6 +146,7 @@ set( MIOpen_Source groupnorm/problem_description.cpp handle_api.cpp invoker_cache.cpp + getitem/problem_description.cpp kernel_build_params.cpp kernel_warnings.cpp layernorm_api.cpp @@ -278,6 +280,7 @@ set( MIOpen_Source solver/conv_ocl_dir2Dfwd_fused.cpp solver/conv_winoRxS_fused.cpp solver/groupnorm/forward_groupnorm.cpp + solver/getitem/backward_getitem.cpp solver/layernorm/backward_t5layernorm.cpp solver/layernorm/forward_addlayernorm.cpp solver/layernorm/forward_layernorm.cpp @@ -438,6 +441,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv_sizes.inc kernels/float_types.h kernels/gpr_alloc.inc + kernels/hip_atomic.hpp kernels/hip_f8_impl.hpp kernels/hip_float8.hpp kernels/inst_wrappers.inc @@ -449,6 +453,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/neuron.inc kernels/rocm_version.inc kernels/stride_array.hpp + kernels/tensor_view.hpp kernels/utilities.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc @@ -487,6 +492,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenConvDirBatchNormActiv.cl kernels/MIOpenConvDirGenFwd.cl kernels/MIOpenGroupNorm.cpp + 
kernels/MIOpenGetitem.cpp kernels/MIOpenLayerNorm.cpp kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl @@ -616,6 +622,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN addlayernorm.cpp cat.cpp groupnorm.cpp + getitem.cpp kernel_cache.cpp layernorm.cpp lrn.cpp diff --git a/src/getitem.cpp b/src/getitem.cpp new file mode 100644 index 0000000000..c3b1b0c3bc --- /dev/null +++ b/src/getitem.cpp @@ -0,0 +1,106 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +std::size_t GetGetitemWorkspaceSize(Handle& handle, + uint32_t indexCount, + const TensorDescriptor* const* indexDescs) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = getitem::ProblemDescription{indexCount, indexDescs}; + + const auto algo = AlgorithmName{"GetitemBackward"}; + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem, true); + + return pair_size_vector.empty() ? static_cast(0) : pair_size_vector.front().second; +} + +miopenStatus_t GetitemBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& dyDesc, + ConstData_t dy, + uint32_t indexCount, + const TensorDescriptor* const* indexDescs, + ConstData_t* indexs, + const TensorDescriptor& dxDesc, + Data_t dx, + const TensorDescriptor& errorDesc, + Data_t error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset) +{ + const auto problem = getitem::ProblemDescription{dyDesc, + indexCount, + indexDescs, + dxDesc, + errorDesc, + dimCount, + dims, + sliceCount, + slices, + offset}; + + const auto invoke_params = getitem::GetitemInvokeParams{workspace, + workspaceSizeInBytes, + dyDesc, + dy, + indexCount, + indexDescs, + indexs, + dxDesc, + dx, + errorDesc, + error, + dimCount, + dims, + sliceCount, + slices, + offset}; + + const auto algo = AlgorithmName{"GetitemBackward"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/getitem/problem_description.cpp b/src/getitem/problem_description.cpp new file mode 100644 index 0000000000..b8b32109d6 --- /dev/null +++ 
b/src/getitem/problem_description.cpp @@ -0,0 +1,69 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +#include + +namespace miopen { + +namespace getitem { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const +{ + auto dy_dims = dyDesc.GetLengths(); + auto input_dtype = dyDesc.GetType(); + auto error_dtype = errorDesc.GetType(); + + auto input_size = + std::accumulate(dy_dims.begin(), dy_dims.end(), 1ULL, std::multiplies()); + + std::ostringstream ss; + + ss << "getitembwd"; + ss << "input_size" << input_size; + ss << "input_dtype" << input_dtype; + ss << "error_dtype" << error_dtype; + ss << "indexCount" << indexCount; + + for(int i = 0; i < indexCount; ++i) + { + if(i == 0) + ss << "indexs_size"; + const auto& index_dims = (*indexDescs)[i].GetLengths(); + auto index_size = std::accumulate( + index_dims.begin(), index_dims.begin(), 1ULL, std::multiplies()); + ss << index_size << "_"; + } + + return NetworkConfig{ss.str()}; +} + +} // namespace getitem + +} // namespace miopen diff --git a/src/getitem_api.cpp b/src/getitem_api.cpp new file mode 100644 index 0000000000..094f44620f --- /dev/null +++ b/src/getitem_api.cpp @@ -0,0 +1,206 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include + +static void LogCmdGetitem(const miopenTensorDescriptor_t dyDesc, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + const miopenTensorDescriptor_t dxDesc, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset, + bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(dyDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "getitemfp16"; + } + else if(dtype == miopenFloat) + { + ss << "getitemfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "getitemf16"; + } + + std::string dy_s; + auto dy_dims = miopen::deref(dyDesc).GetLengths(); + for(int i = 0; i < dy_dims.size(); i++) + { + dy_s += std::to_string(dy_dims[i]); + if(i != dy_dims.size() - 2) + dy_s += ","; + } + ss << " -doutput " << dy_s; + + for(int i = 0; i < indexCount; i++) + { + std::string index_s; + auto index_dims = miopen::deref(indexDescs[i]).GetLengths(); + for(int j = 0; j < index_dims.size(); j++) + { + index_s += std::to_string(index_dims[j]); + if(j != index_dims.size() - 2) + index_s += ","; + } + ss << " -index" << i + 1 << " " << index_s; + } + + std::string dx_s; + auto dx_dims = miopen::deref(dxDesc).GetLengths(); + + for(int i = 0; i < dx_dims.size(); i++) + { + dx_s += std::to_string(dx_dims[i]); + if(i != dx_dims.size() - 2) + dx_s += ","; 
+ } + + ss << " -dx " << dx_s; + + std::string dims_s; + for(int i = 0; i < dimCount; i++) + { + dims_s += std::to_string(dims[i]); + if(i != dimCount - 2) + dims_s += ","; + } + ss << " -dims" << dims_s; + + std::string slices_s; + for(int i = 0; i < sliceCount; i++) + { + slices_s += std::to_string(slices[i]); + if(i != sliceCount - 2) + slices_s += ","; + } + ss << " -slice" << slices_s; + + ss << " -offset" << offset; + ss << " -F " << ((is_fwd) ? "1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenGetGetitemWorkspaceSize(miopenHandle_t handle, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + size_t* sizeInBytes) +{ + MIOPEN_LOG_FUNCTION(handle, indexCount, indexDescs); + + return miopen::try_([&] { + std::vector indexDescsCast; + std::transform(indexDescs, + indexDescs + indexCount, + std::back_inserter(indexDescsCast), + [](const auto& indexDesc) { return &miopen::deref(indexDesc); }); + miopen::deref(sizeInBytes) = miopen::GetGetitemWorkspaceSize( + miopen::deref(handle), indexCount, indexDescsCast.data()); + }); +}; + +extern "C" miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + const void* const* indexs, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t errorDesc, + void* error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + dyDesc, + dy, + indexCount, + indexDescs, + indexs, + dxDesc, + dx, + errorDesc, + error, + dimCount, + dims, + sliceCount, + slices, + offset); + + LogCmdGetitem( + dyDesc, indexCount, indexDescs, dxDesc, dimCount, dims, sliceCount, slices, offset, true); + return miopen::try_([&] { + std::vector indexsCast; + 
std::vector indexDescsCast; + std::transform(indexDescs, + indexDescs + indexCount, + std::back_inserter(indexDescsCast), + [](const auto& indexDesc) { return &miopen::deref(indexDesc); }); + std::transform(indexs, + indexs + indexCount, + std::back_inserter(indexsCast), + [](const void* index) { return DataCast(index); }); + + miopen::GetitemBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(dyDesc), + DataCast(dy), + indexCount, + indexDescsCast.data(), + indexsCast.data(), + miopen::deref(dxDesc), + DataCast(dx), + miopen::deref(errorDesc), + DataCast(error), + dimCount, + dims, + sliceCount, + slices, + offset); + }); +} diff --git a/src/include/miopen/find_solution.hpp b/src/include/miopen/find_solution.hpp index ff6a2a1ca1..30735cd38a 100644 --- a/src/include/miopen/find_solution.hpp +++ b/src/include/miopen/find_solution.hpp @@ -395,25 +395,25 @@ struct SolverContainer } template - std::vector> - GetWorkspaceSizes(const Context& ctx, - const Problem& problem, - std::size_t limit = std::numeric_limits::max()) const + std::vector> GetWorkspaceSizes( + const Context& ctx, const Problem& problem, const bool simple_primitive = false) const { std::vector> res; const auto find_only = GetEnvFindOnlySolver(); - std::size_t count = 0; miopen::each_args( [&](auto solver) { - if(count >= limit) - return; - if(find_only && (std::find(find_only->begin(), find_only->end(), Id{solver.SolverDbId()}) == find_only->end())) { // Do nothing (and keep silence for the sake of Tuna), just skip. } - else if(!solver.MayNeedWorkspace()) + // The following optimization is required to avoid checks + // for solvers that have slow IsApplicable() and do not + // require workspace (like MLIR convolutions). 
However we + // do not want to use it for simple primitives, for example, + // the ones that ExecutePrimitive() which uses the first applicable + // solver: + else if(!simple_primitive && !solver.MayNeedWorkspace()) { MIOPEN_LOG_I2(solver.SolverDbId() << ": Skipped (no workspace required)"); } @@ -429,7 +429,6 @@ struct SolverContainer } else { - ++count; auto sz = solver.GetWorkspaceSize(ctx, problem); res.push_back(std::make_pair(solver.SolverDbId(), sz)); MIOPEN_LOG_I2(solver.SolverDbId() << ": " << sz); diff --git a/src/include/miopen/getitem.hpp b/src/include/miopen/getitem.hpp new file mode 100644 index 0000000000..191b1dba97 --- /dev/null +++ b/src/include/miopen/getitem.hpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_GETITEM_HPP_ +#define MIOPEN_GETITEM_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT std::size_t GetGetitemWorkspaceSize( + Handle& handle, uint32_t indexCount, const TensorDescriptor* const* indexDescs); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t GetitemBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& dyDesc, + ConstData_t dy, + uint32_t indexCount, + const TensorDescriptor* const* indexDescs, + ConstData_t* indexs, + const TensorDescriptor& dxDesc, + Data_t dx, + const TensorDescriptor& errorDesc, + Data_t error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset); + +} // namespace miopen +#endif // _MIOPEN_GETITEM_HPP_ diff --git a/src/include/miopen/getitem/invoke_params.hpp b/src/include/miopen/getitem/invoke_params.hpp new file mode 100644 index 0000000000..e663482271 --- /dev/null +++ b/src/include/miopen/getitem/invoke_params.hpp @@ -0,0 +1,97 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { +namespace getitem { + +struct GetitemInvokeParams : public miopen::InvokeParams +{ + + GetitemInvokeParams(Data_t workspace_, + std::size_t workspace_size_, + const TensorDescriptor& dyDesc_, + ConstData_t dy_, + uint32_t indexCount_, + const TensorDescriptor* const* indexDescs_, + ConstData_t* indexs_, + const TensorDescriptor& dxDesc_, + Data_t dx_, + const TensorDescriptor& errorDesc_, + Data_t error_, + uint32_t dimCount_, + const int32_t* dims_, + uint32_t sliceCount_, + const int32_t* slices_, + uint32_t offset_) + : workspace(workspace_), + workspace_size(workspace_size_), + dyDesc(dyDesc_), + dy(dy_), + indexCount(indexCount_), + indexDescs(indexDescs_), + indexs(indexs_), + dxDesc(dxDesc_), + dx(dx_), + errorDesc(errorDesc_), + error(error_), + dimCount(dimCount_), + dims(dims_), + sliceCount(sliceCount_), + slices(slices_), + offset(offset_) + { + } + + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + const TensorDescriptor dyDesc{}; + ConstData_t dy = nullptr; + uint32_t indexCount = 0; + const TensorDescriptor* const* indexDescs = nullptr; + ConstData_t* indexs = nullptr; + const TensorDescriptor dxDesc{}; + Data_t dx = nullptr; + const TensorDescriptor errorDesc{}; + Data_t error = nullptr; + + uint32_t dimCount = 0; + const int32_t* dims = nullptr; + uint32_t sliceCount = 0; + const int32_t* slices = 
nullptr; + uint32_t offset = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace getitem + +} // namespace miopen diff --git a/src/include/miopen/getitem/problem_description.hpp b/src/include/miopen/getitem/problem_description.hpp new file mode 100644 index 0000000000..fed4e78d22 --- /dev/null +++ b/src/include/miopen/getitem/problem_description.hpp @@ -0,0 +1,181 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace getitem { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& dyDesc_, + uint32_t indexCount_, + const TensorDescriptor* const* indexDescs_, + const TensorDescriptor& dxDesc_, + const TensorDescriptor& errorDesc_, + uint32_t dimCount_, + const int32_t* dims_, + uint32_t sliceCount_, + const int32_t* slices_, + uint32_t offset_) + : dyDesc(dyDesc_), + indexCount(indexCount_), + indexDescs(indexDescs_), + dxDesc(dxDesc_), + errorDesc(errorDesc_), + dimCount(dimCount_), + dims(dims_), + sliceCount(sliceCount_), + slices(slices_), + offset(offset_) + { + IsValidIndexsLength(); + IsValidIndexs(); + IsValidDims(); + IsValidSlices(); + } + + ProblemDescription(const int32_t indexCount_, const TensorDescriptor* const* indexDescs_) + : indexCount(indexCount_), indexDescs(indexDescs_) + { + IsValidIndexsLength(); + IsValidIndexs(); + } + + const TensorDescriptor& GetDYDesc() const { return dyDesc; } + int32_t GetIndexCount() const { return indexCount; } + const TensorDescriptor& GetIndexDesc(int i) const + { + if(i >= indexCount) + { + MIOPEN_THROW(miopenStatusInternalError, "Item: Invalid tensor index."); + } + return (*indexDescs)[i]; + } + const TensorDescriptor& GetDXDesc() const { return dxDesc; } + const TensorDescriptor& GetErrorDesc() const { return errorDesc; } + int32_t GetDimCount() const { return dimCount; } + int32_t GetDim(int i) const + { + if(i >= indexCount) + { + MIOPEN_THROW(miopenStatusInternalError, "Item: Invalid dim index."); + } + return dims[i]; + } + int32_t GetSliceCount() const { return sliceCount; } + int32_t GetSlice(int i) const + { + if(i >= sliceCount) + { + MIOPEN_THROW(miopenStatusInternalError, "Item: Invalid slice index."); + } + return slices[i]; + } + int32_t GetOffset() const { 
return offset; } + + bool IsValidIndexsLength() const + { + if(indexCount > 0) + { + auto firstlength = (*indexDescs)[0]; + for(int32_t i = 1; i < indexCount; ++i) + { + if(firstlength != (*indexDescs)[i]) + MIOPEN_THROW(miopenStatusBadParm, + "Getitem: Indexs dimension lengths do not match."); + } + } + return true; + } + + bool IsValidIndexs() const + { + if(indexCount > 0) + { + if(indexDescs == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Getitem: indexDesc is nullptr."); + } + return true; + } + + bool IsValidDims() const + { + if(dimCount > 0) + + if(dims == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Getitem: dims is nullptr."); + return true; + } + + bool IsValidSlices() const + { + if(sliceCount > 0) + { + if(slices == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Getitem: slices is nullptr."); + } + return true; + } + + bool IsSameType() const + { + if(dyDesc.GetType() != dxDesc.GetType()) + { + return false; + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor dyDesc{}; + uint32_t indexCount = 0; + const TensorDescriptor* const* indexDescs = nullptr; + TensorDescriptor dxDesc{}; + TensorDescriptor errorDesc{}; + + uint32_t dimCount = 0; + const int32_t* dims = nullptr; + uint32_t sliceCount = 0; + const int32_t* slices = nullptr; + uint32_t offset = 0; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace getitem + +} // namespace miopen diff --git a/src/include/miopen/getitem/solvers.hpp b/src/include/miopen/getitem/solvers.hpp new file mode 100644 index 0000000000..f2edcbe437 --- /dev/null +++ b/src/include/miopen/getitem/solvers.hpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace getitem { + +using ItemSolver = NonTunableSolverBase; + +struct GetitemBackward final : ItemSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::getitem::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::getitem::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::getitem::ProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + +} // namespace getitem + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/mlo_internal.hpp b/src/include/miopen/mlo_internal.hpp index 43d2e1a067..6252df3087 100644 --- a/src/include/miopen/mlo_internal.hpp +++ b/src/include/miopen/mlo_internal.hpp @@ -119,6 +119,8 @@ inline int AlignUp(int val, unsigned step) return static_cast(((static_cast(val) + step - 1) / step) * step); } +inline size_t AlignUp(size_t val, size_t step) { return (val + step - 1) / step * step; } + namespace miopen { struct TensorDescriptor; diff --git a/src/include/miopen/reduce/problem_description.hpp b/src/include/miopen/reduce/problem_description.hpp index 03001a155b..b48bd3b3ce 100644 --- a/src/include/miopen/reduce/problem_description.hpp +++ b/src/include/miopen/reduce/problem_description.hpp @@ -200,7 +200,8 @@ struct ProblemDescription : ProblemDescriptionBase TensorDescriptor indiceDesc; int32_t dim; - miopenReduceExtremeOp_t reduceExtremeOp; + + miopenReduceExtremeOp_t reduceExtremeOp = MIOPEN_REDUCE_CALCULATION_SUM; NetworkConfig MakeForwardNetworkConfig() const; }; diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp 
index 6ddd83bcef..81c15f6bea 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -58,7 +58,8 @@ enum class Primitive Cat, Mha, Softmax, - Adam + Adam, + Item }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp new file mode 100644 index 0000000000..9f7430ba8a --- /dev/null +++ b/src/include/miopen/tensor_view_utils.hpp @@ -0,0 +1,80 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_ +#define MIOPEN_TENSOR_VIEW_UTIL_HPP_ + +#include +#include "../../kernels/tensor_view.hpp" + +namespace miopen { + +template +inline tensor_view_t get_inner_expanded_tv(const TensorDescriptor Desc) +{ + auto dims = Desc.GetLengths(); + auto strides = Desc.GetStrides(); + + tensor_view_t tensor_view; + for(size_t i = 0; i < N; ++i) + { + if(i < dims.size()) + { + tensor_view.stride[i] = strides[i]; + tensor_view.size[i] = dims[i]; + } + else + { + tensor_view.stride[i] = (i == 0 ? 1 : strides[i - 1]); + tensor_view.size[i] = 1; + } + } + return tensor_view; +} + +template +inline void slice_tv(tensor_view_t& tensor_view, int32_t sliceCount, const int32_t* slices) +{ + for(int32_t i = 0; i < sliceCount; i++) + { + int32_t dim = slices[4 * i + 0]; + int32_t start = slices[4 * i + 1]; + int32_t end = slices[4 * i + 2]; + int32_t step = slices[4 * i + 3]; + + if(end > static_cast(tensor_view.size[dim])) + end = tensor_view.size[dim]; + + auto len = end - start; + + tensor_view.size[dim] = (len + step - 1) / step; + tensor_view.stride[dim] *= step; + } +} + +} // namespace miopen + +#endif // MIOPEN_TENSOR_REORDER_UTIL_HPP_ diff --git a/src/kernels/MIOpenGetitem.cpp b/src/kernels/MIOpenGetitem.cpp new file mode 100644 index 0000000000..4daba996c8 --- /dev/null +++ b/src/kernels/MIOpenGetitem.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "hip_atomic.hpp" +#include "miopen_cstdint.hpp" +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void getitembuildindices(const IDX* __restrict__ index, + IDX* __restrict__ element_index, + E* __restrict__ error, + int32_t index_dim, + int32_t indexCount, + int32_t dim_size, + tensor_view_t<5> index_tv, + int32_t dim_offset, + int32_t dim_info_offset) +{ + const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> ncdhw(index_tv, gid); + + if(ncdhw.layout[0] >= index_tv.size[0]) + return; + + uint64_t idx = index_tv.get_tensor_view_idx(ncdhw); + IDX getitem_index = index[idx]; + + if(getitem_index >= 0 && getitem_index < dim_size) + { + element_index[(gid * indexCount) + dim_offset] = getitem_index; + } + else if(getitem_index >= -dim_size && getitem_index < 0) + { + element_index[(gid * indexCount) + dim_offset] = getitem_index + dim_size; + } + else + { + error[dim_offset] = -1; + } + + if(gid == 0) + { + element_index[dim_info_offset + dim_offset] = index_dim; + } +} + +template +__device__ void getitembwd(const TI* __restrict__ dy, + IDX* __restrict__ element_index, + TO* __restrict__ dx, + int32_t start_dim, + int32_t indexCount, + tensor_view_t<5> dy_tv, + tensor_view_t<5> dx_tv, + int32_t dim_info_offset, + int32_t offset) +{ + const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> ncdhw(dy_tv, gid); + + if(ncdhw.layout[0] >= dy_tv.size[0]) + return; + + tensor_layout_t<5> idx = ncdhw; + + if(indexCount > 0) + { + int32_t dim_cursor = ncdhw.layout[start_dim]; + int32_t i = start_dim; + int32_t j = 0; + + for(; i < start_dim + indexCount; ++i, ++j) + { + uint64_t dim_idx = static_cast(element_index[dim_info_offset + j]); + idx.layout[dim_idx] = + static_cast(element_index[(dim_cursor * indexCount) + j]); + } + 
+ i = element_index[dim_info_offset + indexCount - 1] + 1; + dim_cursor = start_dim + 1; + for(; i < 5; ++i, ++dim_cursor) + { + idx.layout[i] = ncdhw.layout[dim_cursor]; + } + } + + idx.layout[0] += offset; + ncdhw.layout[0] += offset; + + atomic_add_g(&dx[dx_tv.get_tensor_view_idx(idx)], dy[dy_tv.get_tensor_view_idx(ncdhw)]); +} + +extern "C" __global__ void GetItemBuildIndices(const INDEX_TYPE* __restrict__ index, + INDEX_TYPE* __restrict__ element_index, + ERROR_TYPE* __restrict__ error, + int32_t index_dim, + int32_t indexCount, + int32_t dim_size, + tensor_view_t<5> index_tv, + int32_t dim_offset, + int32_t dim_info_offset) +{ + // instantiate the kernel + getitembuildindices(index, + element_index, + error, + index_dim, + indexCount, + dim_size, + index_tv, + dim_offset, + dim_info_offset); +} + +extern "C" __global__ void GetitemBwd(const INPUT_TYPE* __restrict__ dy, + INDEX_TYPE* __restrict__ element_index, + OUTPUT_TYPE* __restrict__ dx, + int32_t start_dim, + int32_t indexCount, + tensor_view_t<5> dy_tv, + tensor_view_t<5> dx_tv, + int32_t dim_info_offset, + int32_t offset) +{ + // instantiate the kernel + getitembwd( + dy, element_index, dx, start_dim, indexCount, dy_tv, dx_tv, dim_info_offset, offset); +} diff --git a/src/kernels/MIOpenGroupNorm.cpp b/src/kernels/MIOpenGroupNorm.cpp index 54d70d323b..1ddf58d232 100644 --- a/src/kernels/MIOpenGroupNorm.cpp +++ b/src/kernels/MIOpenGroupNorm.cpp @@ -30,17 +30,18 @@ #include "float_types.h" -extern "C" __global__ void GroupNormFwdContiguous(const FLOAT* __restrict__ x, - FLOAT* __restrict__ y, - const FLOAT* __restrict__ weight, - const FLOAT* __restrict__ bias, - FLOAT_ACCUM* __restrict__ mean, - FLOAT_ACCUM* __restrict__ rstd, - float eps, - uint64_t num_groups, - uint64_t num_channels, - uint64_t numel_per_channel, - bool mode) +template +__device__ void groupnormfwdcontiguous(const TI* __restrict__ x, + const TI* __restrict__ weight, + const TI* __restrict__ bias, + TO* __restrict__ y, + TO* 
__restrict__ mean, + TO* __restrict__ rstd, + float eps, + uint64_t num_groups, + uint64_t num_channels, + uint64_t numel_per_channel, + bool mode) { /* * Each group works on a single channel. @@ -98,9 +99,9 @@ extern "C" __global__ void GroupNormFwdContiguous(const FLOAT* __restrict__ x, if(lid == 0) { if(mean) - mean[gid] = pmean; + mean[gid] = CVT_ACCUM2FLOAT(pmean); if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -119,3 +120,20 @@ extern "C" __global__ void GroupNormFwdContiguous(const FLOAT* __restrict__ x, y[idx] = CVT_ACCUM2FLOAT(val); } } + +extern "C" __global__ void GroupNormFwdContiguous(const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ weight, + const INPUT_TYPE* __restrict__ bias, + OUTPUT_TYPE* __restrict__ y, + OUTPUT_TYPE* __restrict__ mean, + OUTPUT_TYPE* __restrict__ rstd, + float eps, + uint64_t num_groups, + uint64_t num_channels, + uint64_t numel_per_channel, + bool mode) +{ + // instantiate the kernel + groupnormfwdcontiguous( + x, weight, bias, y, mean, rstd, eps, num_groups, num_channels, numel_per_channel, mode); +} diff --git a/src/kernels/MIOpenLayerNorm.cpp b/src/kernels/MIOpenLayerNorm.cpp index 21880d4b66..9a5e736f94 100644 --- a/src/kernels/MIOpenLayerNorm.cpp +++ b/src/kernels/MIOpenLayerNorm.cpp @@ -40,7 +40,7 @@ __device__ void layernormfwdcontiguous(const TI* __restrict__ x, TO* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { /* * Each group works on a single channel. 
@@ -96,9 +96,9 @@ __device__ void layernormfwdcontiguous(const TI* __restrict__ x, if(lid == 0) { if(mean) - mean[gid] = pmean; + mean[gid] = CVT_ACCUM2FLOAT(pmean); if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -129,7 +129,7 @@ __device__ void addlayernormfwdcontiguous(const TI* __restrict__ x, TO* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { const uint64_t gid = blockIdx.x; const uint64_t lid = threadIdx.x; @@ -168,9 +168,9 @@ __device__ void addlayernormfwdcontiguous(const TI* __restrict__ x, if(lid == 0) { if(mean) - mean[gid] = pmean; + mean[gid] = CVT_ACCUM2FLOAT(pmean); if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -199,7 +199,7 @@ __device__ void t5layernormfwdcontiguous(const TI* __restrict__ x, TO* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { const uint64_t gid = blockIdx.x; const uint64_t lid = threadIdx.x; @@ -232,7 +232,7 @@ __device__ void t5layernormfwdcontiguous(const TI* __restrict__ x, if(lid == 0) { if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -257,7 +257,7 @@ __device__ void t5layernormbwdcontiguous(const TI* __restrict__ dy, const TI* __restrict__ rstd, TO* __restrict__ dx, uint64_t inner_size, - bool mode) + int32_t mode) { const uint64_t gid = blockIdx.x; const uint64_t lid = threadIdx.x; @@ -401,7 +401,7 @@ extern "C" __global__ void LayernormFwdContiguous(const INPUT_TYPE* __restrict__ OUTPUT_TYPE* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel layernormfwdcontiguous( @@ -417,7 +417,7 @@ extern "C" __global__ void AddLayernormFwdContiguous(const INPUT_TYPE* __restric OUTPUT_TYPE* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel addlayernormfwdcontiguous( @@ -430,7 +430,7 @@ extern "C" __global__ void 
T5LayernormFwdContiguous(const INPUT_TYPE* __restrict OUTPUT_TYPE* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel t5layernormfwdcontiguous(x, weight, y, rstd, eps, inner_size, mode); @@ -442,7 +442,7 @@ extern "C" __global__ void T5LayernormBwdContiguous(const INPUT_TYPE* __restrict const INPUT_TYPE* __restrict__ rstd, OUTPUT_TYPE* __restrict__ dx, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel t5layernormbwdcontiguous(dy, x, weight, rstd, dx, inner_size, mode); diff --git a/src/kernels/hip_atomic.hpp b/src/kernels/hip_atomic.hpp new file mode 100644 index 0000000000..aad6b0a63e --- /dev/null +++ b/src/kernels/hip_atomic.hpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +__device__ static inline __half __ushort_as___half(ushort x) +{ + static_assert(sizeof(ushort) == sizeof(__half), ""); + + __half tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; +} + +__device__ static inline ushort ____half_as_ushort(__half x) +{ + static_assert(sizeof(ushort) == sizeof(__half), ""); + + ushort tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; +} + +__device__ inline void atomic_add_g(ushort* addr, const float val) +{ + size_t offset = reinterpret_cast(addr) & 0x2; + bool is_32_align = offset; + uint32_t* addr_as_uint32_t = + reinterpret_cast(reinterpret_cast(addr) - offset); + uint32_t current = *addr_as_uint32_t; + + uint32_t expected; + + do + { + expected = current; + ushort current_ushort = is_32_align ? current >> 16 : current & 0xffff; + + float next_float = __uint_as_float(static_cast(current_ushort) << 16) + val; + ushort next_ushort = static_cast(__float_as_uint(next_float) >> 16); + uint32_t next = is_32_align ? (current & 0xffff) | (next_ushort << 16) + : (current & 0xffff0000) | next_ushort; + + current = atomicCAS(addr_as_uint32_t, expected, next); + } while(current != expected); +} + +__device__ inline void atomic_add_g(__half* addr, const __half val) +{ + size_t offset = reinterpret_cast(addr) & 0x2; + bool is_32_align = offset; + uint32_t* addr_as_uint32_t = + reinterpret_cast(reinterpret_cast(addr) - offset); + uint32_t current = *addr_as_uint32_t; + + uint32_t expected; + + do + { + expected = current; + ushort current_ushort = is_32_align ? current >> 16 : current & 0xffff; + + ushort next_ushort = ____half_as_ushort(__ushort_as___half(current_ushort) + val); + uint32_t next = is_32_align ? 
(current & 0xffff) | (next_ushort << 16) + : (current & 0xffff0000) | next_ushort; + + current = atomicCAS(addr_as_uint32_t, expected, next); + } while(current != expected); +} + +__device__ inline void atomic_add_g(float* addr, const float val) { atomicAdd(addr, val); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp new file mode 100644 index 0000000000..d35bfd93fc --- /dev/null +++ b/src/kernels/tensor_view.hpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_TENSOR_VIEW_HPP +#define GUARD_TENSOR_VIEW_HPP + +template +struct tensor_layout_t; + +template +struct tensor_view_t +{ + // Get index in tensor view at tensor layout + constexpr uint64_t get_tensor_view_idx(const tensor_layout_t& tensor_layout) + { + static_assert(N > 0); + uint64_t idx = 0; + for(auto i = 0; i < N; ++i) + { + idx += stride[i] * tensor_layout.layout[i]; + } + return idx; + } + uint64_t stride[N]; + uint64_t size[N]; +}; + +template +struct tensor_layout_t +{ + // Make tensor layout at index using tensor view + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t idx) + { + static_assert(N > 0); + uint64_t temp = idx; + if constexpr(N == 1) + { + layout[0] = idx; + } + else + { + for(auto i = N - 1; i > 1; --i) + { + layout[i] = temp % tensor_view.size[i]; + temp = temp / tensor_view.size[i]; + } + layout[1] = temp % tensor_view.size[1]; + layout[0] = temp / tensor_view.size[1]; + } + } + + uint64_t layout[N]; +}; + +#endif // GUARD_TENSOR_VIEW_HPP diff --git a/src/reduce/problem_description.cpp b/src/reduce/problem_description.cpp index ac73d16a02..c50ca4f755 100644 --- a/src/reduce/problem_description.cpp +++ b/src/reduce/problem_description.cpp @@ -38,7 +38,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const auto xlength = xDesc.GetLengths(); std::vector outputlength; if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || - (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) || + (reduceExtremeOp == MIOPEN_REDUCE_CALCULATION_SUM)) outputlength = yDesc.GetLengths(); else outputlength = indiceDesc.GetLengths(); diff --git a/src/solver.cpp b/src/solver.cpp index e468d38d0a..6b451ca498 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -662,6 +663,7 @@ inline 
SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Cat, cat::CatForward{}.SolverDbId()); Register(registry, ++id, Primitive::Adam, adam::Adam{}.SolverDbId()); + Register(registry, ++id, Primitive::Item, getitem::GetitemBackward{}.SolverDbId()); Register(registry, ++id, Primitive::Adam, adam::TransformersAdamW{}.SolverDbId()); diff --git a/src/solver/getitem/backward_getitem.cpp b/src/solver/getitem/backward_getitem.cpp new file mode 100644 index 0000000000..ab44832b8b --- /dev/null +++ b/src/solver/getitem/backward_getitem.cpp @@ -0,0 +1,283 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace getitem { + +bool IsLargeIndex(const miopen::getitem::ProblemDescription& problem) +{ + auto dy_dims = problem.GetDYDesc().GetLengths(); + auto dx_dims = problem.GetDXDesc().GetLengths(); + + for(int32_t i = 0; i < problem.GetDimCount(); i++) + { + if(dy_dims[problem.GetDim(i)] / dx_dims[problem.GetDim(i)] > 400) + return false; + } + + return true; +} + +bool GetitemBackward::IsApplicable(const ExecutionContext& /*context*/, + const miopen::getitem::ProblemDescription& problem) const +{ + if(!problem.IsSameType()) + return false; + if(!IsLargeIndex(problem)) + return false; + return true; +} + +ConvSolution GetitemBackward::GetSolution(const ExecutionContext& /*context*/, + const miopen::getitem::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& dtype = problem.GetDYDesc().GetType(); + const auto& input_dtype = miopen::GetDataType(problem.GetDYDesc().GetType()); + const auto& index_dtype = miopen::GetDataType(problem.GetIndexDesc(0).GetType()); + const auto& error_dtype = miopen::GetDataType(problem.GetErrorDesc().GetType()); + const auto& output_dtype = miopen::GetDataType(problem.GetDXDesc().GetType()); + const auto& dy_dims = problem.GetDYDesc().GetLengths(); + const auto& indexCount = problem.GetIndexCount(); + + auto dy_numel = + std::accumulate(dy_dims.begin(), dy_dims.end(), 1ULL, std::multiplies()); + + for(int32_t i = 0; i < indexCount; i++) + { + const auto& index_dims = problem.GetIndexDesc(i).GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(index_numel, xlocalsize); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t 
zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenGetitem.cpp"; + kernel.kernel_name = "GetItemBuildIndices"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"INDEX_TYPE", index_dtype}, + {"ERROR_TYPE", error_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(dy_numel, xlocalsize); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenGetitem.cpp"; + kernel.kernel_name = "GetitemBwd"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"INDEX_TYPE", index_dtype}, + {"ERROR_TYPE", error_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + const auto& start_dim = params.dims[0]; + const auto& dx_dims = params.dxDesc.GetLengths(); + + const auto& dims = params.dims; + const auto& dimCount = params.dimCount; + + std::vector output_dims(dimCount); + for(int32_t i = 0; i < dimCount; i++) + { + output_dims[i] = static_cast(dx_dims[dims[i]]); + } + + const auto& indexCount = params.indexCount; + const auto& index_dims = params.indexDescs[0]->GetLengths(); + const auto& sliceCount = params.sliceCount; + const auto& slices = params.slices; + auto dim_info_offset = + indexCount > 0 ? 
indexCount * static_cast(index_dims[0]) : 0; + + auto dy_tv = get_inner_expanded_tv<5>(params.dyDesc); + auto dx_tv = get_inner_expanded_tv<5>(params.dxDesc); + + slice_tv<5>(dx_tv, sliceCount, slices); + + auto elapsed = 0.f; + HipEventPtr start; + HipEventPtr stop; + bool reset_profiling_state = false; + + for(int32_t i = 0; i < indexCount; i++) + { + decltype(auto) build_index_kernel = handle_.Run(kernels[i]); + + const auto& index_dim = dims[i]; + const auto& dim_size = output_dims[i]; + auto index_tv = get_inner_expanded_tv<5>(*params.indexDescs[i]); + const auto& dim_offset = i; + + if((i == 0) && handle_.IsProfilingEnabled()) + { + handle_.EnableProfiling(false); + reset_profiling_state = true; + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + build_index_kernel(params.indexs[i], + params.workspace, + params.error, + index_dim, + indexCount, + dim_size, + index_tv, + dim_offset, + dim_info_offset); + } + + if((indexCount == 0) && handle_.IsProfilingEnabled()) + { + handle_.EnableProfiling(false); + reset_profiling_state = true; + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + decltype(auto) kernel = handle_.Run(kernels[indexCount]); + + kernel(params.dy, + params.workspace, + params.dx, + start_dim, + indexCount, + dy_tv, + dx_tv, + dim_info_offset, + params.offset); + + if(reset_profiling_state) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.EnableProfiling(true); + }; + }; + }; + + return result; +} + +std::size_t +GetitemBackward::GetWorkspaceSize(const ExecutionContext& /*context*/, + const miopen::getitem::ProblemDescription& problem) const +{ + const auto& 
indexCount = problem.GetIndexCount(); + if(indexCount > 0) + { + const auto& index_dims = problem.GetIndexDesc(0).GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + return (indexCount * index_numel + problem.GetIndexCount()) * + get_data_size(problem.GetIndexDesc(0).GetType()); + } + + return 0; +} + +} // namespace getitem + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/groupnorm/forward_groupnorm.cpp b/src/solver/groupnorm/forward_groupnorm.cpp index e4018d16ab..11f66e2f83 100644 --- a/src/solver/groupnorm/forward_groupnorm.cpp +++ b/src/solver/groupnorm/forward_groupnorm.cpp @@ -75,8 +75,10 @@ GroupNormForward::GetSolution(const ExecutionContext& context, auto result = ConvSolution{miopenStatusSuccess}; { - auto dtype = problem.GetXDesc().GetType(); - auto dims = problem.GetXDesc().GetLengths(); + auto dtype = problem.GetXDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto dims = problem.GetXDesc().GetLengths(); size_t num_groups = problem.GetNumGroups(); size_t outer_size = dims[0] * num_groups; @@ -98,6 +100,8 @@ GroupNormForward::GetSolution(const ExecutionContext& context, {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, {"LOCAL_SIZE", LOCAL_SIZE}, }; @@ -125,9 +129,9 @@ GroupNormForward::GetSolution(const ExecutionContext& context, size_t num_channels = dims[1]; kernel(params.x, - params.y, params.weight, params.bias, + params.y, params.mean, params.rstd, params.epsilon, diff --git a/src/solver/layernorm/backward_t5layernorm.cpp b/src/solver/layernorm/backward_t5layernorm.cpp index c62a756b77..cf984f2e77 100644 --- a/src/solver/layernorm/backward_t5layernorm.cpp +++ b/src/solver/layernorm/backward_t5layernorm.cpp @@ -278,7 +278,7 @@ T5LayernormBackward::GetSolution(const ExecutionContext& context, params.rstd, params.dx, inner_size, - static_cast(params.mode % 2)); + static_cast(params.mode)); weight_parallel_kernel(params.dy, params.x, @@ -333,7 +333,7 @@ T5LayernormBackward::GetSolution(const ExecutionContext& context, params.rstd, params.dx, inner_size, - static_cast(params.mode % 2)); + static_cast(params.mode)); weight_kernel(params.dy, params.x, params.rstd, params.dw, outer_size, inner_size); diff --git a/src/solver/layernorm/forward_addlayernorm.cpp b/src/solver/layernorm/forward_addlayernorm.cpp index ba366b318d..98c2c7ca55 100644 --- a/src/solver/layernorm/forward_addlayernorm.cpp +++ b/src/solver/layernorm/forward_addlayernorm.cpp @@ -138,7 +138,7 @@ AddLayernormForward::GetSolution(const ExecutionContext& context, params.rstd, params.epsilon, inner_size, - static_cast(params.mode % 2)); + static_cast(params.mode)); }; }; diff --git a/src/solver/layernorm/forward_layernorm.cpp b/src/solver/layernorm/forward_layernorm.cpp index ffbe479f1f..81e5641836 100644 --- a/src/solver/layernorm/forward_layernorm.cpp +++ b/src/solver/layernorm/forward_layernorm.cpp @@ -137,7 +137,7 @@ LayernormForward::GetSolution(const ExecutionContext& context, params.rstd, params.epsilon, inner_size, - static_cast(params.mode)); + static_cast(params.mode)); }; }; diff --git a/src/solver/reduce/forward_argmax.cpp b/src/solver/reduce/forward_argmax.cpp index 
4a44887ea4..9c2f1203cd 100644 --- a/src/solver/reduce/forward_argmax.cpp +++ b/src/solver/reduce/forward_argmax.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t ArgmaxForward::XGridSize(std::vector indicedims) const { - auto indice_numel = + size_t indice_numel = std::accumulate(indicedims.begin(), indicedims.end(), 1ULL, std::multiplies()); return AlignUp(indice_numel, LOCAL_SIZE); } diff --git a/src/solver/reduce/forward_argmin.cpp b/src/solver/reduce/forward_argmin.cpp index c0b3d15aa0..51471c5466 100644 --- a/src/solver/reduce/forward_argmin.cpp +++ b/src/solver/reduce/forward_argmin.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t ArgminForward::XGridSize(std::vector indicedims) const { - auto indice_numel = + size_t indice_numel = std::accumulate(indicedims.begin(), indicedims.end(), 1ULL, std::multiplies()); return AlignUp(indice_numel, LOCAL_SIZE); } diff --git a/src/solver/reduce/forward_max.cpp b/src/solver/reduce/forward_max.cpp index 9537c300cf..a759d9bcfa 100644 --- a/src/solver/reduce/forward_max.cpp +++ b/src/solver/reduce/forward_max.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t MaxForward::XGridSize(std::vector ydims) const { - auto output_numel = + size_t output_numel = std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); return AlignUp(output_numel, LOCAL_SIZE); } diff --git a/src/solver/reduce/forward_min.cpp b/src/solver/reduce/forward_min.cpp index f7aae43779..abb0c4b5bd 100644 --- a/src/solver/reduce/forward_min.cpp +++ b/src/solver/reduce/forward_min.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t MinForward::XGridSize(std::vector ydims) const { - auto output_numel = + size_t output_numel = std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); return AlignUp(output_numel, LOCAL_SIZE); } diff --git a/src/sum.cpp b/src/sum.cpp index 00caefa1a9..ddfb21917a 100644 --- a/src/sum.cpp +++ b/src/sum.cpp @@ -47,9 +47,9 @@ std::size_t GetSumWorkspaceSize(Handle& handle, const auto algo = AlgorithmName{"SumForward"}; 
const auto solvers = solver::SolverContainer{}; - auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem, true); - return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; + return pair_size_vector.empty() ? static_cast(0) : pair_size_vector.front().second; } miopenStatus_t SumForward(Handle& handle, diff --git a/src/t5layernorm.cpp b/src/t5layernorm.cpp index 680270c4b0..5978fdd677 100644 --- a/src/t5layernorm.cpp +++ b/src/t5layernorm.cpp @@ -86,9 +86,9 @@ std::size_t GetT5LayerNormBackwardWorkspaceSize(Handle& handle, const auto algo = AlgorithmName{"T5LayerNormBackward"}; const auto solvers = solver::SolverContainer{}; - auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem, true); - return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; + return pair_size_vector.empty() ? static_cast(0) : pair_size_vector.front().second; } miopenStatus_t T5LayerNormBackward(Handle& handle, diff --git a/test/gtest/addlayernorm.cpp b/test/gtest/addlayernorm.cpp index 015f427f90..ae39096b7a 100644 --- a/test/gtest/addlayernorm.cpp +++ b/test/gtest/addlayernorm.cpp @@ -59,8 +59,8 @@ using namespace addlayernorm; TEST_P(GPU_AddLayerNorm_FP32, AddLayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -73,8 +73,8 @@ TEST_P(GPU_AddLayerNorm_FP32, AddLayerNormTestFw) TEST_P(GPU_AddLayerNorm_FP16, AddLayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ 
-87,8 +87,8 @@ TEST_P(GPU_AddLayerNorm_FP16, AddLayerNormTestFw) TEST_P(GPU_AddLayerNorm_BFP16, AddLayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/addlayernorm.hpp b/test/gtest/addlayernorm.hpp index 0be011e683..da65ca93c6 100644 --- a/test/gtest/addlayernorm.hpp +++ b/test/gtest/addlayernorm.hpp @@ -78,8 +78,10 @@ void cpu_addlayernorm_forward(tensor input, ref_rstd[o] = static_cast(rstd_v); ford(inner_size)([&](int32_t i) { - float weight_v = mode ? static_cast(weight[i]) : 1; - float bias_v = mode ? static_cast(bias[i]) : 0; + float weight_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? 1 : static_cast(weight[i]); + float bias_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? 0 : static_cast(bias[i]); ref_output[o * inner_size + i] = static_cast((static_cast(input[o * inner_size + i]) + static_cast(input2[o * inner_size + i]) - mean_v) * diff --git a/test/gtest/cat.cpp b/test/gtest/cat.cpp index 7b394093f5..6bc405e2f0 100644 --- a/test/gtest/cat.cpp +++ b/test/gtest/cat.cpp @@ -52,7 +52,8 @@ using namespace cat; TEST_P(CatTestFloat, CatTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); diff --git a/test/gtest/getitem.cpp b/test/gtest/getitem.cpp new file mode 100644 index 0000000000..6db0c25fff --- /dev/null +++ b/test/gtest/getitem.cpp @@ -0,0 +1,110 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "getitem.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace getitem { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GetitemBwdTestFloat : GetitemBwdTest +{ +}; + +struct GetitemBwdTestHalf : GetitemBwdTest +{ +}; + +struct GetitemBwdTestBFloat16 : GetitemBwdTest +{ +}; + +} // namespace getitem +using namespace getitem; + +TEST_P(GetitemBwdTestFloat, GetitemBwdTest) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GetitemBwdTestHalf, GetitemBwdTest) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GetitemBwdTestBFloat16, GetitemBwdTest) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(GetitemTestSet, + GetitemBwdTestFloat, + testing::ValuesIn(GetitemTestConfigs())); +INSTANTIATE_TEST_SUITE_P(GetitemTestSet, + GetitemBwdTestHalf, + testing::ValuesIn(GetitemTestConfigs())); +INSTANTIATE_TEST_SUITE_P(GetitemTestSet, + GetitemBwdTestBFloat16, + testing::ValuesIn(GetitemTestConfigs())); diff --git a/test/gtest/getitem.hpp b/test/gtest/getitem.hpp new file mode 100644 index 0000000000..264f002985 --- /dev/null +++ b/test/gtest/getitem.hpp @@ -0,0 +1,407 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "../driver/tensor_driver.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include + +template +void cpu_getitem_backward(tensor dy, + uint32_t indexCount, + std::vector> indexs, + tensor& ref_dx, + tensor& ref_error, + uint32_t dimCount, + int32_t* dims, + uint32_t sliceCount, + int32_t* slices, + uint32_t offset) +{ + auto dy_dims = dy.desc.GetLengths(); + auto dy_numel = std::accumulate(dy_dims.begin(), dy_dims.end(), 1L, std::multiplies()); + auto dx_dims = ref_dx.desc.GetLengths(); + auto index_dims = indexs[0].desc.GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + auto element_index = std::vector(indexCount * index_numel + indexCount); + + std::vector output_dims; + for(int32_t i = 0; i < dimCount; i++) + { + output_dims.push_back(dx_dims[dims[i]]); + } + + auto dim_info_offset = indexCount > 0 ? 
indexCount * index_dims[0] : 0; + auto start_dim = dims[0]; + + auto dy_tv = miopen::get_inner_expanded_tv<5>(dy.desc); + auto ref_dx_tv = miopen::get_inner_expanded_tv<5>(ref_dx.desc); + miopen::slice_tv<5>(ref_dx_tv, sliceCount, slices); + + // Get element index form indexs + for(int j = 0; j < indexCount; j++) + { + auto index_dim = dims[j]; + auto dim_size = output_dims[j]; + + par_ford(index_numel)([&](int32_t o) { + int32_t getitem_index = indexs[j][o]; + + if(getitem_index >= 0 && getitem_index < dim_size) + { + element_index[(o * indexCount) + j] = getitem_index; + } + else if(getitem_index >= -dim_size && getitem_index < 0) + { + element_index[(o * indexCount) + j] = getitem_index + dim_size; + } + else + { + ref_error[j] = -1; + } + + if(o == 0) + { + element_index[dim_info_offset + j] = index_dim; + } + }); + } + + // GetItem + par_ford(dy_numel)([&](int32_t o) { + tensor_layout_t<5> ncdhw(dy_tv, o); + tensor_layout_t<5> idx(ncdhw); + + if(indexCount > 0) + { + size_t dim_cursor = ncdhw.layout[start_dim]; + size_t i = start_dim; + size_t j = 0; + + for(; i < start_dim + indexCount; ++i, ++j) + { + size_t dim_idx = element_index[dim_info_offset + j]; + idx.layout[dim_idx] = element_index[(dim_cursor * indexCount) + j]; + } + + i = element_index[dim_info_offset + indexCount - 1] + 1; + dim_cursor = start_dim + 1; + for(; i < 5; ++i, ++dim_cursor) + { + idx.layout[i] = ncdhw.layout[dim_cursor]; + } + } + + ref_dx[ref_dx_tv.get_tensor_view_idx(idx)] += dy[dy_tv.get_tensor_view_idx(ncdhw)]; + }); +} + +struct GetitemTestCase +{ + std::vector dy; + std::vector> indexs; + std::vector dx; + std::vector dims; + std::vector> slices; + uint32_t offset; + + friend std::ostream& operator<<(std::ostream& os, const GetitemTestCase& tc) + { + + os << " dy:"; + auto dy_s = tc.dy; + os << dy_s[0]; + for(int32_t i = 1; i < dy_s.size(); i++) + { + os << "x" << dy_s[i]; + } + + os << " indexs:"; + for(int32_t i = 0; i < tc.indexs.size(); i++) + { + auto index_s = 
tc.indexs[i]; + if(i != 0) + os << ","; + os << index_s[0]; + for(int32_t j = 1; j < index_s.size(); j++) + { + os << "index" << index_s[j]; + } + } + + os << " dx:"; + auto dx_s = tc.dx; + os << dx_s[0]; + for(int32_t i = 1; i < dx_s.size(); i++) + { + os << "x" << dx_s[i]; + } + + os << " dims:"; + auto dims_s = tc.dims; + os << dims_s[0]; + for(int32_t i = 1; i < dims_s.size(); i++) + { + os << "," << dims_s[i]; + } + + os << " slices:"; + for(int32_t i = 0; i < tc.slices.size(); i++) + { + auto slice_s = tc.slices[i]; + if(i != 0) + os << ","; + os << slice_s[0]; + for(int32_t j = 1; j < slice_s.size(); j++) + { + os << "slice" << slice_s[j]; + } + } + + os << " offset:" << tc.offset; + + return os; + } + + std::vector GetDy() { return dy; } + + std::vector> GetIndexs() { return indexs; } + + std::vector GetDx() { return dx; } + + std::vector GetDims() { return dims; } + + std::vector> GetSlices() { return slices; } +}; + +std::vector GetitemTestConfigs() +{ // dy indexs dx dims slices offset + // clang-format off + return { + { {128, 128}, {{128}}, {128, 128}, {0}, {}, 0}, //llama2 + { {16, 4}, {{16}}, {3234, 4}, {0}, {}, 0}, //ssdlite + { {149, 128}, {{1490}}, {1490, 1128}, {0}, {}, 0}, //llama2_7b + { {10, 128}, {{10}}, {160, 128}, {0}, {}, 0}, + { {4260, 4}, {{4300}}, {4300, 4}, {0}, {}, 0}, //fasterrcnn + { {4260}, {{4300}}, {4300}, {0}, {}, 0} //maskrcnn + }; + // clang-format on +} + +template +struct GetitemBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + getitem_config = GetParam(); + auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; + + dims = getitem_config.GetDims(); + slices = getitem_config.GetSlices(); + offset = getitem_config.offset; + + for(auto slice : slices) + { + for(int32_t i = 0; i < 4; i++) + { + slices_flat.push_back(slice[i]); + } + } + + auto dy_dim = getitem_config.GetDy(); + auto indexs_dim = getitem_config.GetIndexs(); + auto dx_dim = getitem_config.GetDx(); + std::vector error_dim; + error_dim.push_back(indexs_dim.size()); + + dy = tensor{dy_dim}.generate(gen_value); + + auto output_dims = std::vector{}; + for(auto dim : dims) + { + output_dims.push_back(static_cast(dx_dim[dim])); + } + + for(int32_t i = 0; i < indexs_dim.size(); i++) + { + auto index = tensor{indexs_dim[i]}; + auto index_dims = index.desc.GetLengths(); + auto index_numel = std::accumulate( + index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + for(int32_t j = 0; j < index_numel; j++) + { + index[j] = prng::gen_0_to_B(output_dims[i]); + } + indexs.push_back(index); + } + + dx = tensor{dx_dim}; + std::fill(dx.begin(), dx.end(), static_cast(0)); + + error = tensor{error_dim}; + std::fill(error.begin(), error.end(), static_cast(0)); + + ref_error = tensor{error_dim}; + std::fill(ref_error.begin(), ref_error.end(), static_cast(0)); + + ref_dx = tensor{dx_dim}; + std::fill(ref_dx.begin(), ref_dx.end(), static_cast(0)); + + std::vector indexDescs; + + std::transform(indexs.begin(), + indexs.end(), + std::back_inserter(indexDescs), + [](auto& index) { return &index.desc; }); + + std::vector workspace_dims; + ws_sizeInBytes = + miopen::GetGetitemWorkspaceSize(handle, indexDescs.size(), indexDescs.data()); + + workspace_dims.push_back(ws_sizeInBytes / sizeof(T)); + if(ws_sizeInBytes != 0) + { + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), static_cast(0)); + workspace_dev = handle.Write(workspace.data); + } + + dy_dev = handle.Write(dy.data); + + std::transform(indexs.begin(), + indexs.end(), + 
std::back_inserter(indexs_dev), + [&](auto& index) { return handle.Write(index.data); }); + + dx_dev = handle.Write(dx.data); + error_dev = handle.Write(error.data); + } + void RunTest() + { + auto&& handle = get_handle(); + cpu_getitem_backward(dy, + indexs.size(), + indexs, + ref_dx, + ref_error, + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + std::vector indexDescs; + std::vector indexData; + + std::transform(indexs.begin(), + indexs.end(), + std::back_inserter(indexDescs), + [](auto& index) { return &index.desc; }); + std::transform(indexs_dev.begin(), + indexs_dev.end(), + std::back_inserter(indexData), + [](auto& index_dev) { return index_dev.get(); }); + + miopenStatus_t status = miopen::GetitemBackward(handle, + workspace_dev.get(), + ws_sizeInBytes, + dy.desc, + dy_dev.get(), + indexDescs.size(), + indexDescs.data(), + indexData.data(), + dx.desc, + dx_dev.get(), + error.desc, + error_dev.get(), + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + EXPECT_EQ(status, miopenStatusSuccess); + + dx.data = handle.Read(dx_dev, dx.data.size()); + error.data = handle.Read(error_dev, error.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + // In the case of layernorm, there is a cumulative sum operation, and in the case of + // floating point operation, the result value can change if the order of the summed values + // is changed. So apply a threshold that is 10 times larger than other operations. + auto threshold = std::is_same::value ? 1.5e-4 : 8.2e-1; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + // If there is an atomic operation on the GPU kernel, a large error occurs depending on the + // calculation order, so it is multiplied by 10 times. 
+ if(std::is_same::value) + threshold *= 8000.0; + + auto error_dx = miopen::rms_range(ref_dx, dx); + EXPECT_TRUE(miopen::range_distance(ref_dx) == miopen::range_distance(dx)); + EXPECT_TRUE(error_dx < threshold * 10) << "Error dx beyond tolerance Error:" << error_dx + << ", Thresholdx10: " << threshold * 10; + + auto error_error = miopen::rms_range(ref_error, error); + EXPECT_TRUE(miopen::range_distance(ref_error) == miopen::range_distance(error)); + EXPECT_TRUE(std::abs(static_cast(error_error)) == 0.0f) << "Error dx is not equal"; + } + GetitemTestCase getitem_config; + + tensor dy; + std::vector> indexs; + tensor dx; + tensor workspace; + tensor error; + + tensor ref_dx; + tensor ref_error; + + miopen::Allocator::ManageDataPtr dy_dev; + std::vector indexs_dev; + miopen::Allocator::ManageDataPtr dx_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + miopen::Allocator::ManageDataPtr error_dev; + + size_t ws_sizeInBytes; + + std::vector dims; + std::vector> slices; + std::vector slices_flat; + uint32_t offset; +}; diff --git a/test/gtest/groupnorm.cpp b/test/gtest/groupnorm.cpp index 62a83e4081..d40f826c0f 100644 --- a/test/gtest/groupnorm.cpp +++ b/test/gtest/groupnorm.cpp @@ -52,12 +52,8 @@ using namespace groupnorm; TEST_P(GroupNormTestFloat, GroupNormTestFw) { - const auto& handle = get_handle(); - - if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || - miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || - miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); diff --git a/test/gtest/layernorm.cpp b/test/gtest/layernorm.cpp index e780ad1648..e1e669c6d3 100644 --- a/test/gtest/layernorm.cpp +++ b/test/gtest/layernorm.cpp @@ -30,8 +30,6 @@ MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) -namespace env = 
miopen::env; - namespace layernorm { std::string GetFloatArg() @@ -66,7 +64,8 @@ TEST_P(LayerNormTestFloat, LayerNormTestFw) if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + (!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))) { RunTest(); Verify(); @@ -79,12 +78,12 @@ TEST_P(LayerNormTestFloat, LayerNormTestFw) TEST_P(LayerNormTestHalf, LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); const auto& handle = get_handle(); if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + (!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))) { RunTest(); Verify(); @@ -97,12 +96,12 @@ TEST_P(LayerNormTestHalf, LayerNormTestFw) TEST_P(LayerNormTestBFloat16, LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); const auto& handle = get_handle(); if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + (!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))) { RunTest(); Verify(); diff --git a/test/gtest/layernorm.hpp b/test/gtest/layernorm.hpp index a50fe031de..cd8813e391 100644 --- a/test/gtest/layernorm.hpp +++ b/test/gtest/layernorm.hpp @@ -76,8 +76,9 @@ void cpu_layernorm_forward(tensor input, ref_rstd[o] = static_cast(rstd_v); ford(inner_size)([&](int32_t i) { - float weight_v = mode ? static_cast(weight[i]) : 1; - float bias_v = mode ? 
static_cast(bias[i]) : 0; + float weight_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1 : static_cast(weight[i]); + float bias_v = (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 0 : static_cast(bias[i]); ref_output[o * inner_size + i] = static_cast( (static_cast(input[o * inner_size + i]) - mean_v) * rstd_v * weight_v + bias_v); diff --git a/test/gtest/reduceextreme.cpp b/test/gtest/reduceextreme.cpp index 1d6637deb0..670ec24e1b 100644 --- a/test/gtest/reduceextreme.cpp +++ b/test/gtest/reduceextreme.cpp @@ -59,7 +59,8 @@ using namespace reduceextreme; TEST_P(ReduceExtremeTestFloat, ReduceExtremeTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -72,7 +73,8 @@ TEST_P(ReduceExtremeTestFloat, ReduceExtremeTestFw) TEST_P(ReduceExtremeTestHalf, ReduceExtremeTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -85,7 +87,8 @@ TEST_P(ReduceExtremeTestHalf, ReduceExtremeTestFw) TEST_P(ReduceExtremeTestBFloat16, ReduceExtremeTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/sum.cpp b/test/gtest/sum.cpp index 066c609dd5..fb2e7aefa8 100644 --- a/test/gtest/sum.cpp +++ b/test/gtest/sum.cpp @@ -53,7 +53,8 @@ using namespace sum; TEST_P(SumTestFloat, SumTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); diff --git a/test/gtest/t5layernorm.cpp b/test/gtest/t5layernorm.cpp index c062f1007e..49b45e88f9 100644 --- 
a/test/gtest/t5layernorm.cpp +++ b/test/gtest/t5layernorm.cpp @@ -71,8 +71,8 @@ using namespace t5layernorm; TEST_P(T5LayerNormTestFloat, T5LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -85,8 +85,8 @@ TEST_P(T5LayerNormTestFloat, T5LayerNormTestFw) TEST_P(T5LayerNormTestHalf, T5LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -99,8 +99,8 @@ TEST_P(T5LayerNormTestHalf, T5LayerNormTestFw) TEST_P(T5LayerNormTestBFloat16, T5LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); @@ -113,8 +113,8 @@ TEST_P(T5LayerNormTestBFloat16, T5LayerNormTestFw) TEST_P(T5LayerNormBwdTestFloat, T5LayerNormBwdTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -127,8 +127,8 @@ TEST_P(T5LayerNormBwdTestFloat, T5LayerNormBwdTestFw) TEST_P(T5LayerNormBwdTestHalf, T5LayerNormBwdTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -141,8 +141,8 @@ TEST_P(T5LayerNormBwdTestHalf, T5LayerNormBwdTestFw) 
TEST_P(T5LayerNormBwdTestBFloat16, T5LayerNormBwdTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/t5layernorm.hpp b/test/gtest/t5layernorm.hpp index 505336a130..b82b543b2a 100644 --- a/test/gtest/t5layernorm.hpp +++ b/test/gtest/t5layernorm.hpp @@ -409,7 +409,7 @@ struct T5LayerNormBwdTest : public ::testing::TestWithParam std::fill(ref_dw.begin(), ref_dw.end(), std::numeric_limits::quiet_NaN()); std::vector workspace_dims; - printf("GetT5LayerNormBackwardWorkspaceSize\n"); + ws_sizeInBytes = miopen::GetT5LayerNormBackwardWorkspaceSize( handle, dy.desc, x.desc, weight.desc, rstd.desc, dx.desc, dw.desc, ln_mode); if(ws_sizeInBytes == static_cast(-1)) @@ -472,7 +472,7 @@ struct T5LayerNormBwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. if(std::is_same::value) - threshold *= 8.0; + threshold *= 80.0; auto error = miopen::rms_range(ref_dx, dx); EXPECT_TRUE(miopen::range_distance(ref_dx) == miopen::range_distance(dx));