diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02bcb88622..5a74c95eaa 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -32,3 +32,4 @@ The MIOpen API library is structured as follows: * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`Argmax<./argmax>` (experimental) + * :doc:`Interpolate <../doxygen/html/group__interpolate>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 224e550fed..a824ecd45d 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -43,6 +43,7 @@ add_executable(MIOpenDriver dm_fusion.cpp dm_gemm.cpp dm_groupnorm.cpp + dm_interpolate.cpp dm_layernorm.cpp dm_lrn.cpp dm_pool.cpp diff --git a/driver/dm_interpolate.cpp b/driver/dm_interpolate.cpp new file mode 100644 index 0000000000..d3959a7415 --- /dev/null +++ b/driver/dm_interpolate.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#include "registry_driver_maker.hpp"
+#include "interpolate_driver.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "interpolate")
+        return new InterpolateDriver<float, float>();
+    if(base_arg == "interpolatefp16")
+        return new InterpolateDriver<float16, float>();
+    if(base_arg == "interpolatebfp16")
+        return new InterpolateDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index 4cfc2b544e..40aa59cfa5 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -151,7 +151,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "pool[fp16], lrn[fp16], "
         "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
         "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
-        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16]\n");
+        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], interpolate[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -176,7 +176,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" &&
        arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
        arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
-       arg != "catfp16" && arg != "catbfp16" && arg != "--version")
+       arg != "catfp16" && arg != "catbfp16" && arg != "interpolate" && arg != "interpolatefp16" &&
+       arg != "interpolatebfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/interpolate_driver.hpp b/driver/interpolate_driver.hpp
new file mode 100644
index 0000000000..69d0bff864
--- /dev/null
+++ b/driver/interpolate_driver.hpp
@@ -0,0 +1,554 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_INTERPOLATE_DRIVER_HPP
+#define GUARD_MIOPEN_INTERPOLATE_DRIVER_HPP
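+
+// Illustrative command lines for this driver (the flags are defined in
+// AddCmdLineArgs() below; give either -S <output size> or -s <scale factors>
+// and leave the other at its -1 default):
+//   MIOpenDriver interpolate     -D 16,256,32,32 -s 2,2 -m 2   (bilinear, 2x upscale)
+//   MIOpenDriver interpolatefp16 -D 16,256,100 -S 512 -m 1     (linear, explicit output size)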
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloInterpolateHost.hpp"
+#include "random.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "util_driver.hpp"
+
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+inline std::vector<int> GetStrides(std::vector<int> lengths, int contiguous)
+{
+    if(contiguous != 0 && contiguous != 1)
+        std::cerr << "Error Tensor Contiguous should be 0 or 1" << std::endl;
+    if(contiguous == 0)
+        std::swap(lengths.front(), lengths.back());
+    std::vector<int> strides(lengths.size());
+    strides.back() = 1;
+    for(int i = lengths.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * lengths[i + 1];
+    if(contiguous == 0)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+template <typename Tgpu, typename Tref>
+class InterpolateDriver : public Driver
+{
+public:
+    InterpolateDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&outputGradDesc);
+        miopenCreateTensorDescriptor(&inputGradDesc);
+        miopenCreateTensorDescriptor(&scaleFactorsDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    template <typename T>
+    std::vector<T> GetTensorFromCmd(const char* param);
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~InterpolateDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(outputGradDesc);
+        miopenDestroyTensorDescriptor(inputGradDesc);
+        miopenDestroyTensorDescriptor(scaleFactorsDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t outputGradDesc;
+    miopenTensorDescriptor_t inputGradDesc;
+    miopenTensorDescriptor_t scaleFactorsDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> out_grad_dev;
+    std::unique_ptr<GPUMem> in_grad_dev;
+    std::unique_ptr<GPUMem> scale_factors_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> out;
+    std::vector<Tref> out_host;
+
+    std::vector<float> scale_factors;
+
+    std::vector<Tgpu> out_grad;
+    std::vector<Tgpu> in_grad;
+    std::vector<Tref> in_grad_host;
+    std::vector<float> workspace;
+
+    std::vector<int> in_len;
+    std::vector<int> size;
+    std::vector<float> config_scale_factors;
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+    size_t ws_sizeInBytes = 0;
+};
+
+template <typename Tgpu, typename Tref>
+int InterpolateDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+template <typename T>
+std::vector<T> InterpolateDriver<Tgpu, Tref>::GetTensorFromCmd(const char* param)
+{
+    std::string lengthsStr = inflags.GetValueStr(param);
+
+    std::vector<T> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        T len =
static_cast(std::stof(sliceStr)); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + T len = static_cast(std::stof(sliceStr)); + + lengths.push_back(len); + + return (lengths); +} + +template +int InterpolateDriver::GetandSetData() +{ + in_len = GetTensorFromCmd("input_dims"); + size = GetTensorFromCmd("size"); + config_scale_factors = GetTensorFromCmd("scale_factors"); + mode = static_cast(inflags.GetValueInt("mode")); + align_corners = static_cast(inflags.GetValueInt("align_corners")); + + if(config_scale_factors[0] == -1 && size[0] == -1) + { + config_scale_factors[0] = 1; + for(int i = 1; i < in_len.size() - 2; i++) + { + config_scale_factors.push_back(1); + } + } + + if(config_scale_factors[0] != -1) + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST) + { + for(int i = 0; i < in_len.size() - 2; i++) + { + scale_factors.push_back(config_scale_factors[i]); + } + } + else + { + for(int i = 0; i < in_len.size() - 2; i++) + { + scale_factors.push_back(config_scale_factors[i]); + } + for(int i = in_len.size() - 2; i < 3; i++) + { + scale_factors.push_back(0); + } + } + } + + auto out_len = std::vector({in_len[0], in_len[1]}); + if(size[0] != -1) + { + for(int i = 0; i < size.size(); i++) + { + if(size[i] == 0) + out_len.push_back(static_cast(ceil(in_len[i + 2] * scale_factors[i]))); + else + { + if(config_scale_factors[0] == -1) + { + scale_factors.push_back(static_cast(size[i]) / in_len[i + 2]); + } + else + { + scale_factors[i] = static_cast(size[i]) / in_len[i + 2]; + } + out_len.push_back(size[i]); + } + } + } + else + { + for(int i = 0; i < in_len.size() - 2; i++) + { + out_len.push_back(static_cast(ceil(in_len[i + 2] * scale_factors[i]))); + scale_factors[i] = static_cast(out_len[i + 2]) / in_len[i + 2]; + } + } + + auto in_strides = GetStrides(in_len, inflags.GetValueInt("contiguous")); + auto output_strides = GetStrides(out_len, 1); + + SetTensorNd(inputDesc, in_len, in_strides, data_type); + SetTensorNd(outputDesc, out_len, output_strides, data_type); + + std::vector scale_length = std::vector({scale_factors.size()}); + SetTensorNd(scaleFactorsDesc, scale_length, miopen_type{}); + + SetTensorNd(outputGradDesc, out_len, output_strides, data_type); + SetTensorNd(inputGradDesc, in_len, in_strides, data_type); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Interpolate (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "16,256,1", + "The dimensional lengths of the input tensor (>=3 and <=5 dimensions): N,C,D,H,W. " + "Example: 16,256,1.", + "string"); + inflags.AddInputFlag("size", + 'S', + "-1", + "Output Spatial Size: D,H,W. " + "Default: -1 - Use scale factors instead", + "string"); + inflags.AddInputFlag("scale_factors", + 's', + "-1", + "Multiplier for spatial size: factor_D,factor_H,factor_W. " + "Default: -1 - Use size instead", + "string"); + inflags.AddInputFlag("mode", + 'm', + "0", + "algorithm used for upsampling: 'nearest' | 'linear' | 'bilinear' | " + "'bicubic' | 'trilinear'. Default: 0 - 'nearest'", + "int"); + inflags.AddInputFlag("align_corners", + 'A', + "0", + "This only has an effect when mode is 'linear', 'bilinear', 'bicubic' or " + "'trilinear'. Default: False", + "int"); + inflags.AddInputFlag("contiguous", + 'c', + "1", + "Is input tensor contiguous? 
(Default=1 for contiguous tensor)", + "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(inputDesc); + size_t out_sz = GetTensorSize(outputDesc); + size_t scale_factors_sz = GetTensorSize(scaleFactorsDesc); + size_t out_grad_sz = GetTensorSize(outputGradDesc); + size_t in_grad_sz = GetTensorSize(inputGradDesc); + + if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + miopenGetInterpolateBackwardWorkspaceSize(GetHandle(), + outputGradDesc, + inputGradDesc, + scaleFactorsDesc, + mode, + align_corners, + &ws_sizeInBytes); + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + } + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + scale_factors_dev = std::unique_ptr(new GPUMem(ctx, scale_factors_sz, sizeof(float))); + out_grad_dev = std::unique_ptr(new GPUMem(ctx, out_grad_sz, sizeof(Tgpu))); + in_grad_dev = std::unique_ptr(new GPUMem(ctx, in_grad_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + in = std::vector(in_sz, static_cast(0)); + out = std::vector(out_sz, static_cast(0)); + out_host = std::vector(out_sz, static_cast(0)); + + out_grad = std::vector(out_grad_sz, static_cast(0)); + in_grad = std::vector(in_grad_sz, static_cast(0)); + in_grad_host = std::vector(in_grad_sz, static_cast(0)); + workspace = std::vector(ws_sizeInBytes / sizeof(float), static_cast(0)); + + int status; + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(static_cast(-5.0f), static_cast(1.0f)); + } + status = in_dev->ToGPU(q, in.data()); + + status |= out_dev->ToGPU(q, out.data()); + + status |= scale_factors_dev->ToGPU(q, scale_factors.data()); + + status |= in_grad_dev->ToGPU(q, in_grad.data()); + + status |= workspace_dev->ToGPU(q, workspace.data()); + + for(int i = 0; i < out_grad_sz; i++) + { + out_grad[i] = prng::gen_A_to_B(static_cast(-10.0), static_cast(10.0)); + } + status |= out_grad_dev->ToGPU(q, out_grad.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenInterpolateForward(GetHandle(), + inputDesc, + in_dev->GetMem(), + outputDesc, + out_dev->GetMem(), + scaleFactorsDesc, + scale_factors_dev->GetMem(), + mode, + align_corners); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward Interpolate Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward Interpolate Elapsed: %f ms\n", kernel_average_time); + } + + out_dev->FromGPU(GetStream(), out.data()); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunForwardCPU() +{ + size_t nelems = out_host.size(); + mlo_interpolate_forward(inputDesc, + outputDesc, + in.data(), + out_host.data(), + nelems, + scale_factors.data(), + align_corners, + mode); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + in_grad_dev->ToGPU(q, in_grad.data()); + workspace_dev->ToGPU(q, workspace.data()); + miopenInterpolateBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputGradDesc, + in_grad_dev->GetMem(), + outputGradDesc, + out_grad_dev->GetMem(), + scaleFactorsDesc, + scale_factors_dev->GetMem(), + mode, + align_corners); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward Interpolate Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward Interpolate Elapsed: %f ms\n", kernel_average_time); + } + + in_grad_dev->FromGPU(GetStream(), in_grad.data()); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunBackwardCPU() +{ + size_t nelems = in_grad_host.size(); + mlo_interpolate_backward(inputGradDesc, + outputGradDesc, + in_grad_host.data(), + out_grad.data(), + nelems, + scale_factors.data(), + align_corners, + mode); + return miopenStatusSuccess; +} + +template +int InterpolateDriver::VerifyForward() +{ + RunForwardCPU(); + auto tolerance = std::numeric_limits::epsilon() * 10; + + auto error = miopen::rms_range(out_host, out); + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Output Forward Interpolate FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Output Forward Interpolate Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::VerifyBackward() +{ + RunBackwardCPU(); + auto tolerance = std::numeric_limits::epsilon() * 10; + auto error = miopen::rms_range(in_grad_host, in_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward Interpolate in Input Grad FAILED: " << error + << " while tolerance: " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward Interpolate Verifies in Input Grad on CPU and GPU " + "(err=%f)\n", + error); + } + + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_INTERPOLATE_DRIVER_HPP diff --git a/driver/mloInterpolateHost.hpp b/driver/mloInterpolateHost.hpp new file mode 100644 index 0000000000..94cf4fdd96 --- /dev/null +++ b/driver/mloInterpolateHost.hpp @@ -0,0 +1,1071 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MLO_INTERPOLATE_H_
+#define MLO_INTERPOLATE_H_
+
+#include "driver.hpp"
+#include <miopen/tensor.hpp>
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <miopen/interpolate/utils.hpp>
+
+inline float compute_linear_scale_factor(float scale_factor,
+                                         long input_size,
+                                         long output_size,
+                                         bool align_corners)
+{
+    if(align_corners)
+    {
+        if(input_size == 1)
+        {
+            return static_cast<float>(output_size);
+        }
+        return static_cast<float>(output_size - 1) / (input_size - 1);
+    }
+    else if(scale_factor == 0)
+    {
+        return static_cast<float>(output_size) / input_size;
+    }
+    else
+    {
+        return static_cast<float>(scale_factor);
+    }
+}
+
+inline float get_src_index(long dest_index, float scale_factor, bool align_corners)
+{
+    if(align_corners)
+    {
+        return dest_index / scale_factor;
+    }
+    else
+    {
+        // map the centre of the destination pixel back into the input grid
+        return (dest_index + 0.5f) / scale_factor - 0.5f;
+    }
+}
+
+inline long linear_back_index(long src, float scale_factor, bool align_corners)
+{
+    return static_cast<long>(std::ceil(get_src_index(src, 1.f / scale_factor, align_corners)));
+}
+
+inline void compute_linear_back_index_from_to(long src,
+                                              long input_size,
+                                              long output_size,
+                                              float scale_factor,
+                                              bool align_corners,
+                                              long* from,
+                                              long* to)
+{
+    if(src - 1 < 1)
+    {
+        *from = 0;
+    }
+    else
+    {
+        *from = linear_back_index(src - 1, scale_factor, align_corners);
+    }
+    if(src + 1 > input_size)
+    {
+        *to = output_size;
+    }
+    else
+    {
+        *to = std::min(output_size, linear_back_index(src + 1, scale_factor, align_corners));
+    }
+}
+
+inline void compute_source_index_and_lambda(long h,
+                                            float scale_factor,
+                                            long Hin,
+                                            long Hout,
+                                            bool align_corners,
+                                            long* hin_index0,
+                                            long* hin_index1,
+                                            float* lambda0,
+                                            float* lambda1)
+{
+    float hin_index_actual = static_cast<float>(
+        std::max(static_cast<float>(0.), get_src_index(h, scale_factor, align_corners)));
+    *hin_index0 = static_cast<long>(hin_index_actual);
+    *hin_index1 = std::min(*hin_index0 + 1, Hin - 1);
+    *lambda1    = hin_index_actual - *hin_index0;
+    *lambda0    = 1.f - *lambda1;
+}
+
+inline float get_back_lambda(long src, long src0, long src1, float lambda0, float lambda1)
+{
+    if(src == src0)
+    {
+        if(src0 == src1)
+        {
+            return 1; // lambda0 + lambda1 = 1
+        }
+        return lambda0;
+    }
+    if(src == src1)
+    {
+        return lambda1;
+    }
+    // This case can happen due to floating point multiplication.
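+    // (rounding can make the recomputed neighbour pair (src0, src1) miss src
+    // entirely, in which case src contributed nothing to this dest index.)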
+ // ex> 7 * (105/9) = 87 or 86.99999995 + return 0; +} + +inline float compute_back_lambda( + long dest, long src, float scale_factor, long Hin, long Hout, bool align_corners) +{ + if(Hin == Hout) + { + return 1; + } + long index0; + long index1; + float lambda0; + float lambda1; + compute_source_index_and_lambda( + dest, scale_factor, Hin, Hout, align_corners, &index0, &index1, &lambda0, &lambda1); + return get_back_lambda(src, index0, index1, lambda0, lambda1); +} + +template +int32_t mlo_interpolate_linear_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<3>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + + long Hin = input_tv.size[2]; + long Hout = output_tv.size[2]; + if(Hin == Hout || Hout == 1) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + scale_factor_h = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + long hin_index0; + long hin_index1; + float lambda1; + float lambda0; + compute_source_index_and_lambda(h, + scale_factor_h, + Hin, + Hout, + align_corners, + &hin_index0, + &hin_index1, + &lambda0, + &lambda1); + + tensor_layout_t<3> input_layout0(n, c, hin_index0); + tensor_layout_t<3> input_layout1(n, c, hin_index1); + + float input0 = input[input_tv.get_tensor_view_idx(input_layout0)]; + float input1 = input[input_tv.get_tensor_view_idx(input_layout1)]; + + output[output_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input0 * lambda0 + input1 * lambda1); + } + + return 0; +} + +template +int32_t mlo_interpolate_linear_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<3>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + + long Hin = input_grad_tv.size[2]; + long Hout = output_grad_tv.size[2]; + + if(Hin == Hout) + { + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + float scale_factor = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + long from, to; + compute_linear_back_index_from_to(h, Hin, Hout, scale_factor, align_corners, &from, &to); + + float output = 0; + for(long i = from; i < to; i++) + { + tensor_layout_t<3> output_layout(n, c, i); + output += + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + 
compute_back_lambda(i, h, scale_factor, Hin, Hout, align_corners); + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast(output); + } + + return 0; +} + +template +int32_t mlo_interpolate_bilinear_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + long w = tensor_layout.layout[3]; + + long Hin = input_tv.size[2]; + long Hout = output_tv.size[2]; + long Win = input_tv.size[3]; + long Wout = output_tv.size[3]; + + if(Hin == Hout && Win == Wout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + long hin_index0 = h; + long hin_index1 = h; + float hlambda0 = 1; + float hlambda1 = 0; + if(Hin != Hout && Hout != 1) + { + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + compute_source_index_and_lambda(h, + scale_factor_h_, + Hin, + Hout, + align_corners, + &hin_index0, + &hin_index1, + &hlambda0, + &hlambda1); + } + + long win_index0 = w; + long win_index1 = w; + float wlambda0 = 1; + float wlambda1 = 0; + if(Win != Wout && Wout != 1) + { + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + compute_source_index_and_lambda(w, + scale_factor_w_, + Win, + Wout, + align_corners, + &win_index0, + &win_index1, + &wlambda0, + &wlambda1); + } + + tensor_layout_t<4> input_layout00(n, c, hin_index0, win_index0); + tensor_layout_t<4> input_layout01(n, c, hin_index0, win_index1); + tensor_layout_t<4> input_layout10(n, c, hin_index1, win_index0); + tensor_layout_t<4> input_layout11(n, c, hin_index1, win_index1); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast( + (static_cast(input[input_tv.get_tensor_view_idx(input_layout00)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout01)]) * wlambda1) * + hlambda0 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout10)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout11)]) * wlambda1) * + hlambda1); + } + + return 0; +} + +template +int32_t mlo_interpolate_bilinear_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + long w = tensor_layout.layout[3]; + + long Hin = input_grad_tv.size[2]; 
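+        // Backward is organised as a gather rather than a scatter: each
+        // input_grad element visits the window of output_grad positions it
+        // contributed to (computed by compute_linear_back_index_from_to) and
+        // accumulates output_grad * h_lambda * w_lambda, so no atomics are needed.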
+ long Hout = output_grad_tv.size[2]; + long Win = input_grad_tv.size[3]; + long Wout = output_grad_tv.size[3]; + + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + long h_from, h_to; + if(Hin == Hout) + { + h_from = h; + h_to = h + 1; + } + else + { + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + } + long w_from, w_to; + if(Win == Wout) + { + w_from = w; + w_to = w + 1; + } + else + { + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + } + + float output = 0; + for(long i = h_from; i < h_to; i++) + { + float h_lambda = compute_back_lambda(i, h, scale_factor_h_, Hin, Hout, align_corners); + if(h_lambda == 0.) + continue; + for(long j = w_from; j < w_to; j++) + { + float w_lambda = + compute_back_lambda(j, w, scale_factor_w_, Win, Wout, align_corners); + + tensor_layout_t<4> output_layout(n, c, i, j); + + output += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + h_lambda * w_lambda; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast(output); + } + + return 0; +} + +template +int32_t mlo_interpolate_trilinear_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long d = tensor_layout.layout[2]; + long h = tensor_layout.layout[3]; + long w = tensor_layout.layout[4]; + + long Din = input_tv.size[2]; + long Dout = output_tv.size[2]; + long Hin = input_tv.size[3]; + long Hout = output_tv.size[3]; + long Win = input_tv.size[4]; + long Wout = output_tv.size[4]; + + if(Hin == Hout && Win == Wout && Din == Dout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + long din_index0 = d; + long din_index1 = d; + float dlambda0 = 1; + float dlambda1 = 0; + if(Din != Dout && Dout != 1) + { + float scale_factor_d = scale_factors[0]; + float scale_factor_d_ = + compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners); + compute_source_index_and_lambda(d, + scale_factor_d_, + Din, + Dout, + align_corners, + &din_index0, + &din_index1, + &dlambda0, + &dlambda1); + } + + long hin_index0 = h; + long hin_index1 = h; + float hlambda0 = 1; + float hlambda1 = 0; + if(Hin != Hout && Hout != 1) + { + float scale_factor_h = scale_factors[1]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + compute_source_index_and_lambda(h, + scale_factor_h_, + Hin, + Hout, + align_corners, + &hin_index0, + &hin_index1, + &hlambda0, + &hlambda1); + } + + long win_index0 = w; + long win_index1 = w; + float wlambda0 = 1; + float wlambda1 = 0; + if(Win != Wout && Wout != 1) + { + float scale_factor_w = scale_factors[2]; + float 
scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + compute_source_index_and_lambda(w, + scale_factor_w_, + Win, + Wout, + align_corners, + &win_index0, + &win_index1, + &wlambda0, + &wlambda1); + } + + tensor_layout_t<5> input_layout000(n, c, din_index0, hin_index0, win_index0); + tensor_layout_t<5> input_layout001(n, c, din_index0, hin_index0, win_index1); + tensor_layout_t<5> input_layout010(n, c, din_index0, hin_index1, win_index0); + tensor_layout_t<5> input_layout011(n, c, din_index0, hin_index1, win_index1); + tensor_layout_t<5> input_layout100(n, c, din_index1, hin_index0, win_index0); + tensor_layout_t<5> input_layout101(n, c, din_index1, hin_index0, win_index1); + tensor_layout_t<5> input_layout110(n, c, din_index1, hin_index1, win_index0); + tensor_layout_t<5> input_layout111(n, c, din_index1, hin_index1, win_index1); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast( + (static_cast(input[input_tv.get_tensor_view_idx(input_layout000)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout001)]) * wlambda1) * + hlambda0 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout010)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout011)]) * wlambda1) * + hlambda1 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout100)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout101)]) * wlambda1) * + dlambda0 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout110)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout111)]) * wlambda1) * + dlambda1); + } + + return 0; +} +template +int32_t mlo_interpolate_trilinear_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long d = tensor_layout.layout[2]; + long h = tensor_layout.layout[3]; + long w = tensor_layout.layout[4]; + + long Din = input_grad_tv.size[2]; + long Dout = output_grad_tv.size[2]; + long Hin = input_grad_tv.size[3]; + long Hout = output_grad_tv.size[3]; + long Win = input_grad_tv.size[4]; + long Wout = output_grad_tv.size[4]; + + float scale_factor_d = scale_factors[0]; + float scale_factor_d_ = + compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners); + + float scale_factor_h = scale_factors[1]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + float scale_factor_w = scale_factors[2]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + long d_from, d_to, h_from, h_to, w_from, w_to; + compute_linear_back_index_from_to( + d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to); + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + + float output = 0; + for(long i = d_from; i < d_to; 
i++) + { + float d_lambda = compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners); + for(long j = h_from; j < h_to; j++) + { + float h_lambda = + compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners); + for(long k = w_from; k < w_to; k++) + { + float w_lambda = + compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners); + tensor_layout_t<5> output_layout(n, c, i, j, k); + + output += output_grad[output_grad_tv.get_tensor_view_idx(output_layout)] * + d_lambda * h_lambda * w_lambda; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = output; + } + + return 0; +} + +inline float compute_scales_value(float scale, long input_size, long output_size) +{ + return (scale == 0.f) ? (static_cast(input_size) / output_size) : (1.0f / scale); +} + +inline long nearest_idx(long output_index, long input_size, long output_size, float scales) +{ + if(output_size == input_size) + { + return output_index; + } + else if(output_size == 2 * input_size) + { + return output_index / 2; + } + else + { + float scale = compute_scales_value(scales, input_size, output_size); + return std::min(static_cast((output_index * scale)), input_size); + } +} + +template +int32_t mlo_nearest_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long d = tensor_layout.layout[2]; + long h = tensor_layout.layout[3]; + long w = tensor_layout.layout[4]; + + long Dout = output_tv.size[2]; + long Hout = output_tv.size[3]; + long Wout = output_tv.size[4]; + long Din = input_tv.size[2]; + long Hin = input_tv.size[3]; + long Win = input_tv.size[4]; + + long x = nearest_idx(d, Din, Dout, scale_factors[0]); + long y = nearest_idx(h, Hin, Hout, scale_factors[1]); + long z = nearest_idx(w, Win, Wout, scale_factors[2]); + + tensor_layout_t<5> input_layout(n, c, x, y, z); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(input_layout)]; + } + + return 0; +} + +inline long nearest_idx_back(long input_index, long input_size, long output_size, float scales) +{ + if(output_size == input_size) + { + return input_index; + } + else if(output_size == 2 * input_size) + { + return input_index * 2; + } + else + { + float scale = compute_scales_value(scales, input_size, output_size); + return std::min(static_cast(std::ceil(input_index / scale)), output_size); + } +} + +template +int32_t mlo_nearest_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long x = 
tensor_layout.layout[2]; + long y = tensor_layout.layout[3]; + long z = tensor_layout.layout[4]; + + long Dout = output_grad_tv.size[2]; + long Hout = output_grad_tv.size[3]; + long Wout = output_grad_tv.size[4]; + long Din = input_grad_tv.size[2]; + long Hin = input_grad_tv.size[3]; + long Win = input_grad_tv.size[4]; + + float scale_factor_d = scale_factors[0]; + float scale_factor_h = scale_factors[1]; + float scale_factor_w = scale_factors[2]; + + long dstart = nearest_idx_back(x, Din, Dout, scale_factor_d); + long dlimit = nearest_idx_back(x + 1, Din, Dout, scale_factor_d); + long hstart = nearest_idx_back(y, Hin, Hout, scale_factor_h); + long hlimit = nearest_idx_back(y + 1, Hin, Hout, scale_factor_h); + long wstart = nearest_idx_back(z, Win, Wout, scale_factor_w); + long wlimit = nearest_idx_back(z + 1, Win, Wout, scale_factor_w); + + float grad = 0.f; + for(long d = dstart; d < dlimit; d++) + { + for(long h = hstart; h < hlimit; h++) + { + for(long w = wstart; w < wlimit; w++) + { + tensor_layout_t<5> output_grad_layout(n, c, d, h, w); + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast(grad); + } + + return 0; +} + +inline float +bicubic_idx(long output_index, long output_size, float scale_factor, bool align_corners) +{ + if(output_size == 1) + { + if(align_corners) + { + return 0; + } + return -0.5f; + } + return get_src_index(output_index, scale_factor, align_corners); +} + +inline float cubic_convolution1(float x, float A) { return ((A + 2) * x - (A + 3)) * x * x + 1; } + +inline float cubic_convolution2(float x, float A) +{ + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +inline void get_cubic_upsampling_coefficients(float coeffs[4], float t) +{ + float A = -0.75f; + + float x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0f, A); + coeffs[1] = cubic_convolution1(x1, A); + + float x2 = 1.0f - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0f, A); +} + +inline float cubic_interp1d(float x0, float x1, float x2, float x3, float t) +{ + float coeffs[4]; + get_cubic_upsampling_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +inline long bound(long p, long max_size) { return std::max(std::min(p, max_size - 1), 0L); } + +template +int32_t mlo_bicubic_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + long w = tensor_layout.layout[3]; + + long Hin = input_tv.size[2]; + long Win = input_tv.size[3]; + long Hout = output_tv.size[2]; + long Wout = output_tv.size[3]; + if(Hin == Hout && Win == Wout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + 
float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners); + long in_y = static_cast(std::floor(real_y)); + float t_y = real_y - in_y; + + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners); + long in_x = static_cast(std::floor(real_x)); + float t_x = real_x - in_x; + + float coefficients[4]; +#pragma unroll + for(int k = 0; k < 4; k++) + { + long y = bound(in_y - 1 + k, Hin); + tensor_layout_t<4> input_layout0(n, c, y, bound(in_x - 1, Win)); + tensor_layout_t<4> input_layout1(n, c, y, bound(in_x, Win)); + tensor_layout_t<4> input_layout2(n, c, y, bound(in_x + 1, Win)); + tensor_layout_t<4> input_layout3(n, c, y, bound(in_x + 2, Win)); + + coefficients[k] = cubic_interp1d( + static_cast(input[input_tv.get_tensor_view_idx(input_layout0)]), + static_cast(input[input_tv.get_tensor_view_idx(input_layout1)]), + static_cast(input[input_tv.get_tensor_view_idx(input_layout2)]), + static_cast(input[input_tv.get_tensor_view_idx(input_layout3)]), + t_x); + } + output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast(cubic_interp1d( + coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y)); + } + + return 0; +} + +template +int32_t mlo_bicubic_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + std::vector workspace; + workspace.resize(nelems, 0); + + uint64_t Hin = input_grad_tv.size[2]; + uint64_t Hout = output_grad_tv.size[2]; + uint64_t Win = input_grad_tv.size[3]; + uint64_t Wout = output_grad_tv.size[3]; + + size_t out_elems = miopen::deref(outputGradDesc).GetElementSize(); + for(uint64_t gid = 0; gid < out_elems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(output_grad_tv, gid); + uint64_t n = tensor_layout.layout[0]; + uint64_t c = tensor_layout.layout[1]; + uint64_t h = tensor_layout.layout[2]; + uint64_t w = tensor_layout.layout[3]; + + if(Hin == Hout && Win == Wout) + { + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners); + int64_t in_y = static_cast(std::floor(real_y)); + float t_y = real_y - static_cast(in_y); + + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners); + int64_t in_x = static_cast(std::floor(real_x)); + float t_x = real_x - static_cast(in_x); + + float y_coeffs[4]; + float x_coeffs[4]; + get_cubic_upsampling_coefficients(y_coeffs, t_y); + get_cubic_upsampling_coefficients(x_coeffs, t_x); + float out_value = + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + for(int i = 0; i < 4; i++) + { + int64_t input_h = bound(in_y - 1 + i, Hin); + for(int j = 0; j < 4; j++) + { + int64_t 
input_w = bound(in_x - 1 + j, Win); + tensor_layout_t<4> in_grad_layout(n, c, input_h, input_w); + + workspace[input_grad_tv.get_tensor_view_idx(in_grad_layout)] += + out_value * y_coeffs[i] * x_coeffs[j]; + } + } + } + + if(!(Hin == Hout && Win == Wout)) + { + for(uint64_t gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid); + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)]); + } + } + + return 0; +} + +template +int32_t mlo_interpolate_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners, + const miopenInterpolateMode_t mode) +{ + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return mlo_nearest_forward(inputDesc, outputDesc, input, output, nelems, scale_factors); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + return mlo_interpolate_linear_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + return mlo_interpolate_bilinear_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + return mlo_interpolate_trilinear_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + return mlo_bicubic_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + + return 0; +} + +template +int32_t mlo_interpolate_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners, + const miopenInterpolateMode_t mode) +{ + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return mlo_nearest_backward( + inputGradDesc, outputGradDesc, input_grad, output_grad, nelems, scale_factors); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + return mlo_interpolate_linear_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + return mlo_interpolate_bilinear_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + return mlo_interpolate_trilinear_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + return mlo_bicubic_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + + return 0; +} + +#endif // MLO_INTERPOLATE_H_ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index e768c7b349..5fce7be78e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -68,6 +68,7 @@ * @defgroup argmax * @defgroup groupnorm * @defgroup cat + * @defgroup interpolate * */ @@ -6582,6 +6583,120 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d // CLOSEOUT BackendAPI DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API + +/*! 
@ingroup interpolate + * @enum miopenInterpolateMode_t + * Modes for Interpolate + */ + +typedef enum +{ + MIOPEN_INTERPOLATE_MODE_NEAREST = 0, + MIOPEN_INTERPOLATE_MODE_LINEAR = 1, + MIOPEN_INTERPOLATE_MODE_BILINEAR = 2, + MIOPEN_INTERPOLATE_MODE_BICUBIC = 3, + MIOPEN_INTERPOLATE_MODE_TRILINEAR = 4, +} miopenInterpolateMode_t; + +// Interpolate APIs +/** @addtogroup interpolate + * + * @{ + */ + +/*! @brief Execute a interpolate forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param scaleFactorsDesc Tensor descriptor for scale factors tensor (input) + * @param scale_factors Data tensor scale factors - multiplier for spatial size (input) + * @param mode Interpolation mode (input) + * @param align_corners If set to True, the input and output tensors are aligned by the + * center points of their corner pixels, preserving the values at the corner pixels. If set to + * False, the input and output tensors are aligned by the corner points of their corner pixels, and + * the interpolation uses edge value padding for out-of-boundary values, making this operation + * independent of input size when scale_factor is kept the same. This only has an effect when mode + * is 'linear', 'bilinear', 'bicubic' or 'trilinear'. (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenInterpolateForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners); + +/*! @brief Helper function to query the minimum workspace size required by the Interpolate Bicubic + * Backward call + * + * @param handle MIOpen Handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param scaleFactorsDesc Tensor descriptor for scale factors tensor (input) + * @param mode Interpolation mode (input) + * @param align_corners If set to True, the input and output tensors are aligned by the + * center points of their corner pixels, preserving the values at the corner pixels. If set to + * False, the input and output tensors are aligned by the corner points of their corner pixels, and + * the interpolation uses edge value padding for out-of-boundary values, making this operation + * independent of input size when scale_factor is kept the same. This only has an effect when mode + * is 'linear', 'bilinear', 'bicubic' or 'trilinear'. (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetInterpolateBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t scaleFactorsDesc, + const miopenInterpolateMode_t mode, + const bool align_corners, + size_t* sizeInBytes); + +/*! 
@brief Execute a interpolate backward layer + * + * @param handle MIOpen handle (input) + * @param workspace Pointer to workspace (input) + * @param workspaceSizeInBytes Size of workspace buffer (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param scaleFactorsDesc Tensor descriptor for scale factors tensor (input) + * @param scale_factors Data tensor scale factors - multiplier for spatial size (input) + * @param mode Interpolation mode (input) + * @param align_corners If set to True, the input and output tensors are aligned by the + * center points of their corner pixels, preserving the values at the corner pixels. If set to + * False, the input and output tensors are aligned by the corner points of their corner pixels, and + * the interpolation uses edge value padding for out-of-boundary values, making this operation + * independent of input size when scale_factor is kept the same. This only has an effect when mode + * is 'linear', 'bilinear', 'bicubic' or 'trilinear'. (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenInterpolateBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners); + +/** @} */ +// CLOSEOUT Interpolate DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9671eed03c..4e7a0d0530 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -132,6 +132,8 @@ set( MIOpen_Source groupnorm/problem_description.cpp handle_api.cpp invoker_cache.cpp + interpolate_api.cpp + interpolate/problem_description.cpp kernel_build_params.cpp kernel_warnings.cpp layernorm_api.cpp @@ -260,6 +262,15 @@ set( MIOpen_Source solver/gemm_bwd.cpp solver/gemm_wrw.cpp solver/groupnorm/forward_groupnorm.cpp + solver/interpolate/bwd_bicubic_interpolate.cpp + solver/interpolate/bwd_bilinear_interpolate.cpp + solver/interpolate/bwd_linear_interpolate.cpp + solver/interpolate/bwd_nearest_interpolate.cpp + solver/interpolate/bwd_trilinear_interpolate.cpp + solver/interpolate/fwd_bicubic_interpolate.cpp + solver/interpolate/fwd_bilinear_interpolate.cpp + solver/interpolate/fwd_linear_interpolate.cpp + solver/interpolate/fwd_nearest_interpolate.cpp solver/layernorm/forward_layernorm.cpp solver/layernorm/forward_layernorm2d_ck.cpp solver/layernorm/forward_layernorm4d_ck.cpp @@ -421,6 +432,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/neuron.inc kernels/rocm_version.inc kernels/stride_array.hpp + kernels/tensor_view.hpp kernels/utilities.inc kernels/workaround_issue_1431.hpp kernels/xform_bidirect_winograd_code.inc @@ -455,6 +467,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenConvDirBatchNormActiv.cl kernels/MIOpenConvDirGenFwd.cl kernels/MIOpenGroupNorm.cpp + kernels/MIOpenInterpolate.cpp kernels/MIOpenLayerNorm.cpp kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl @@ -579,6 +592,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN argmax.cpp cat.cpp 
groupnorm.cpp + interpolate.cpp kernel_cache.cpp layer_norm.cpp lrn.cpp diff --git a/src/include/miopen/interpolate.hpp b/src/include/miopen/interpolate.hpp new file mode 100644 index 0000000000..d79bcaf35a --- /dev/null +++ b/src/include/miopen/interpolate.hpp @@ -0,0 +1,94 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_INTERPOLATE_HPP_ +#define MIOPEN_INTERPOLATE_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +miopenStatus_t InterpolateNearestForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode); + +miopenStatus_t InterpolateLinearCubicForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode, + bool align_corners); + +size_t GetInterpolateBicubicBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& outputGradDesc, + const TensorDescriptor& inputGradDesc, + const TensorDescriptor& scaleFactorsDesc, + miopenInterpolateMode_t mode, + bool align_corners); + +miopenStatus_t InterpolateBicubicBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode, + bool align_corners); + +miopenStatus_t InterpolateNearestBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode); + +miopenStatus_t InterpolateLinearBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t 
mode, + bool align_corners); + +} // namespace miopen +#endif // _MIOPEN_INTERPOLATE_HPP_ diff --git a/src/include/miopen/interpolate/invoke_params.hpp b/src/include/miopen/interpolate/invoke_params.hpp new file mode 100644 index 0000000000..66593cc4ad --- /dev/null +++ b/src/include/miopen/interpolate/invoke_params.hpp @@ -0,0 +1,81 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/miopen.h" +#include "miopen/common.hpp" +#include +#include + +namespace miopen { +namespace interpolate { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* scaleFactorsDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t scale_factors = nullptr; + + miopenInterpolateMode_t mode; + bool align_corners = false; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + + BwdInvokeParams() = default; + + const TensorDescriptor* inputGradDesc = nullptr; + const TensorDescriptor* outputGradDesc = nullptr; + const TensorDescriptor* scaleFactorsDesc = nullptr; + + Data_t input_grad = nullptr; + ConstData_t output_grad = nullptr; + ConstData_t scale_factors = nullptr; + + miopenInterpolateMode_t mode; + bool align_corners = false; + + std::size_t workspaceSizeInBytes = 0; + Data_t workspace = nullptr; + + std::size_t GetWorkspaceSize() const { return workspaceSizeInBytes; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace interpolate +} // namespace miopen diff --git a/src/include/miopen/interpolate/problem_description.hpp b/src/include/miopen/interpolate/problem_description.hpp new file mode 100644 index 0000000000..d5abbfd49b --- /dev/null +++ b/src/include/miopen/interpolate/problem_description.hpp @@ -0,0 +1,280 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/miopen.h" +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace interpolate { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& scaleFactorsDesc_, + const miopenInterpolateMode_t mode_, + const bool align_corners_) + : scaleFactorsDesc(scaleFactorsDesc_), mode(mode_), align_corners(align_corners_) + { + IsValidMode(); + IsValidType(); + } + + const TensorDescriptor& GetScaleFactorsDesc() const { return scaleFactorsDesc; } + miopenInterpolateMode_t GetMode() const { return mode; } + bool GetAlignCorners() const { return align_corners; } + + bool IsValidMode() const + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST && mode != MIOPEN_INTERPOLATE_MODE_LINEAR && + mode != MIOPEN_INTERPOLATE_MODE_BILINEAR && mode != MIOPEN_INTERPOLATE_MODE_TRILINEAR && + mode != MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Invalid mode."); + } + return true; + } + + bool IsValidType() const + { + if(scaleFactorsDesc.GetType() != miopenFloat) + { + std::cout << "scaleFactorsDesc.GetType() = " << scaleFactorsDesc.GetType() + << "miopenFloat type:" << miopenFloat << std::endl; + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Scale factor type should be miopenFloat."); + } + + return true; + } + +protected: + TensorDescriptor scaleFactorsDesc; + miopenInterpolateMode_t mode; + bool align_corners = false; +}; + +struct FwdProblemDescription : ProblemDescription +{ + FwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const TensorDescriptor& scaleFactorsDesc_, + const miopenInterpolateMode_t mode_, + const bool align_corners_) + : ProblemDescription(scaleFactorsDesc_, mode_, align_corners_), + inputDesc(inputDesc_), + outputDesc(outputDesc_) + { + IsValidDims(); + IsValidLength(); + IsSameType(); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + bool IsValidLength() const + { + if(inputDesc.GetSize() < 3 || inputDesc.GetSize() > 5) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input or output tensor size < 3 or > 5 is not valid."); + } + + if(outputDesc.GetSize() != inputDesc.GetSize()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: 
Input and output tensor size do not match."); + } + + if((outputDesc.GetSize() - 2) != scaleFactorsDesc.GetElementSize()) + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Interpolate: Output tensor size and scale factors length do not match."); + } + } + return true; + } + + bool IsValidDims() const + { + if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + if(inputDesc.GetSize() != 3) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Linear mode requires 3D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + if(inputDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Bilinear mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + if(inputDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Bicubic mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + if(inputDesc.GetSize() != 5) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Trilinear mode requires 5D tensors."); + } + } + return true; + } + + bool IsSameType() const + { + if(inputDesc.GetType() != outputDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input and output tensor type do not match."); + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; +}; + +struct BwdProblemDescription : ProblemDescription +{ + BwdProblemDescription(const TensorDescriptor& inputGradDesc_, + const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& scaleFactorsDesc_, + const miopenInterpolateMode_t mode_, + const bool align_corners_) + : ProblemDescription(scaleFactorsDesc_, mode_, align_corners_), + inputGradDesc(inputGradDesc_), + outputGradDesc(outputGradDesc_) + { + IsValidDims(); + IsValidLength(); + IsSameType(); + } + const TensorDescriptor& GetInputGradDesc() const { return inputGradDesc; } + const TensorDescriptor& GetOutputGradDesc() const { return outputGradDesc; } + + bool IsValidLength() const + { + if(inputGradDesc.GetSize() < 3 || inputGradDesc.GetSize() > 5) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Interpolate: Input grad or output grad tensor size < 3 or > 5 is not valid."); + } + + if(outputGradDesc.GetSize() != inputGradDesc.GetSize()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input grad and output grad tensor size do not match."); + } + + if((outputGradDesc.GetSize() - 2) != scaleFactorsDesc.GetElementSize()) + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Tensor size and scale factors length do not match."); + } + } + return true; + } + + bool IsValidDims() const + { + if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + if(inputGradDesc.GetSize() != 3) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Linear mode requires 3D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + if(inputGradDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Bilinear mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + if(inputGradDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Bicubic mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + if(inputGradDesc.GetSize() != 5) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Trilinear mode requires 5D tensors."); + } + } + return true; + } + + bool IsSameType() const 
+ { + if(inputGradDesc.GetType() != outputGradDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input grad and output grad tensor type do not match."); + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputGradDesc; + TensorDescriptor outputGradDesc; +}; + +} // namespace interpolate + +} // namespace miopen diff --git a/src/include/miopen/interpolate/solvers.hpp b/src/include/miopen/interpolate/solvers.hpp new file mode 100644 index 0000000000..9469374385 --- /dev/null +++ b/src/include/miopen/interpolate/solvers.hpp @@ -0,0 +1,214 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <miopen/interpolate/problem_description.hpp> +#include <miopen/solver.hpp> +#include "miopen/kernel_build_params.hpp" + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector<size_t> localsize, + std::vector<size_t> gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace interpolate { + +using InterpolateFwdSolver = + NonTunableSolverBase<ExecutionContext, miopen::interpolate::FwdProblemDescription>; + +using InterpolateBwdSolver = + NonTunableSolverBase<ExecutionContext, miopen::interpolate::BwdProblemDescription>; + +// FORWARD NEAREST +struct InterpolateNearestForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateNearestForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// FORWARD LINEAR +struct InterpolateLinearForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateLinearForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// FORWARD BILINEAR +struct InterpolateBilinearForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateBilinearForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// FORWARD BICUBIC +struct InterpolateBicubicForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateBicubicForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// BACKWARD NEAREST +struct InterpolateNearestBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateNearestBackward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD LINEAR +struct InterpolateLinearBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateLinearBackward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD
BILINEAR +struct InterpolateBilinearBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD TRILINEAR +struct InterpolateTrilinearBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD BICUBIC +struct InterpolateBicubicBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return true; } +}; + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/interpolate/utils.hpp b/src/include/miopen/interpolate/utils.hpp new file mode 100644 index 0000000000..bc22491536 --- /dev/null +++ b/src/include/miopen/interpolate/utils.hpp @@ -0,0 +1,81 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include "../src/kernels/tensor_view.hpp" +#include <miopen/tensor.hpp> + +namespace miopen { + +namespace solver { + +namespace interpolate { + +template <int N> +inline tensor_view_t<N> get_inner_expanded_tv(const TensorDescriptor Desc) +{ + auto dims = Desc.GetLengths(); + auto strides = Desc.GetStrides(); + + tensor_view_t<N> tensor_view; + for(size_t i = 0; i < strides.size(); ++i) + { + tensor_view.stride[i] = strides[i]; + tensor_view.size[i] = dims[i]; + } + for(size_t i = strides.size(); i < N; ++i) + { + tensor_view.stride[i] = tensor_view.stride[i - 1]; + tensor_view.size[i] = 1; + } + return tensor_view; +} + +template <int N> +inline void slice_tv(tensor_view_t<N>& tensor_view, int32_t sliceCount, const int32_t* slices) +{ + for(int32_t i = 0; i < sliceCount; i++) + { + int32_t dim = slices[4 * i + 0]; + int32_t start = slices[4 * i + 1]; + int32_t end = slices[4 * i + 2]; + int32_t step = slices[4 * i + 3]; + + if(end > static_cast<int32_t>(tensor_view.size[dim])) + end = tensor_view.size[dim]; + + auto len = end - start; + + tensor_view.size[dim] = (len + step - 1) / step; + tensor_view.stride[dim] *= step; + } +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index c52dc020ac..c606de1dcf 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -56,7 +56,8 @@ enum class Primitive Reduce, Cat, Mha, - Softmax + Softmax, + Interpolate }; struct MIOPEN_EXPORT Id diff --git a/src/interpolate.cpp b/src/interpolate.cpp new file mode 100644 index 0000000000..0a1f07bd55 --- /dev/null +++ b/src/interpolate.cpp @@ -0,0 +1,243 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t InterpolateNearestForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode) +{ + const auto problem = + interpolate::FwdProblemDescription{inputDesc, outputDesc, scaleFactorsDesc, mode, false}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input = input; + tmp.output = output; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t InterpolateLinearCubicForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + const auto problem = interpolate::FwdProblemDescription{ + inputDesc, outputDesc, scaleFactorsDesc, mode, align_corners}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input = input; + tmp.output = output; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + tmp.align_corners = align_corners; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t InterpolateNearestBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode) +{ + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, mode, false}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::BwdInvokeParams{}; + tmp.inputGradDesc = &inputGradDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input_grad = input_grad; + tmp.output_grad = output_grad; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +size_t GetInterpolateBicubicBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& outputGradDesc, + const TensorDescriptor& inputGradDesc, + const TensorDescriptor& scaleFactorsDesc, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, 
mode, align_corners}; + + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t InterpolateBicubicBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, mode, align_corners}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::BwdInvokeParams{}; + tmp.inputGradDesc = &inputGradDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input_grad = input_grad; + tmp.output_grad = output_grad; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + tmp.align_corners = align_corners; + + tmp.workspace = workspace; + tmp.workspaceSizeInBytes = workspaceSizeInBytes; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t InterpolateLinearBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, mode, align_corners}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::BwdInvokeParams{}; + tmp.inputGradDesc = &inputGradDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input_grad = input_grad; + tmp.output_grad = output_grad; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + tmp.align_corners = align_corners; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/interpolate/problem_description.cpp b/src/interpolate/problem_description.cpp new file mode 100644 index 0000000000..4b6419a78d --- /dev/null +++ b/src/interpolate/problem_description.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include +#include + +namespace miopen { + +namespace interpolate { + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dims = inputDesc.GetLengths(); + auto output_dims = outputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + miopenInterpolateMode_t mode = GetMode(); + bool align_corners = GetAlignCorners(); + + std::ostringstream ss; + ss << "interpolate_fwd"; + ss << "mode" << mode; + ss << "align_corners" << align_corners; + ss << "input_dtype" << input_dtype; + ss << "input_dims" << input_dims; + ss << "output_dims" << output_dims; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + auto input_grad_dims = inputGradDesc.GetLengths(); + auto output_grad_dims = outputGradDesc.GetLengths(); + auto output_dtype = outputGradDesc.GetType(); + miopenInterpolateMode_t mode = GetMode(); + bool align_corners = GetAlignCorners(); + + std::ostringstream ss; + ss << "interpolate_bwd"; + ss << "mode" << mode; + ss << "align_corners" << align_corners; + ss << "output_grad_dtype" << output_dtype; + ss << "output_grad_dims" << output_grad_dims; + ss << "input_grad_dims" << input_grad_dims; + + return NetworkConfig{ss.str()}; +} + +} // namespace interpolate + +} // namespace miopen diff --git a/src/interpolate_api.cpp b/src/interpolate_api.cpp new file mode 100644 index 0000000000..5ff4b012b0 --- /dev/null +++ b/src/interpolate_api.cpp @@ -0,0 +1,215 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include +#include +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdInterpolate(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + bool is_fwd, + const miopenInterpolateMode_t mode) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "interpolatefp16"; + } + else if(dtype == miopenFloat) + { + ss << "interpolate"; + } + else if(dtype == miopenBFloat16) + { + ss << "interpolatebfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, mode); + ss << " -D " << miopen::deref(xDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + ss << " -R " << mode; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenInterpolateForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + scaleFactorsDesc, + scale_factors, + mode, + align_corners); + + LogCmdInterpolate(inputDesc, outputDesc, true, mode); + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return miopen::try_([&] { + miopen::InterpolateNearestForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode); + }); + } + return miopen::try_([&] { + miopen::InterpolateLinearCubicForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode, + align_corners); + }); +} + +extern "C" miopenStatus_t +miopenGetInterpolateBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t scaleFactorsDesc, + const miopenInterpolateMode_t mode, + const bool align_corners, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION( + handle, outputGradDesc, inputGradDesc, scaleFactorsDesc, mode, align_corners, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = + miopen::GetInterpolateBicubicBackwardWorkspaceSize(miopen::deref(handle), + miopen::deref(outputGradDesc), + miopen::deref(inputGradDesc), + miopen::deref(scaleFactorsDesc), + mode, + align_corners); + }); +} + +extern "C" miopenStatus_t miopenInterpolateBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + MIOPEN_LOG_FUNCTION(handle, + inputGradDesc, + input_grad, + outputGradDesc, + output_grad, + scaleFactorsDesc, + scale_factors, + mode, + align_corners); + + LogCmdInterpolate(inputGradDesc, outputGradDesc, false, mode); + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return miopen::try_([&] { + miopen::InterpolateNearestBackward(miopen::deref(handle), + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode); + }); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + return miopen::try_([&] { + miopen::InterpolateBicubicBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode, + align_corners); + }); + } + return miopen::try_([&] { + miopen::InterpolateLinearBackward(miopen::deref(handle), + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(outputGradDesc), + DataCast(output_grad), + 
miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode, + align_corners); + }); +} diff --git a/src/kernels/MIOpenInterpolate.cpp b/src/kernels/MIOpenInterpolate.cpp new file mode 100644 index 0000000000..4e3e5b2cf3 --- /dev/null +++ b/src/kernels/MIOpenInterpolate.cpp @@ -0,0 +1,967 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include <hip/hip_fp16.h> +#include <hip/hip_runtime.h> +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +__device__ inline FLOAT_ACCUM compute_linear_scale_factor(FLOAT_ACCUM scale_factor, + int64_t input_size, + int64_t output_size, + bool align_corners) +{ + if(align_corners) + { + if(input_size == 1) + { + return static_cast<FLOAT_ACCUM>(output_size); + } + return static_cast<FLOAT_ACCUM>(output_size - 1) / (input_size - 1); + } + else if(scale_factor == 0) + { + return static_cast<FLOAT_ACCUM>(output_size) / input_size; + } + else + { + return static_cast<FLOAT_ACCUM>(scale_factor); + } +} + +__device__ inline FLOAT_ACCUM +get_src_index(int64_t dest_index, FLOAT_ACCUM scale_factor, bool align_corners) +{ + if(align_corners) + { + return dest_index / scale_factor; + } + else + { + // Follow OpenCV resize logic.
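+ // Half-pixel convention: pixel i covers [i, i+1) with its center at i + 0.5, so the
+ // source coordinate is (dest + 0.5) / scale - 0.5. For example, upscaling H=2 to H=4
+ // (scale_factor = 2) maps dest 0,1,2,3 to src -0.25, 0.25, 0.75, 1.25; callers such as
+ // compute_source_index_and_lambda clamp negative values before splitting the result
+ // into an integer index and a fractional lambda weight.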
+ return (dest_index + 0.5f) / scale_factor - 0.5f; + } +} + +__device__ inline int64_t +linear_back_index(int64_t src, FLOAT_ACCUM scale_factor, bool align_corners) +{ + return static_cast<int64_t>(ceil(get_src_index(src, 1.f / scale_factor, align_corners))); +} + +__device__ inline void compute_linear_back_index_from_to(int64_t src, + int64_t input_size, + int64_t output_size, + FLOAT_ACCUM scale_factor, + bool align_corners, + int64_t* from, + int64_t* to) +{ + if(src - 1 < 1) + { + *from = 0; + } + else + { + *from = linear_back_index(src - 1, scale_factor, align_corners); + } + if(src + 1 > input_size) + { + *to = output_size; + } + else + { + *to = min(output_size, linear_back_index(src + 1, scale_factor, align_corners)); + } +} + +__device__ inline void compute_source_index_and_lambda(int64_t h, + FLOAT_ACCUM scale_factor, + int64_t Hin, + bool align_corners, + int64_t* hin_index0, + int64_t* hin_index1, + FLOAT_ACCUM* lambda0, + FLOAT_ACCUM* lambda1) +{ + FLOAT_ACCUM hin_index_actual = max(0., get_src_index(h, scale_factor, align_corners)); + *hin_index0 = static_cast<int64_t>(hin_index_actual); + *hin_index1 = min(*hin_index0 + 1, Hin - 1); + *lambda1 = hin_index_actual - *hin_index0; + *lambda0 = 1.f - *lambda1; +} + +__device__ inline FLOAT_ACCUM +get_back_lambda(int64_t src, int64_t src0, int64_t src1, FLOAT_ACCUM lambda0, FLOAT_ACCUM lambda1) +{ + if(src == src0) + { + if(src0 == src1) + { + return 1; // lambda0 + lambda1 = 1 + } + return lambda0; + } + if(src == src1) + { + return lambda1; + } + // This case can happen due to floating point multiplication. + // ex> 7 * (105/9) = 87 or 86.99999995 + return 0; +} + +__device__ inline FLOAT_ACCUM compute_back_lambda(int64_t dest, + int64_t src, + FLOAT_ACCUM scale_factor, + int64_t Hin, + int64_t Hout, + bool align_corners) +{ + if(Hin == Hout) + { + return 1; + } + int64_t index0; + int64_t index1; + FLOAT_ACCUM lambda0; + FLOAT_ACCUM lambda1; + compute_source_index_and_lambda( + dest, scale_factor, Hin, align_corners, &index0, &index1, &lambda0, &lambda1); + return get_back_lambda(src, index0, index1, lambda0, lambda1); +} + +template <typename TI, typename TO> +__device__ inline void interpolateLinearForward(const TI* __restrict__ input, + TO* __restrict__ output, + const tensor_view_t<3> input_tv, + const tensor_view_t<3> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<3>(output_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + + int64_t Hin = input_tv.size[2]; + int64_t Hout = output_tv.size[2]; + if(Hin == Hout || Hout == 1) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + return; + } + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + scale_factor_h = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + int64_t hin_index0; + int64_t hin_index1; + FLOAT_ACCUM lambda1; + FLOAT_ACCUM lambda0; + compute_source_index_and_lambda( + h, scale_factor_h, Hin, align_corners, &hin_index0, &hin_index1, &lambda0, &lambda1); + + tensor_layout_t<3> input_layout0(n, c, hin_index0); + + tensor_layout_t<3> input_layout1(n, c, hin_index1); + + FLOAT_ACCUM input0 = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout0)]); + FLOAT_ACCUM input1 = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout1)]); +
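+ // 1-D linear blend of the two neighboring source pixels: lambda1 is the fractional
+ // part of the source coordinate (its distance from hin_index0) and lambda0 = 1 - lambda1,
+ // e.g. src 1.25 gives 0.75 * input[1] + 0.25 * input[2].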
output[output_tv.get_tensor_view_idx(tensor_layout)] = + CVT_ACCUM2FLOAT(input0 * lambda0 + input1 * lambda1); +} + +extern "C" __global__ void InterpolateLinearForward(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + const tensor_view_t<3> input_tv, + const tensor_view_t<3> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateLinearForward( + input, output, input_tv, output_tv, nelems, scale_factors, align_corners); +} + +template +__device__ inline void interpolateLinearBackward(TO* __restrict__ input_grad, + const TI* __restrict__ output_grad, + const tensor_view_t<3> input_grad_tv, + const tensor_view_t<3> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<3>(input_grad_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + + int64_t Hin = input_grad_tv.size[2]; + int64_t Hout = output_grad_tv.size[2]; + + if(Hin == Hout) + { + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]; + return; + } + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + int64_t from, to; + compute_linear_back_index_from_to(h, Hin, Hout, scale_factor, align_corners, &from, &to); + + FLOAT_ACCUM output = 0; + for(int64_t i = from; i < to; i++) + { + tensor_layout_t<3> output_layout(n, c, i); + output += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + compute_back_lambda(i, h, scale_factor, Hin, Hout, align_corners); + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(output); +} + +extern "C" __global__ void InterpolateLinearBackward(OUTPUT_TYPE* __restrict__ input_grad, + const INPUT_TYPE* __restrict__ output_grad, + const tensor_view_t<3> input_grad_tv, + const tensor_view_t<3> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateLinearBackward(input_grad, + output_grad, + input_grad_tv, + output_grad_tv, + nelems, + scale_factors, + align_corners); +} + +template +__device__ inline void interpolateBilinearForward(const TI* __restrict__ input, + TO* __restrict__ output, + const tensor_view_t<4> input_tv, + const tensor_view_t<4> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<4>(output_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + int64_t w = tensor_layout.layout[3]; + + int64_t Hin = input_tv.size[2]; + int64_t Hout = output_tv.size[2]; + int64_t Win = input_tv.size[3]; + int64_t Wout = output_tv.size[3]; + + if(Hin == Hout && Win == Wout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + return; + } + + int64_t hin_index0 = h; + int64_t hin_index1 = h; + FLOAT_ACCUM hlambda0 = 1; + FLOAT_ACCUM hlambda1 = 0; + if(Hin != Hout && Hout != 1) + { + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor_h_ = + 
compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + compute_source_index_and_lambda( + h, scale_factor_h_, Hin, align_corners, &hin_index0, &hin_index1, &hlambda0, &hlambda1); + } + + int64_t win_index0 = w; + int64_t win_index1 = w; + FLOAT_ACCUM wlambda0 = 1; + FLOAT_ACCUM wlambda1 = 0; + if(Win != Wout && Wout != 1) + { + FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]); + FLOAT_ACCUM scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + compute_source_index_and_lambda( + w, scale_factor_w_, Win, align_corners, &win_index0, &win_index1, &wlambda0, &wlambda1); + } + + tensor_layout_t<4> input_layout00(n, c, hin_index0, win_index0); + tensor_layout_t<4> input_layout01(n, c, hin_index0, win_index1); + tensor_layout_t<4> input_layout10(n, c, hin_index1, win_index0); + tensor_layout_t<4> input_layout11(n, c, hin_index1, win_index1); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT( + (CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout00)]) * wlambda0 + + CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout01)]) * wlambda1) * + hlambda0 + + (CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout10)]) * wlambda0 + + CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout11)]) * wlambda1) * + hlambda1); +} + +extern "C" __global__ void InterpolateBilinearForward(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + const tensor_view_t<4> input_tv, + const tensor_view_t<4> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateBilinearForward( + input, output, input_tv, output_tv, nelems, scale_factors, align_corners); +} + +template +__device__ inline void interpolateBilinearBackward(TO* __restrict__ input_grad, + const TI* __restrict__ output_grad, + const tensor_view_t<4> input_grad_tv, + const tensor_view_t<4> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + int64_t w = tensor_layout.layout[3]; + + int64_t Hin = input_grad_tv.size[2]; + int64_t Hout = output_grad_tv.size[2]; + int64_t Win = input_grad_tv.size[3]; + int64_t Wout = output_grad_tv.size[3]; + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]); + FLOAT_ACCUM scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + int64_t h_from, h_to; + if(Hin == Hout) + { + h_from = h; + h_to = h + 1; + } + else + { + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + } + int64_t w_from, w_to; + if(Win == Wout) + { + w_from = w; + w_to = w + 1; + } + else + { + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + } + + FLOAT_ACCUM output = 0; + for(int64_t i = h_from; i < h_to; i++) + { + FLOAT_ACCUM h_lambda = compute_back_lambda(i, h, scale_factor_h_, Hin, Hout, align_corners); + if(h_lambda == 0.) 
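+ // compute_back_lambda returned a zero weight: output row i never sampled input
+ // row h in the forward pass, so it contributes nothing to the gradient; skip its columns.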
+ continue; + for(int64_t j = w_from; j < w_to; j++) + { + FLOAT_ACCUM w_lambda = + compute_back_lambda(j, w, scale_factor_w_, Win, Wout, align_corners); + + tensor_layout_t<4> output_layout(n, c, i, j); + + output += + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + h_lambda * w_lambda; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(output); +} + +extern "C" __global__ void InterpolateBilinearBackward(OUTPUT_TYPE* __restrict__ input_grad, + const INPUT_TYPE* __restrict__ output_grad, + const tensor_view_t<4> input_grad_tv, + const tensor_view_t<4> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateBilinearBackward(input_grad, + output_grad, + input_grad_tv, + output_grad_tv, + nelems, + scale_factors, + align_corners); +} + +template +__device__ inline void interpolateTrilinearBackward(TO* __restrict__ input_grad, + const TI* __restrict__ output_grad, + const tensor_view_t<5> input_grad_tv, + const tensor_view_t<5> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t d = tensor_layout.layout[2]; + int64_t h = tensor_layout.layout[3]; + int64_t w = tensor_layout.layout[4]; + + int64_t Din = input_grad_tv.size[2]; + int64_t Dout = output_grad_tv.size[2]; + int64_t Hin = input_grad_tv.size[3]; + int64_t Hout = output_grad_tv.size[3]; + int64_t Win = input_grad_tv.size[4]; + int64_t Wout = output_grad_tv.size[4]; + + FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor_d_ = + compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners); + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]); + FLOAT_ACCUM scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]); + FLOAT_ACCUM scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + int64_t d_from, d_to; + if(Din == Dout) + { + d_from = d; + d_to = d + 1; + } + else + { + compute_linear_back_index_from_to( + d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to); + } + int64_t h_from, h_to; + if(Hin == Hout) + { + h_from = h; + h_to = h + 1; + } + else + { + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + } + int64_t w_from, w_to; + if(Win == Wout) + { + w_from = w; + w_to = w + 1; + } + else + { + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + } + + FLOAT_ACCUM output = 0; + for(int64_t i = d_from; i < d_to; i++) + { + FLOAT_ACCUM d_lambda = compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners); + if(d_lambda == 0.f) + continue; + for(int64_t j = h_from; j < h_to; j++) + { + FLOAT_ACCUM h_lambda = + compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners); + if(h_lambda == 0.f) + continue; + for(int64_t k = w_from; k < w_to; k++) + { + FLOAT_ACCUM w_lambda = + compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners); + tensor_layout_t<5> output_layout(n, c, i, j, k); + + output += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + d_lambda * 
+template <typename TI, typename TO>
+__device__ inline void interpolateTrilinearBackward(TO* __restrict__ input_grad,
+                                                    const TI* __restrict__ output_grad,
+                                                    const tensor_view_t<5> input_grad_tv,
+                                                    const tensor_view_t<5> output_grad_tv,
+                                                    const size_t nelems,
+                                                    const float* scale_factors,
+                                                    const bool align_corners)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t d = tensor_layout.layout[2];
+    int64_t h = tensor_layout.layout[3];
+    int64_t w = tensor_layout.layout[4];
+
+    int64_t Din = input_grad_tv.size[2];
+    int64_t Dout = output_grad_tv.size[2];
+    int64_t Hin = input_grad_tv.size[3];
+    int64_t Hout = output_grad_tv.size[3];
+    int64_t Win = input_grad_tv.size[4];
+    int64_t Wout = output_grad_tv.size[4];
+
+    FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_d_ =
+        compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners);
+
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_h_ =
+        compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]);
+    FLOAT_ACCUM scale_factor_w_ =
+        compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+
+    int64_t d_from, d_to;
+    if(Din == Dout)
+    {
+        d_from = d;
+        d_to = d + 1;
+    }
+    else
+    {
+        compute_linear_back_index_from_to(
+            d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to);
+    }
+    int64_t h_from, h_to;
+    if(Hin == Hout)
+    {
+        h_from = h;
+        h_to = h + 1;
+    }
+    else
+    {
+        compute_linear_back_index_from_to(
+            h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to);
+    }
+    int64_t w_from, w_to;
+    if(Win == Wout)
+    {
+        w_from = w;
+        w_to = w + 1;
+    }
+    else
+    {
+        compute_linear_back_index_from_to(
+            w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to);
+    }
+
+    FLOAT_ACCUM output = 0;
+    for(int64_t i = d_from; i < d_to; i++)
+    {
+        FLOAT_ACCUM d_lambda =
+            compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners);
+        if(d_lambda == 0.f)
+            continue;
+        for(int64_t j = h_from; j < h_to; j++)
+        {
+            FLOAT_ACCUM h_lambda =
+                compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners);
+            if(h_lambda == 0.f)
+                continue;
+            for(int64_t k = w_from; k < w_to; k++)
+            {
+                FLOAT_ACCUM w_lambda =
+                    compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners);
+                tensor_layout_t<5> output_layout(n, c, i, j, k);
+
+                output += CVT_FLOAT2ACCUM(
+                              output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                          d_lambda * h_lambda * w_lambda;
+            }
+        }
+    }
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(output);
+}
+
+extern "C" __global__ void InterpolateTrilinearBackward(OUTPUT_TYPE* __restrict__ input_grad,
+                                                        const INPUT_TYPE* __restrict__ output_grad,
+                                                        const tensor_view_t<5> input_grad_tv,
+                                                        const tensor_view_t<5> output_grad_tv,
+                                                        const size_t nelems,
+                                                        const float* scale_factors,
+                                                        const bool align_corners)
+{
+    interpolateTrilinearBackward(input_grad,
+                                 output_grad,
+                                 input_grad_tv,
+                                 output_grad_tv,
+                                 nelems,
+                                 scale_factors,
+                                 align_corners);
+}
+
+__device__ inline FLOAT_ACCUM
+compute_scales_value(FLOAT_ACCUM scale, int64_t input_size, int64_t output_size)
+{
+    return (scale == 0.f) ? (static_cast<FLOAT_ACCUM>(input_size) / output_size)
+                          : (1.0f / scale);
+}
+
+__device__ inline int64_t
+nearest_idx(int64_t output_index, int64_t input_size, int64_t output_size, FLOAT_ACCUM scales)
+{
+    if(output_size == input_size)
+    {
+        return output_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return output_index / 2;
+    }
+    else
+    {
+        FLOAT_ACCUM scale = compute_scales_value(scales, input_size, output_size);
+        return min(static_cast<int64_t>(output_index * scale), input_size);
+    }
+}
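+
+// Example: Win = 4 -> Wout = 6 with a zero scale factor gives scale = 4/6, so output
+// columns 0..5 read input columns {0, 0, 1, 2, 2, 3}. The output_size == 2 * input_size
+// branch keeps exact 2x upsampling free of floating-point rounding.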
+template <typename TI, typename TO>
+__device__ inline void interpolateNearestForward(const TI* __restrict__ input,
+                                                 TO* __restrict__ output,
+                                                 const tensor_view_t<5> input_tv,
+                                                 const tensor_view_t<5> output_tv,
+                                                 const size_t nelems,
+                                                 const float* scale_factors)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<5>(output_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t d = tensor_layout.layout[2];
+    int64_t h = tensor_layout.layout[3];
+    int64_t w = tensor_layout.layout[4];
+
+    int64_t Dout = output_tv.size[2];
+    int64_t Hout = output_tv.size[3];
+    int64_t Wout = output_tv.size[4];
+    int64_t Din = input_tv.size[2];
+    int64_t Hin = input_tv.size[3];
+    int64_t Win = input_tv.size[4];
+
+    FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]);
+
+    int64_t x = nearest_idx(d, Din, Dout, scale_factor_d);
+    int64_t y = nearest_idx(h, Hin, Hout, scale_factor_h);
+    int64_t z = nearest_idx(w, Win, Wout, scale_factor_w);
+
+    tensor_layout_t<5> input_layout(n, c, x, y, z);
+
+    output[output_tv.get_tensor_view_idx(tensor_layout)] =
+        input[input_tv.get_tensor_view_idx(input_layout)];
+}
+
+extern "C" __global__ void InterpolateNearestForward(const INPUT_TYPE* __restrict__ input,
+                                                     OUTPUT_TYPE* __restrict__ output,
+                                                     const tensor_view_t<5> input_tv,
+                                                     const tensor_view_t<5> output_tv,
+                                                     const size_t nelems,
+                                                     const float* scale_factors)
+{
+    interpolateNearestForward<INPUT_TYPE, OUTPUT_TYPE>(
+        input, output, input_tv, output_tv, nelems, scale_factors);
+}
+
+__device__ inline int64_t
+nearest_idx_back(int64_t input_index, int64_t input_size, int64_t output_size, FLOAT_ACCUM scales)
+{
+    if(output_size == input_size)
+    {
+        return input_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return input_index * 2;
+    }
+    else
+    {
+        FLOAT_ACCUM scale = compute_scales_value(scales, input_size, output_size);
+        return min(static_cast<int64_t>(ceil(input_index / scale)), output_size);
+    }
+}
+
+template <typename TI, typename TO>
+__device__ inline void interpolateNearestBackward(TO* __restrict__ input_grad,
+                                                  const TI* __restrict__ output_grad,
+                                                  const tensor_view_t<5> input_grad_tv,
+                                                  const tensor_view_t<5> output_grad_tv,
+                                                  const size_t nelems,
+                                                  const float* scale_factors)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t x = tensor_layout.layout[2];
+    int64_t y = tensor_layout.layout[3];
+    int64_t z = tensor_layout.layout[4];
+
+    int64_t Dout = output_grad_tv.size[2];
+    int64_t Hout = output_grad_tv.size[3];
+    int64_t Wout = output_grad_tv.size[4];
+    int64_t Din = input_grad_tv.size[2];
+    int64_t Hin = input_grad_tv.size[3];
+    int64_t Win = input_grad_tv.size[4];
+
+    FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]);
+
+    int64_t dstart = nearest_idx_back(x, Din, Dout, scale_factor_d);
+    int64_t dlimit = nearest_idx_back(x + 1, Din, Dout, scale_factor_d);
+    int64_t hstart = nearest_idx_back(y, Hin, Hout, scale_factor_h);
+    int64_t hlimit = nearest_idx_back(y + 1, Hin, Hout, scale_factor_h);
+    int64_t wstart = nearest_idx_back(z, Win, Wout, scale_factor_w);
+    int64_t wlimit = nearest_idx_back(z + 1, Win, Wout, scale_factor_w);
+
+    FLOAT_ACCUM grad = 0.f;
+    for(int64_t d = dstart; d < dlimit; d++)
+    {
+        for(int64_t h = hstart; h < hlimit; h++)
+        {
+            for(int64_t w = wstart; w < wlimit; w++)
+            {
+                tensor_layout_t<5> output_grad_layout(n, c, d, h, w);
+                grad += CVT_FLOAT2ACCUM(
+                    output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]);
+            }
+        }
+    }
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(grad);
+}
+
+extern "C" __global__ void InterpolateNearestBackward(OUTPUT_TYPE* __restrict__ input_grad,
+                                                      const INPUT_TYPE* __restrict__ output_grad,
+                                                      const tensor_view_t<5> input_grad_tv,
+                                                      const tensor_view_t<5> output_grad_tv,
+                                                      const size_t nelems,
+                                                      const float* scale_factors)
+{
+    interpolateNearestBackward<INPUT_TYPE, OUTPUT_TYPE>(
+        input_grad, output_grad, input_grad_tv, output_grad_tv, nelems, scale_factors);
+}
+
+__device__ inline FLOAT_ACCUM
+bicubic_idx(int64_t output_index, int64_t output_size, FLOAT_ACCUM scale_factor, bool align_corners)
+{
+    if(output_size == 1)
+    {
+        if(align_corners)
+        {
+            return 0;
+        }
+        return -0.5f;
+    }
+    return get_src_index(output_index, scale_factor, align_corners);
+}
+
+__device__ inline FLOAT_ACCUM cubic_convolution1(FLOAT_ACCUM x, FLOAT_ACCUM A)
+{
+    return ((A + 2) * x - (A + 3)) * x * x + 1;
+}
+
+__device__ inline FLOAT_ACCUM cubic_convolution2(FLOAT_ACCUM x, FLOAT_ACCUM A)
+{
+    return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+__device__ inline void get_cubic_upsampling_coefficients(FLOAT_ACCUM coeffs[4], FLOAT_ACCUM t)
+{
+    FLOAT_ACCUM A = -0.75f;
+
+    FLOAT_ACCUM x1 = t;
+    coeffs[0] = cubic_convolution2(x1 + 1.0f, A);
+    coeffs[1] = cubic_convolution1(x1, A);
+
+    FLOAT_ACCUM x2 = 1.0f - t;
+    coeffs[2] = cubic_convolution1(x2, A);
+    coeffs[3] = cubic_convolution2(x2 + 1.0f, A);
+}
+
+__device__ inline FLOAT_ACCUM
+cubic_interp1d(FLOAT_ACCUM x0, FLOAT_ACCUM x1, FLOAT_ACCUM x2, FLOAT_ACCUM x3, FLOAT_ACCUM t)
+{
+    FLOAT_ACCUM coeffs[4];
+    get_cubic_upsampling_coefficients(coeffs, t);
+
+    return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
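+
+// These two polynomials implement the Keys cubic convolution kernel with A = -0.75 (the
+// PyTorch/OpenCV choice): cubic_convolution1 covers |x| <= 1 and cubic_convolution2
+// covers 1 < |x| < 2. For any t the four weights sum to 1; t = 0 yields {0, 1, 0, 0}
+// (the sample is reproduced exactly) and t = 0.5 yields
+// {-0.09375, 0.59375, 0.59375, -0.09375}.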
+__device__ inline int64_t bound(int64_t p, int64_t max_size)
+{
+    return max(min(p, max_size - 1), 0l);
+}
+
+template <typename TI, typename TO>
+__device__ inline void interpolateBicubicForward(const TI* __restrict__ input,
+                                                 TO* __restrict__ output,
+                                                 const tensor_view_t<4> input_tv,
+                                                 const tensor_view_t<4> output_tv,
+                                                 const size_t nelems,
+                                                 const float* scale_factors,
+                                                 const bool align_corners)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<4>(output_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t h = tensor_layout.layout[2];
+    int64_t w = tensor_layout.layout[3];
+
+    int64_t Hin = input_tv.size[2];
+    int64_t Win = input_tv.size[3];
+    int64_t Hout = output_tv.size[2];
+    int64_t Wout = output_tv.size[3];
+    if(Hin == Hout && Win == Wout)
+    {
+        output[output_tv.get_tensor_view_idx(tensor_layout)] =
+            input[input_tv.get_tensor_view_idx(tensor_layout)];
+        return;
+    }
+
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h_ =
+        compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+    FLOAT_ACCUM real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+    int64_t in_y = static_cast<int64_t>(floor(real_y));
+    FLOAT_ACCUM t_y = real_y - static_cast<FLOAT_ACCUM>(in_y);
+
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w_ =
+        compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+    FLOAT_ACCUM real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+    int64_t in_x = static_cast<int64_t>(floor(real_x));
+    FLOAT_ACCUM t_x = real_x - static_cast<FLOAT_ACCUM>(in_x);
+
+    FLOAT_ACCUM coefficients[4];
+#pragma unroll
+    for(int k = 0; k < 4; k++)
+    {
+        int64_t y = bound(in_y - 1 + k, Hin);
+        tensor_layout_t<4> input_layout0(n, c, y, bound(in_x - 1, Win));
+        tensor_layout_t<4> input_layout1(n, c, y, bound(in_x, Win));
+        tensor_layout_t<4> input_layout2(n, c, y, bound(in_x + 1, Win));
+        tensor_layout_t<4> input_layout3(n, c, y, bound(in_x + 2, Win));
+
+        coefficients[k] =
+            cubic_interp1d(CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout0)]),
+                           CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout1)]),
+                           CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout2)]),
+                           CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout3)]),
+                           t_x);
+    }
+
+    output[output_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(
+        cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y));
+}
+
+extern "C" __global__ void InterpolateBicubicForward(const INPUT_TYPE* __restrict__ input,
+                                                     OUTPUT_TYPE* __restrict__ output,
+                                                     const tensor_view_t<4> input_tv,
+                                                     const tensor_view_t<4> output_tv,
+                                                     const size_t nelems,
+                                                     const float* scale_factors,
+                                                     const bool align_corners)
+{
+    interpolateBicubicForward<INPUT_TYPE, OUTPUT_TYPE>(
+        input, output, input_tv, output_tv, nelems, scale_factors, align_corners);
+}
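+
+// bound() clamps out-of-range taps, so the 4x4 bicubic window degrades to border
+// replication at the edges, matching the usual border convention for bicubic resampling.
+// As in the other forward kernels, one thread produces exactly one output element.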
+template <typename TI, typename TD>
+__device__ inline void interpolateBicubicBackward(TD* __restrict__ workspace,
+                                                  const TI* __restrict__ output_grad,
+                                                  const tensor_view_t<4> input_grad_tv,
+                                                  const tensor_view_t<4> output_grad_tv,
+                                                  const size_t nelems,
+                                                  const float* scale_factors,
+                                                  const bool align_corners)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<4>(output_grad_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t h = tensor_layout.layout[2];
+    int64_t w = tensor_layout.layout[3];
+
+    int64_t Hin = input_grad_tv.size[2];
+    int64_t Hout = output_grad_tv.size[2];
+    int64_t Win = input_grad_tv.size[3];
+    int64_t Wout = output_grad_tv.size[3];
+
+    if(Hin == Hout && Win == Wout)
+    {
+        workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+            CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]);
+        return;
+    }
+
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h_ =
+        compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+    FLOAT_ACCUM real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+    int64_t in_y = static_cast<int64_t>(floor(real_y));
+    FLOAT_ACCUM t_y = real_y - static_cast<FLOAT_ACCUM>(in_y);
+
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w_ =
+        compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+    FLOAT_ACCUM real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+    int64_t in_x = static_cast<int64_t>(floor(real_x));
+    FLOAT_ACCUM t_x = real_x - static_cast<FLOAT_ACCUM>(in_x);
+
+    FLOAT_ACCUM y_coeffs[4];
+    FLOAT_ACCUM x_coeffs[4];
+    get_cubic_upsampling_coefficients(y_coeffs, t_y);
+    get_cubic_upsampling_coefficients(x_coeffs, t_x);
+
+    FLOAT_ACCUM out_value =
+        CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]);
+#pragma unroll
+    for(int i = 0; i < 4; i++)
+    {
+        int64_t input_h = bound(in_y - 1 + i, Hin);
+#pragma unroll
+        for(int j = 0; j < 4; j++)
+        {
+            int64_t input_w = bound(in_x - 1 + j, Win);
+            tensor_layout_t<4> in_grad_layout(n, c, input_h, input_w);
+
+            atomicAdd(workspace + input_grad_tv.get_tensor_view_idx(in_grad_layout),
+                      out_value * y_coeffs[i] * x_coeffs[j]);
+        }
+    }
+}
+
+template <typename TD, typename TO>
+__device__ inline void interpolateBicubicBackward_paste(TO* __restrict__ input_grad,
+                                                        const TD* __restrict__ workspace,
+                                                        const tensor_view_t<4> input_grad_tv,
+                                                        const size_t nelems)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid);
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+        CVT_ACCUM2FLOAT(workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)]);
+}
+
+extern "C" __global__ void InterpolateBicubicBackward(DTYPE* __restrict__ workspace,
+                                                      const INPUT_TYPE* __restrict__ output_grad,
+                                                      const tensor_view_t<4> input_grad_tv,
+                                                      const tensor_view_t<4> output_grad_tv,
+                                                      const size_t nelems,
+                                                      const float* scale_factors,
+                                                      const bool align_corners)
+{
+    interpolateBicubicBackward(workspace,
+                               output_grad,
+                               input_grad_tv,
+                               output_grad_tv,
+                               nelems,
+                               scale_factors,
+                               align_corners);
+}
+
+extern "C" __global__ void InterpolateBicubicBackward_paste(OUTPUT_TYPE* __restrict__ input_grad,
+                                                            const DTYPE* __restrict__ workspace,
+                                                            const tensor_view_t<4> input_grad_tv,
+                                                            const size_t nelems)
+{
+    interpolateBicubicBackward_paste<DTYPE, OUTPUT_TYPE>(
+        input_grad, workspace, input_grad_tv, nelems);
+}
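+
+// Bicubic backward is the one two-pass path: each output-gradient element scatters its
+// sixteen weighted taps into a float workspace with atomicAdd (avoiding fp16/bf16
+// atomics), and the _paste kernel then converts the accumulated workspace into the
+// destination type. The solver skips the paste pass for miopenFloat, where the kernel
+// can write input_grad directly.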
diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp
new file mode 100644
index 0000000000..9d0a3d28e9
--- /dev/null
+++ b/src/kernels/tensor_view.hpp
@@ -0,0 +1,118 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef GUARD_TENSOR_VIEW_H
+#define GUARD_TENSOR_VIEW_H
+
+template <int N>
+struct tensor_layout_t;
+
+template <int N>
+struct tensor_view_t
+{
+    // Get index in tensor view at tensor layout
+    constexpr uint64_t get_tensor_view_idx(const tensor_layout_t<N>& tensor_layout) const
+    {
+        static_assert(N > 0);
+        uint64_t idx = 0;
+        for(auto i = 0; i < N; ++i)
+        {
+            idx += stride[i] * tensor_layout.layout[i];
+        }
+        return idx;
+    }
+    uint64_t stride[N];
+    uint64_t size[N];
+};
+
+template <int N>
+struct tensor_layout_t
+{
+    // Make tensor layout at index using tensor view
+    constexpr tensor_layout_t(const tensor_view_t<N>& tensor_view, uint64_t idx)
+    {
+        static_assert(N > 0);
+        uint64_t temp = idx;
+        if constexpr(N == 1)
+        {
+            layout[0] = idx;
+        }
+        else
+        {
+            for(auto i = N - 1; i > 1; --i)
+            {
+                layout[i] = temp % tensor_view.size[i];
+                temp = temp / tensor_view.size[i];
+            }
+            layout[1] = temp % tensor_view.size[1];
+            layout[0] = temp / tensor_view.size[1];
+        }
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w)
+    {
+        static_assert(N == 5);
+        layout[0] = n;
+        layout[1] = c;
+        layout[2] = d;
+        layout[3] = h;
+        layout[4] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w)
+    {
+        static_assert(N == 4);
+        layout[0] = n;
+        layout[1] = c;
+        layout[2] = h;
+        layout[3] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w)
+    {
+        static_assert(N == 3);
+        layout[0] = n;
+        layout[1] = h;
+        layout[2] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t w)
+    {
+        static_assert(N == 2);
+        layout[0] = n;
+        layout[1] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n)
+    {
+        static_assert(N == 1);
+        layout[0] = n;
+    }
+
+    uint64_t layout[N];
+};
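+
+// Round-trip example: for a contiguous 4x3 view (size = {4, 3}, stride = {3, 1}),
+// tensor_layout_t<2>(view, 7) decomposes the flat index 7 into layout = {2, 1}, and
+// get_tensor_view_idx maps {2, 1} back to 2 * 3 + 1 * 1 = 7. Non-contiguous strides only
+// change the second mapping, which is how the kernels address permuted layouts.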
+
+#endif // GUARD_TENSOR_VIEW_H
diff --git a/src/solver.cpp b/src/solver.cpp
index f45f3058a6..9bc48f0423 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -649,6 +650,43 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId());
 
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateNearestForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateLinearForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBilinearForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBicubicForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateNearestBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateLinearBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBilinearBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateTrilinearBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBicubicBackward{}.SolverDbId());
+
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/interpolate/bwd_bicubic_interpolate.cpp b/src/solver/interpolate/bwd_bicubic_interpolate.cpp
new file mode 100644
index 0000000000..3a534180a0
--- /dev/null
+++ b/src/solver/interpolate/bwd_bicubic_interpolate.cpp
@@ -0,0 +1,208 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_BICUBIC 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBicubicBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    float scale_h =
+        static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+    float scale_w =
+        static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+
+    if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(scale_h + scale_w < 8 && scale_h + scale_w > 1.4)
+            return true;
+        else
+            return false;
+    }
+    else
+    {
+        if(output_grad_desc.GetLengths()[2] + output_grad_desc.GetLengths()[3] <= 256 &&
+           (input_grad_desc.GetElementSize() >= 10000))
+            return true;
+        else
+            return false;
+    }
+}
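+
+// "IsOverRocm*" predicates gate each solver to the shapes where this HIP implementation
+// is expected to beat the existing ROCm path; the dtype- and scale-dependent cutoffs read
+// like empirical tuning thresholds rather than correctness constraints, so out-of-range
+// problems simply fall through to another path instead of failing.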
"ushort" : output_dtype}, + {"DTYPE", "float"}, + }; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_BICUBIC}, + {N_total}, + "MIOpenInterpolate.cpp", + "InterpolateBicubicBackward", + build_params)); + + if(dtype != miopenFloat) + { + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_BICUBIC}, + {N_total_paste}, + "MIOpenInterpolate.cpp", + "InterpolateBicubicBackward_paste", + build_params)); + } + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + auto dtype = deref(params.inputGradDesc).GetType(); + size_t nelems = params.outputGradDesc->GetElementSize(); + + int kernelCnt = 0; + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + + float elapsed = 0.0f; + HipEventPtr start; + HipEventPtr stop; + + const bool profiling = handle_.IsProfilingEnabled(); + if(kernels.size() > 1 && profiling) + { + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + handle_.EnableProfiling(false); + hipEventRecord(start.get(), handle_.GetStream()); + } + + if(dtype == miopenFloat) + { + kernel(params.input_grad, + params.output_grad, + input_grad_tv, + output_grad_tv, + nelems, + params.scale_factors, + params.align_corners); + } + else + { + kernel(params.workspace, + params.output_grad, + input_grad_tv, + output_grad_tv, + nelems, + params.scale_factors, + params.align_corners); + + nelems = params.inputGradDesc->GetElementSize(); + kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input_grad, params.workspace, input_grad_tv, nelems); + } + + if(kernels.size() > 1 && profiling) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + handle_.EnableProfiling(true); + }; + }; + }; + + return result; +} + +std::size_t InterpolateBicubicBackward::GetWorkspaceSize( + const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const +{ + return problem.GetInputGradDesc().GetElementSize() * sizeof(float); +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/interpolate/bwd_bilinear_interpolate.cpp b/src/solver/interpolate/bwd_bilinear_interpolate.cpp new file mode 100644 index 0000000000..36050a926f --- /dev/null +++ b/src/solver/interpolate/bwd_bilinear_interpolate.cpp @@ -0,0 +1,142 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+
+std::size_t InterpolateBicubicBackward::GetWorkspaceSize(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    return problem.GetInputGradDesc().GetElementSize() * sizeof(float);
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_bilinear_interpolate.cpp b/src/solver/interpolate/bwd_bilinear_interpolate.cpp
new file mode 100644
index 0000000000..36050a926f
--- /dev/null
+++ b/src/solver/interpolate/bwd_bilinear_interpolate.cpp
@@ -0,0 +1,142 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_BILINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBilinearBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    float scale_h =
+        static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+    float scale_w =
+        static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+
+    if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(scale_h + scale_w < 2)
+            return false;
+    }
+    else if(dtype == miopenFloat)
+    {
+        if(scale_h + scale_w < 14)
+            return false;
+    }
+
+    return true;
+}
+
+bool InterpolateBilinearBackward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_BILINEAR)
+        return false;
+    if(!IsOverRocmBilinearBwd(problem))
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateBilinearBackward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType());
+
+    {
+        auto dtype = problem.GetInputGradDesc().GetType();
+        size_t N_total = problem.GetInputGradDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_BILINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateBilinearBackward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::BwdInvokeParams>();
+
+            auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc));
+            auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc));
+            size_t nelems = params.inputGradDesc->GetElementSize();
+
+            kernel(params.input_grad,
+                   params.output_grad,
+                   input_grad_tv,
+                   output_grad_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_linear_interpolate.cpp b/src/solver/interpolate/bwd_linear_interpolate.cpp
new file mode 100644
index 0000000000..b105417853
--- /dev/null
+++ b/src/solver/interpolate/bwd_linear_interpolate.cpp
@@ -0,0 +1,134 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_LINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmLinearBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    if(dtype == miopenFloat)
+    {
+        if(input_grad_desc.GetElementSize() < 4000)
+            return false;
+    }
+    else if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(input_grad_desc.GetElementSize() < 960)
+            return false;
+    }
+
+    return true;
+}
+
+bool InterpolateLinearBackward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_LINEAR)
+        return false;
+    if(!IsOverRocmLinearBwd(problem))
+        return false;
+    return true;
+}
+
+ConvSolution InterpolateLinearBackward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType());
+
+    {
+        auto dtype = problem.GetInputGradDesc().GetType();
+        size_t N_total = problem.GetInputGradDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_LINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateLinearBackward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::BwdInvokeParams>();
+
+            auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.inputGradDesc));
+            auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.outputGradDesc));
+            size_t nelems = params.inputGradDesc->GetElementSize();
+
+            kernel(params.input_grad,
+                   params.output_grad,
+                   input_grad_tv,
+                   output_grad_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_nearest_interpolate.cpp b/src/solver/interpolate/bwd_nearest_interpolate.cpp
new file mode 100644
index 0000000000..d66fc78829
--- /dev/null
+++ b/src/solver/interpolate/bwd_nearest_interpolate.cpp
@@ -0,0 +1,150 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/activ.hpp"
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_NEAREST 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmNearestBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    if(input_grad_desc.GetLengths().size() == 3)
+    {
+        if(output_grad_desc.GetElementSize() < 8000 || input_grad_desc.GetLengths()[0] < 10)
+            return false;
+    }
+    else if(input_grad_desc.GetLengths().size() == 4)
+    {
+        float scale_h =
+            static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+        float scale_w =
+            static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+
+        if(input_grad_desc.GetLengths()[0] < 10 || (scale_h + scale_w <= 4))
+            return false;
+    }
+    else if(input_grad_desc.GetLengths().size() == 5)
+    {
+        float scale_h =
+            static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+        float scale_w =
+            static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+        float scale_d =
+            static_cast<float>(output_grad_desc.GetLengths()[4]) / input_grad_desc.GetLengths()[4];
+
+        if(scale_h + scale_w + scale_d < 6)
+            return false;
+    }
+
+    return true;
+}
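+
+// A single 5D kernel serves the 3D/4D/5D nearest cases; get_inner_expanded_tv<5> on the
+// host side presumably pads the missing spatial dimensions with size-1 axes, so this
+// heuristic is the only place that branches on tensor rank.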
+
+bool InterpolateNearestBackward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_NEAREST)
+        return false;
+    if(!IsOverRocmNearestBwd(problem))
+        return false;
+    return true;
+}
+
+ConvSolution InterpolateNearestBackward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType());
+
+    {
+        auto dtype = problem.GetInputGradDesc().GetType();
+        size_t N_total = problem.GetInputGradDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_NEAREST},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateNearestBackward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::BwdInvokeParams>();
+
+            auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc));
+            auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc));
+            size_t nelems = params.inputGradDesc->GetElementSize();
+
+            kernel(params.input_grad,
+                   params.output_grad,
+                   input_grad_tv,
+                   output_grad_tv,
+                   nelems,
+                   params.scale_factors);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_trilinear_interpolate.cpp b/src/solver/interpolate/bwd_trilinear_interpolate.cpp
new file mode 100644
index 0000000000..b153070dd5
--- /dev/null
+++ b/src/solver/interpolate/bwd_trilinear_interpolate.cpp
@@ -0,0 +1,145 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/activ.hpp"
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_TRILINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmTrilinearBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    float scale_h =
+        static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+    float scale_w =
+        static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+    float scale_d =
+        static_cast<float>(output_grad_desc.GetLengths()[4]) / input_grad_desc.GetLengths()[4];
+
+    if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(scale_h + scale_w + scale_d < 3.1f)
+            return false;
+    }
+    else if(dtype == miopenFloat)
+    {
+        if(scale_h + scale_w + scale_d <= 6.0f)
+            return false;
+    }
+
+    return true;
+}
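+
+// Naming caveat: for a 5D NCDHW tensor, GetLengths()[2] is depth, so the ratios labelled
+// scale_h / scale_w / scale_d above are really the D / H / W ratios respectively. Only
+// their sum feeds the threshold, so behaviour is unaffected, but the names should not be
+// read literally.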
"ushort" : output_dtype}, + {"DTYPE", "float"}, + }; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_TRILINEAR}, + {N_total}, + "MIOpenInterpolate.cpp", + "InterpolateTrilinearBackward", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); + size_t nelems = params.inputGradDesc->GetElementSize(); + + kernel(params.input_grad, + params.output_grad, + input_grad_tv, + output_grad_tv, + nelems, + params.scale_factors, + params.align_corners); + }; + }; + + return result; +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/interpolate/fwd_bicubic_interpolate.cpp b/src/solver/interpolate/fwd_bicubic_interpolate.cpp new file mode 100644 index 0000000000..22d8d594c1 --- /dev/null +++ b/src/solver/interpolate/fwd_bicubic_interpolate.cpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/src/solver/interpolate/fwd_bicubic_interpolate.cpp b/src/solver/interpolate/fwd_bicubic_interpolate.cpp
new file mode 100644
index 0000000000..22d8d594c1
--- /dev/null
+++ b/src/solver/interpolate/fwd_bicubic_interpolate.cpp
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_BICUBIC 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBicubicFwd(const miopen::interpolate::FwdProblemDescription& problem)
+{
+    TensorDescriptor output_desc = problem.GetOutputDesc();
+    TensorDescriptor input_desc = problem.GetInputDesc();
+
+    float scale_h = static_cast<float>(output_desc.GetLengths()[2]) / input_desc.GetLengths()[2];
+    float scale_w = static_cast<float>(output_desc.GetLengths()[3]) / input_desc.GetLengths()[3];
+
+    if((output_desc.GetLengths()[2] + output_desc.GetLengths()[3] > 256) &&
+       (scale_h + scale_w >= 2))
+        return false;
+
+    return true;
+}
+
+bool InterpolateBicubicForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_BICUBIC)
+        return false;
+    if(!IsOverRocmBicubicFwd(problem))
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateBicubicForward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType());
+
+    {
+        auto dtype = problem.GetOutputDesc().GetType();
+        size_t N_total = problem.GetOutputDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_BICUBIC},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateBicubicForward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::FwdInvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc));
+            size_t nelems = params.outputDesc->GetElementSize();
+
+            kernel(params.input,
+                   params.output,
+                   input_tv,
+                   output_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
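+
+// All forward solvers size their grid from the *output* descriptor (N_total here, nelems
+// in the invoker): one thread per output element, gathering from the input, which is why
+// none of the forward paths needs a workspace or atomics.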
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/fwd_bilinear_interpolate.cpp b/src/solver/interpolate/fwd_bilinear_interpolate.cpp
new file mode 100644
index 0000000000..dd93c3d9b2
--- /dev/null
+++ b/src/solver/interpolate/fwd_bilinear_interpolate.cpp
@@ -0,0 +1,128 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_BILINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBilinearFwd(const miopen::interpolate::FwdProblemDescription& problem)
+{
+    TensorDescriptor output_desc = problem.GetOutputDesc();
+
+    if(output_desc.GetLengths()[2] + output_desc.GetLengths()[3] > 256)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+bool InterpolateBilinearForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_BILINEAR)
+        return false;
+    if(!IsOverRocmBilinearFwd(problem))
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateBilinearForward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType());
+
+    {
+        auto dtype = problem.GetOutputDesc().GetType();
+        size_t N_total = problem.GetOutputDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_BILINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateBilinearForward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::FwdInvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc));
+            size_t nelems = params.outputDesc->GetElementSize();
+
+            kernel(params.input,
+                   params.output,
+                   input_tv,
+                   output_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/fwd_linear_interpolate.cpp b/src/solver/interpolate/fwd_linear_interpolate.cpp
new file mode 100644
index 0000000000..2df80d058c
--- /dev/null
+++ b/src/solver/interpolate/fwd_linear_interpolate.cpp
@@ -0,0 +1,114 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_LINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool InterpolateLinearForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_LINEAR)
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateLinearForward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType());
+
+    {
+        auto dtype = problem.GetOutputDesc().GetType();
+        size_t N_total = problem.GetOutputDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_LINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateLinearForward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::FwdInvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc));
+            size_t nelems = params.outputDesc->GetElementSize();
+
+            kernel(params.input,
+                   params.output,
+                   input_tv,
+                   output_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
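+
+// The 1D (linear) path uses rank-3 views: tensor_view_t<3> spans (N, C, W) here, so the
+// same indexing machinery as the 4D/5D kernels applies with W as the only spatial axis.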
diff --git a/src/solver/interpolate/fwd_nearest_interpolate.cpp b/src/solver/interpolate/fwd_nearest_interpolate.cpp
new file mode 100644
index 0000000000..95250ef03b
--- /dev/null
+++ b/src/solver/interpolate/fwd_nearest_interpolate.cpp
@@ -0,0 +1,127 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include <miopen/interpolate/solvers.hpp>
+#include <miopen/interpolate/invoke_params.hpp>
+
+#include <miopen/interpolate.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/target_properties.hpp>
+
+#define LOCAL_SIZE_FWD_NEAREST 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmNearestFwd(const miopen::interpolate::FwdProblemDescription& problem)
+{
+    TensorDescriptor input_desc = problem.GetInputDesc();
+    if(input_desc.GetLengths().size() == 3)
+    {
+        size_t nelems = problem.GetInputDesc().GetElementSize();
+        if(nelems < 4096)
+            return false;
+    }
+    else if(input_desc.GetLengths().size() == 4 || input_desc.GetLengths().size() == 5)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+bool InterpolateNearestForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_NEAREST)
+        return false;
+    if(!IsOverRocmNearestFwd(problem))
+        return false;
+
+    return true;
+}
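+
+// Worked example of the applicability gate above:
+//   {16, 256, 1}      -> 3-D input with 16 * 256 * 1 = 4096 elements; 4096 is
+//                        not < 4096, so this solver is used.
+//   {16, 256, 20, 20} -> 4-D input; rejected here regardless of size and
+//                        handled by another path.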
"ushort" : output_dtype}, + {"DTYPE", "float"}, + }; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_NEAREST}, + {N_total}, + "MIOpenInterpolate.cpp", + "InterpolateNearestForward", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); + size_t nelems = params.outputDesc->GetElementSize(); + + kernel(params.input, params.output, input_tv, output_tv, nelems, params.scale_factors); + }; + }; + + return result; +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_interpolate.hpp b/test/cpu_interpolate.hpp new file mode 100644 index 0000000000..98b4a53ddb --- /dev/null +++ b/test/cpu_interpolate.hpp @@ -0,0 +1,984 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_INTERPOLATE_HPP +#define GUARD_CPU_INTERPOLATE_HPP + +#include "miopen/miopen.h" +#include "tensor_holder.hpp" +#include + +inline float compute_linear_scale_factor(float scale_factor, + int64_t input_size, + int64_t output_size, + bool align_corners) +{ + if(align_corners) + { + if(input_size == 1) + { + return static_cast(output_size); + } + return static_cast(output_size - 1) / (input_size - 1); + } + else if(scale_factor == 0) + { + return static_cast(output_size) / input_size; + } + else + { + return static_cast(scale_factor); + } +} + +inline float get_src_index(int64_t dest_index, float scale_factor, bool align_corners) +{ + if(align_corners) + { + return dest_index / scale_factor; + } + else + { + return (dest_index + 0.5f) / scale_factor - 0.5f; + } +} + +inline int64_t linear_back_index(int64_t src, float scale_factor, bool align_corners) +{ + return static_cast(std::ceil(get_src_index(src, 1.f / scale_factor, align_corners))); +} + +inline void compute_linear_back_index_from_to(int64_t src, + int64_t input_isze, + int64_t output_size, + float scale_factor, + bool align_corners, + int64_t* from, + int64_t* to) +{ + if(src - 1 < 1) + { + *from = 0; + } + else + { + *from = linear_back_index(src - 1, scale_factor, align_corners); + } + if(src + 1 > input_isze) + { + *to = output_size; + } + else + { + *to = std::min(output_size, linear_back_index(src + 1, scale_factor, align_corners)); + } +} + +inline void compute_source_index_and_lambda(int64_t h, + float scale_factor, + int64_t Hin, + int64_t Hout, + bool align_corners, + int64_t* hin_index0, + int64_t* hin_index1, + float* lambda0, + float* lambda1) +{ + float hin_index_actual = static_cast( + std::max(static_cast(0.), get_src_index(h, scale_factor, align_corners))); + *hin_index0 = static_cast(hin_index_actual); + *hin_index1 = std::min(*hin_index0 + 1, Hin - 1); + *lambda1 = hin_index_actual - *hin_index0; + *lambda0 = 1.f - *lambda1; +} + +inline float get_back_lambda(int64_t src, int64_t src0, int64_t src1, float lambda0, float lambda1) +{ + if(src == src0) + { + if(src0 == src1) + { + return 1; // lambda0 + lambda1 = 1 + } + return lambda0; + } + if(src == src1) + { + return lambda1; + } + // This case can happen due to floating point mutiplification. 
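+
+// Numeric check of the mapping above (Hin = 2, Hout = 4, align_corners = false):
+//   scale = compute_linear_scale_factor(0, 2, 4, false) = 4 / 2 = 2
+//   h = 1: src = (1 + 0.5) / 2 - 0.5 = 0.25
+//          -> hin_index0 = 0, hin_index1 = 1, lambda1 = 0.25, lambda0 = 0.75
+// With align_corners = true the scale is (4 - 1) / (2 - 1) = 3, so h = 3 maps
+// exactly onto source index 1.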
+
+inline float get_back_lambda(int64_t src, int64_t src0, int64_t src1, float lambda0, float lambda1)
+{
+    if(src == src0)
+    {
+        if(src0 == src1)
+        {
+            return 1; // lambda0 + lambda1 = 1
+        }
+        return lambda0;
+    }
+    if(src == src1)
+    {
+        return lambda1;
+    }
+    // This case can happen due to floating-point multiplication: a product that
+    // is mathematically exact (e.g. 87) may be computed as 86.99999995, so
+    // neither index comparison above matches.
+    return 0;
+}
+
+inline float compute_back_lambda(
+    int64_t dest, int64_t src, float scale_factor, int64_t Hin, int64_t Hout, bool align_corners)
+{
+    if(Hin == Hout)
+    {
+        return 1;
+    }
+    int64_t index0;
+    int64_t index1;
+    float lambda0;
+    float lambda1;
+    compute_source_index_and_lambda(
+        dest, scale_factor, Hin, Hout, align_corners, &index0, &index1, &lambda0, &lambda1);
+    return get_back_lambda(src, index0, index1, lambda0, lambda1);
+}
+
+template <class T>
+void cpu_interpolate_linear_forward(const tensor<T> input,
+                                    tensor<T>& output,
+                                    const size_t nelems,
+                                    const tensor<float> scale_factors,
+                                    const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<3>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<3>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<3>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+
+        int64_t Hin  = input_tv.size[2];
+        int64_t Hout = output_tv.size[2];
+        if(Hin == Hout || Hout == 1)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        scale_factor_h = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        int64_t hin_index0;
+        int64_t hin_index1;
+        float lambda1;
+        float lambda0;
+        compute_source_index_and_lambda(h,
+                                        scale_factor_h,
+                                        Hin,
+                                        Hout,
+                                        align_corners,
+                                        &hin_index0,
+                                        &hin_index1,
+                                        &lambda0,
+                                        &lambda1);
+
+        tensor_layout_t<3> input_layout0(n, c, hin_index0);
+        tensor_layout_t<3> input_layout1(n, c, hin_index1);
+
+        float input0 = input[input_tv.get_tensor_view_idx(input_layout0)];
+        float input1 = input[input_tv.get_tensor_view_idx(input_layout1)];
+
+        output[output_tv.get_tensor_view_idx(tensor_layout)] =
+            static_cast<T>(input0 * lambda0 + input1 * lambda1);
+    }
+}
+
+template <class T>
+void cpu_interpolate_linear_backward(tensor<T>& input_grad,
+                                     const tensor<T> output_grad,
+                                     const size_t nelems,
+                                     const tensor<float> scale_factors,
+                                     const bool align_corners)
+{
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<3>(output_grad.desc);
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<3>(input_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<3>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+
+        int64_t Hin  = input_grad_tv.size[2];
+        int64_t Hout = output_grad_tv.size[2];
+
+        if(Hin == Hout)
+        {
+            input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+                output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        int64_t from, to;
+        compute_linear_back_index_from_to(h, Hin, Hout, scale_factor, align_corners, &from, &to);
+
+        float output = 0;
+        for(int64_t i = from; i < to; i++)
+        {
+            tensor_layout_t<3> output_layout(n, c, i);
+            output +=
+                static_cast<float>(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                compute_back_lambda(i, h, scale_factor, Hin, Hout, align_corners);
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(output);
+    }
+}
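+
+// Note on the backward pass above: it is the transpose of the forward stencil.
+// For each input index h it gathers the output positions [from, to) whose
+// forward interpolation read from h, weighting each by the same lambda the
+// forward pass used (recomputed via compute_back_lambda).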
+
+template <class T>
+void cpu_interpolate_bilinear_forward(const tensor<T> input,
+                                      tensor<T>& output,
+                                      const size_t nelems,
+                                      const tensor<float> scale_factors,
+                                      const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        int64_t Hin  = input_tv.size[2];
+        int64_t Hout = output_tv.size[2];
+        int64_t Win  = input_tv.size[3];
+        int64_t Wout = output_tv.size[3];
+
+        if(Hin == Hout && Win == Wout)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        int64_t hin_index0 = h;
+        int64_t hin_index1 = h;
+        float hlambda0     = 1;
+        float hlambda1     = 0;
+        if(Hin != Hout && Hout != 1)
+        {
+            float scale_factor_h = scale_factors[0];
+            float scale_factor_h_ =
+                compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+            compute_source_index_and_lambda(h,
+                                            scale_factor_h_,
+                                            Hin,
+                                            Hout,
+                                            align_corners,
+                                            &hin_index0,
+                                            &hin_index1,
+                                            &hlambda0,
+                                            &hlambda1);
+        }
+
+        int64_t win_index0 = w;
+        int64_t win_index1 = w;
+        float wlambda0     = 1;
+        float wlambda1     = 0;
+        if(Win != Wout && Wout != 1)
+        {
+            float scale_factor_w = scale_factors[1];
+            float scale_factor_w_ =
+                compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+            compute_source_index_and_lambda(w,
+                                            scale_factor_w_,
+                                            Win,
+                                            Wout,
+                                            align_corners,
+                                            &win_index0,
+                                            &win_index1,
+                                            &wlambda0,
+                                            &wlambda1);
+        }
+
+        tensor_layout_t<4> input_layout00(n, c, hin_index0, win_index0);
+        tensor_layout_t<4> input_layout01(n, c, hin_index0, win_index1);
+        tensor_layout_t<4> input_layout10(n, c, hin_index1, win_index0);
+        tensor_layout_t<4> input_layout11(n, c, hin_index1, win_index1);
+
+        output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(
+            (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout00)]) * wlambda0 +
+             static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout01)]) * wlambda1) *
+                hlambda0 +
+            (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout10)]) * wlambda0 +
+             static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout11)]) * wlambda1) *
+                hlambda1);
+    }
+}
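+
+// The four bilinear weights above form a partition of unity:
+//   (wlambda0 + wlambda1) * (hlambda0 + hlambda1) = 1 * 1 = 1,
+// so a constant input reproduces itself exactly for any scale factor.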
+
+template <class T>
+void cpu_interpolate_bilinear_backward(tensor<T>& input_grad,
+                                       const tensor<T> output_grad,
+                                       const size_t nelems,
+                                       const tensor<float> scale_factors,
+                                       const bool align_corners)
+{
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output_grad.desc);
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        int64_t Hin  = input_grad_tv.size[2];
+        int64_t Hout = output_grad_tv.size[2];
+        int64_t Win  = input_grad_tv.size[3];
+        int64_t Wout = output_grad_tv.size[3];
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        float scale_factor_w = scale_factors[1];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+
+        int64_t h_from, h_to;
+        if(Hin == Hout)
+        {
+            h_from = h;
+            h_to   = h + 1;
+        }
+        else
+        {
+            compute_linear_back_index_from_to(
+                h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to);
+        }
+        int64_t w_from, w_to;
+        if(Win == Wout)
+        {
+            w_from = w;
+            w_to   = w + 1;
+        }
+        else
+        {
+            compute_linear_back_index_from_to(
+                w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to);
+        }
+
+        float output = 0;
+        for(int64_t i = h_from; i < h_to; i++)
+        {
+            float h_lambda = compute_back_lambda(i, h, scale_factor_h_, Hin, Hout, align_corners);
+            if(h_lambda == 0.)
+                continue;
+            for(int64_t j = w_from; j < w_to; j++)
+            {
+                float w_lambda =
+                    compute_back_lambda(j, w, scale_factor_w_, Win, Wout, align_corners);
+
+                tensor_layout_t<4> output_layout(n, c, i, j);
+                output += static_cast<float>(
+                              output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                          h_lambda * w_lambda;
+            }
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(output);
+    }
+}
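+
+// The h_lambda == 0 early-continue above prunes the inner w-loop for output
+// rows that cannot contribute; see get_back_lambda for when a zero weight
+// occurs.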
+
+template <class T>
+void cpu_interpolate_trilinear_forward(const tensor<T> input,
+                                       tensor<T>& output,
+                                       const size_t nelems,
+                                       const tensor<float> scale_factors,
+                                       const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t d = tensor_layout.layout[2];
+        int64_t h = tensor_layout.layout[3];
+        int64_t w = tensor_layout.layout[4];
+
+        int64_t Din  = input_tv.size[2];
+        int64_t Dout = output_tv.size[2];
+        int64_t Hin  = input_tv.size[3];
+        int64_t Hout = output_tv.size[3];
+        int64_t Win  = input_tv.size[4];
+        int64_t Wout = output_tv.size[4];
+
+        if(Hin == Hout && Win == Wout && Din == Dout)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        int64_t din_index0 = d;
+        int64_t din_index1 = d;
+        float dlambda0     = 1;
+        float dlambda1     = 0;
+        if(Din != Dout && Dout != 1)
+        {
+            float scale_factor_d = scale_factors[0];
+            float scale_factor_d_ =
+                compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners);
+            compute_source_index_and_lambda(d,
+                                            scale_factor_d_,
+                                            Din,
+                                            Dout,
+                                            align_corners,
+                                            &din_index0,
+                                            &din_index1,
+                                            &dlambda0,
+                                            &dlambda1);
+        }
+
+        int64_t hin_index0 = h;
+        int64_t hin_index1 = h;
+        float hlambda0     = 1;
+        float hlambda1     = 0;
+        if(Hin != Hout && Hout != 1)
+        {
+            float scale_factor_h = scale_factors[1];
+            float scale_factor_h_ =
+                compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+            compute_source_index_and_lambda(h,
+                                            scale_factor_h_,
+                                            Hin,
+                                            Hout,
+                                            align_corners,
+                                            &hin_index0,
+                                            &hin_index1,
+                                            &hlambda0,
+                                            &hlambda1);
+        }
+
+        int64_t win_index0 = w;
+        int64_t win_index1 = w;
+        float wlambda0     = 1;
+        float wlambda1     = 0;
+        if(Win != Wout && Wout != 1)
+        {
+            float scale_factor_w = scale_factors[2];
+            float scale_factor_w_ =
+                compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+            compute_source_index_and_lambda(w,
+                                            scale_factor_w_,
+                                            Win,
+                                            Wout,
+                                            align_corners,
+                                            &win_index0,
+                                            &win_index1,
+                                            &wlambda0,
+                                            &wlambda1);
+        }
+
+        tensor_layout_t<5> input_layout000(n, c, din_index0, hin_index0, win_index0);
+        tensor_layout_t<5> input_layout001(n, c, din_index0, hin_index0, win_index1);
+        tensor_layout_t<5> input_layout010(n, c, din_index0, hin_index1, win_index0);
+        tensor_layout_t<5> input_layout011(n, c, din_index0, hin_index1, win_index1);
+        tensor_layout_t<5> input_layout100(n, c, din_index1, hin_index0, win_index0);
+        tensor_layout_t<5> input_layout101(n, c, din_index1, hin_index0, win_index1);
+        tensor_layout_t<5> input_layout110(n, c, din_index1, hin_index1, win_index0);
+        tensor_layout_t<5> input_layout111(n, c, din_index1, hin_index1, win_index1);
+
+        // Nested lerp: along W, then H, then D.
+        output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(
+            ((static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout000)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout001)]) * wlambda1) *
+                 hlambda0 +
+             (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout010)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout011)]) * wlambda1) *
+                 hlambda1) *
+                dlambda0 +
+            ((static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout100)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout101)]) * wlambda1) *
+                 hlambda0 +
+             (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout110)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout111)]) * wlambda1) *
+                 hlambda1) *
+                dlambda1);
+    }
+}
+
+template <class T>
+void cpu_interpolate_trilinear_backward(tensor<T>& input_grad,
+                                        const tensor<T> output_grad,
+                                        const size_t nelems,
+                                        const tensor<float> scale_factors,
+                                        const bool align_corners)
+{
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output_grad.desc);
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t d = tensor_layout.layout[2];
+        int64_t h = tensor_layout.layout[3];
+        int64_t w = tensor_layout.layout[4];
+
+        int64_t Din  = input_grad_tv.size[2];
+        int64_t Dout = output_grad_tv.size[2];
+        int64_t Hin  = input_grad_tv.size[3];
+        int64_t Hout = output_grad_tv.size[3];
+        int64_t Win  = input_grad_tv.size[4];
+        int64_t Wout = output_grad_tv.size[4];
+
+        float scale_factor_d = scale_factors[0];
+        float scale_factor_d_ =
+            compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners);
+
+        float scale_factor_h = scale_factors[1];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        float scale_factor_w = scale_factors[2];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+
+        int64_t d_from, d_to, h_from, h_to, w_from, w_to;
+        compute_linear_back_index_from_to(
+            d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to);
+        compute_linear_back_index_from_to(
+            h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to);
+        compute_linear_back_index_from_to(
+            w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to);
+
+        float output = 0;
+        for(int64_t i = d_from; i < d_to; i++)
+        {
+            float d_lambda = compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners);
+            for(int64_t j = h_from; j < h_to; j++)
+            {
+                float h_lambda =
+                    compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners);
+                for(int64_t k = w_from; k < w_to; k++)
+                {
+                    float w_lambda =
+                        compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners);
+
+                    tensor_layout_t<5> output_layout(n, c, i, j, k);
+                    output += static_cast<float>(
+                                  output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                              d_lambda * h_lambda * w_lambda;
+                }
+            }
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(output);
+    }
+}
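+
+// Nearest-neighbour helpers. Note the convention switch relative to the linear
+// helpers above: compute_scales_value returns the input/output step (1 / scale),
+// so nearest_idx multiplies an output index by it directly.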
+
+inline float compute_scales_value(float scale, int64_t input_size, int64_t output_size)
+{
+    return (scale == 0.f) ? (static_cast<float>(input_size) / output_size) : (1.0f / scale);
+}
+
+inline int64_t
+nearest_idx(int64_t output_index, int64_t input_size, int64_t output_size, float scales)
+{
+    if(output_size == input_size)
+    {
+        return output_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return output_index / 2;
+    }
+    else
+    {
+        float scale = compute_scales_value(scales, input_size, output_size);
+        return std::min(static_cast<int64_t>(output_index * scale), input_size);
+    }
+}
+
+template <class T>
+void cpu_nearest_forward(const tensor<T> input,
+                         tensor<T>& output,
+                         const size_t nelems,
+                         const tensor<float> scale_factors)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t d = tensor_layout.layout[2];
+        int64_t h = tensor_layout.layout[3];
+        int64_t w = tensor_layout.layout[4];
+
+        int64_t Dout = output_tv.size[2];
+        int64_t Hout = output_tv.size[3];
+        int64_t Wout = output_tv.size[4];
+        int64_t Din  = input_tv.size[2];
+        int64_t Hin  = input_tv.size[3];
+        int64_t Win  = input_tv.size[4];
+
+        int64_t x = nearest_idx(d, Din, Dout, scale_factors[0]);
+        int64_t y = nearest_idx(h, Hin, Hout, scale_factors[1]);
+        int64_t z = nearest_idx(w, Win, Wout, scale_factors[2]);
+
+        tensor_layout_t<5> input_layout(n, c, x, y, z);
+        output[output_tv.get_tensor_view_idx(tensor_layout)] =
+            input[input_tv.get_tensor_view_idx(input_layout)];
+    }
+}
+
+inline int64_t
+nearest_idx_back(int64_t input_index, int64_t input_size, int64_t output_size, float scales)
+{
+    if(output_size == input_size)
+    {
+        return input_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return input_index * 2;
+    }
+    else
+    {
+        float scale = compute_scales_value(scales, input_size, output_size);
+        return std::min(static_cast<int64_t>(std::ceil(input_index / scale)), output_size);
+    }
+}
+
+template <class T>
+void cpu_nearest_backward(tensor<T>& input_grad,
+                          const tensor<T> output_grad,
+                          const size_t nelems,
+                          const tensor<float> scale_factors)
+{
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input_grad.desc);
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t x = tensor_layout.layout[2];
+        int64_t y = tensor_layout.layout[3];
+        int64_t z = tensor_layout.layout[4];
+
+        int64_t Dout = output_grad_tv.size[2];
+        int64_t Hout = output_grad_tv.size[3];
+        int64_t Wout = output_grad_tv.size[4];
+        int64_t Din  = input_grad_tv.size[2];
+        int64_t Hin  = input_grad_tv.size[3];
+        int64_t Win  = input_grad_tv.size[4];
+
+        float scale_factor_d = scale_factors[0];
+        float scale_factor_h = scale_factors[1];
+        float scale_factor_w = scale_factors[2];
+
+        int64_t dstart = nearest_idx_back(x, Din, Dout, scale_factor_d);
+        int64_t dlimit = nearest_idx_back(x + 1, Din, Dout, scale_factor_d);
+        int64_t hstart = nearest_idx_back(y, Hin, Hout, scale_factor_h);
+        int64_t hlimit = nearest_idx_back(y + 1, Hin, Hout, scale_factor_h);
+        int64_t wstart = nearest_idx_back(z, Win, Wout, scale_factor_w);
+        int64_t wlimit = nearest_idx_back(z + 1, Win, Wout, scale_factor_w);
+
+        float grad = 0.f;
+        for(int64_t d = dstart; d < dlimit; d++)
+        {
+            for(int64_t h = hstart; h < hlimit; h++)
+            {
+                for(int64_t w = wstart; w < wlimit; w++)
+                {
+                    tensor_layout_t<5> output_grad_layout(n, c, d, h, w);
+                    grad += static_cast<float>(
+                        output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]);
+                }
+            }
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(grad);
+    }
+}
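+
+// Worked example for the gather ranges above (Din = 5, Dout = 10, scales = 0):
+//   input x = 2 accumulates output indices [4, 6), i.e. exactly the outputs
+//   that nearest_idx maps back to input index 2 in the forward pass.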
+
+inline float
+bicubic_idx(int64_t output_index, int64_t output_size, float scale_factor, bool align_corners)
+{
+    if(output_size == 1)
+    {
+        if(align_corners)
+        {
+            return 0;
+        }
+        return -0.5f;
+    }
+    return get_src_index(output_index, scale_factor, align_corners);
+}
+
+inline float cubic_convolution1(float x, float A) { return ((A + 2) * x - (A + 3)) * x * x + 1; }
+
+inline float cubic_convolution2(float x, float A)
+{
+    return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+inline void get_cubic_upsampling_coefficients(float coeffs[4], float t)
+{
+    float A = -0.75f;
+
+    float x1  = t;
+    coeffs[0] = cubic_convolution2(x1 + 1.0f, A);
+    coeffs[1] = cubic_convolution1(x1, A);
+
+    float x2  = 1.0f - t;
+    coeffs[2] = cubic_convolution1(x2, A);
+    coeffs[3] = cubic_convolution2(x2 + 1.0f, A);
+}
+
+inline float cubic_interp1d(float x0, float x1, float x2, float x3, float t)
+{
+    float coeffs[4];
+    get_cubic_upsampling_coefficients(coeffs, t);
+
+    return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
+
+inline int64_t bound(int64_t p, int64_t max_size)
+{
+    return std::max(std::min(p, max_size - 1), 0L);
+}
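+
+// The two polynomials above are the cubic convolution (Keys) kernel with
+// A = -0.75: cubic_convolution1 covers |x| <= 1 and cubic_convolution2 covers
+// 1 < |x| < 2. For any t the four weights sum to 1, e.g.:
+//   float c[4];
+//   get_cubic_upsampling_coefficients(c, 0.3f);
+//   // c[0] + c[1] + c[2] + c[3] == 1.0f (up to rounding)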
+
+template <class T>
+void cpu_bicubic_forward(const tensor<T> input,
+                         tensor<T>& output,
+                         const size_t nelems,
+                         const tensor<float> scale_factors,
+                         const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        int64_t Hin  = input_tv.size[2];
+        int64_t Win  = input_tv.size[3];
+        int64_t Hout = output_tv.size[2];
+        int64_t Wout = output_tv.size[3];
+        if(Hin == Hout && Win == Wout)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+        float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+        int64_t in_y = static_cast<int64_t>(std::floor(real_y));
+        float t_y    = real_y - in_y;
+
+        float scale_factor_w = scale_factors[1];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+        float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+        int64_t in_x = static_cast<int64_t>(std::floor(real_x));
+        float t_x    = real_x - in_x;
+
+        float coefficients[4];
+#pragma unroll
+        for(int k = 0; k < 4; k++)
+        {
+            int64_t y = bound(in_y - 1 + k, Hin);
+            tensor_layout_t<4> input_layout0(n, c, y, bound(in_x - 1, Win));
+            tensor_layout_t<4> input_layout1(n, c, y, bound(in_x, Win));
+            tensor_layout_t<4> input_layout2(n, c, y, bound(in_x + 1, Win));
+            tensor_layout_t<4> input_layout3(n, c, y, bound(in_x + 2, Win));
+
+            coefficients[k] = cubic_interp1d(
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout0)]),
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout1)]),
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout2)]),
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout3)]),
+                t_x);
+        }
+        output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(cubic_interp1d(
+            coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y));
+    }
+}
+
+template <class T>
+void cpu_bicubic_backward(tensor<T>& input_grad,
+                          const tensor<T> output_grad,
+                          const size_t nelems,
+                          const tensor<float> scale_factors,
+                          const bool align_corners)
+{
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input_grad.desc);
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output_grad.desc);
+
+    std::vector<float> workspace;
+    workspace.resize(nelems, 0.f);
+
+    int64_t Hin  = input_grad_tv.size[2];
+    int64_t Hout = output_grad_tv.size[2];
+    int64_t Win  = input_grad_tv.size[3];
+    int64_t Wout = output_grad_tv.size[3];
+
+    size_t out_elems = output_grad.desc.GetElementSize();
+
+    for(int64_t gid = 0; gid < out_elems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(output_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        if(Hin == Hout && Win == Wout)
+        {
+            input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+                output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+        float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+        int64_t in_y = static_cast<int64_t>(std::floor(real_y));
+        float t_y    = real_y - static_cast<float>(in_y);
+
+        float scale_factor_w = scale_factors[1];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+        float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+        int64_t in_x = static_cast<int64_t>(std::floor(real_x));
+        float t_x    = real_x - static_cast<float>(in_x);
+
+        float y_coeffs[4];
+        float x_coeffs[4];
+        get_cubic_upsampling_coefficients(y_coeffs, t_y);
+        get_cubic_upsampling_coefficients(x_coeffs, t_x);
+
+        float out_value =
+            static_cast<float>(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]);
+
+#pragma unroll
+        for(int i = 0; i < 4; i++)
+        {
+            int64_t input_h = bound(in_y - 1 + i, Hin);
+#pragma unroll
+            for(int j = 0; j < 4; j++)
+            {
+                int64_t input_w = bound(in_x - 1 + j, Win);
+                tensor_layout_t<4> in_grad_layout(n, c, input_h, input_w);
+                workspace[input_grad_tv.get_tensor_view_idx(in_grad_layout)] +=
+                    out_value * y_coeffs[i] * x_coeffs[j];
+            }
+        }
+    }
+
+    if(!(Hin == Hout && Win == Wout))
+    {
+        for(int64_t gid = 0; gid < nelems; ++gid)
+        {
+            auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid);
+            input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+                static_cast<T>(workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)]);
+        }
+    }
+}
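+
+// The float workspace above is deliberate: bicubic backward scatters 16
+// weighted contributions per output element, and accumulating in the tensor's
+// native type (half/bfloat16) would lose precision. Gradients are summed in
+// float and cast to T once at the end.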
+
+template <class T>
+void cpu_interpolate_forward(const tensor<T> input,
+                             tensor<T>& output,
+                             const size_t nelems,
+                             const tensor<float> scale_factors,
+                             const bool align_corners,
+                             const miopenInterpolateMode_t mode)
+{
+    if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+    {
+        cpu_nearest_forward(input, output, nelems, scale_factors);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR)
+    {
+        cpu_interpolate_linear_forward(input, output, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR)
+    {
+        cpu_interpolate_bilinear_forward(input, output, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR)
+    {
+        cpu_interpolate_trilinear_forward(input, output, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+    {
+        cpu_bicubic_forward(input, output, nelems, scale_factors, align_corners);
+    }
+}
+
+template <class T>
+void cpu_interpolate_backward(tensor<T>& input_grad,
+                              const tensor<T> output_grad,
+                              const size_t nelems,
+                              const tensor<float> scale_factors,
+                              const bool align_corners,
+                              const miopenInterpolateMode_t mode)
+{
+    if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+    {
+        cpu_nearest_backward(input_grad, output_grad, nelems, scale_factors);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR)
+    {
+        cpu_interpolate_linear_backward(
+            input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR)
+    {
+        cpu_interpolate_bilinear_backward(
+            input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR)
+    {
+        cpu_interpolate_trilinear_backward(
+            input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+    {
+        cpu_bicubic_backward(input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+}
+
+#endif // GUARD_CPU_INTERPOLATE_HPP
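For reference, the CPU checkers above are driven like this. A sketch only: the values are illustrative, and the variadic `tensor<T>` constructor is assumed to match the one used by the fixtures below.

    // Illustrative call: 2x bilinear upsample of an NCHW tensor on the CPU checker.
    tensor<float> in{2, 3, 8, 8};
    tensor<float> out{2, 3, 16, 16};
    tensor<float> scales{2};            // one scale per spatial dim (H, W)
    scales[0] = scales[1] = 2.f;
    cpu_interpolate_forward(in, out, out.desc.GetElementSize(), scales,
                            /*align_corners=*/false, MIOPEN_INTERPOLATE_MODE_BILINEAR);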
diff --git a/test/gtest/interpolate.cpp b/test/gtest/interpolate.cpp
new file mode 100644
index 0000000000..4502da88c3
--- /dev/null
+++ b/test/gtest/interpolate.cpp
@@ -0,0 +1,175 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <miopen/env.hpp>
+#include "interpolate.hpp"
+
+MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+
+namespace interpolate {
+
+std::string GetFloatArg()
+{
+    const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG));
+    if(tmp.empty())
+    {
+        return "";
+    }
+    return tmp;
+}
+
+struct GPU_Interpolate_fwd_FP32 : InterpolateTestFwd<float>
+{
+};
+
+struct GPU_Interpolate_fwd_FP16 : InterpolateTestFwd<half_float::half>
+{
+};
+
+struct GPU_Interpolate_fwd_BFP16 : InterpolateTestFwd<bfloat16>
+{
+};
+
+struct GPU_Interpolate_bwd_FP32 : InterpolateTestBwd<float>
+{
+};
+
+struct GPU_Interpolate_bwd_FP16 : InterpolateTestBwd<half_float::half>
+{
+};
+
+struct GPU_Interpolate_bwd_BFP16 : InterpolateTestBwd<bfloat16>
+{
+};
+
+} // namespace interpolate
+using namespace interpolate;
+
+// FORWARD TEST
+TEST_P(GPU_Interpolate_fwd_FP32, InterpolateTest)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--float") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_fwd_FP16, InterpolateTest)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--half") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_fwd_BFP16, InterpolateTest)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--bfloat16") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_fwd_FP32,
+                         testing::ValuesIn(InterpolateTestFwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_fwd_FP16,
+                         testing::ValuesIn(InterpolateTestFwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_fwd_BFP16,
+                         testing::ValuesIn(InterpolateTestFwdConfigs()));
+
+// BACKWARD TEST
+TEST_P(GPU_Interpolate_bwd_FP32, InterpolateTestBwd)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--float") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_bwd_FP16, InterpolateTestBwd)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--half") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_bwd_BFP16, InterpolateTestBwd)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--bfloat16") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_bwd_FP32,
+                         testing::ValuesIn(InterpolateTestBwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_bwd_FP16,
+                         testing::ValuesIn(InterpolateTestBwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_bwd_BFP16,
+                         testing::ValuesIn(InterpolateTestBwdConfigs()));
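The dtype gating in these tests composes as follows: with MIOPEN_TEST_ALL unset every suite runs; with it enabled, only the suite whose dtype matches MIOPEN_TEST_FLOAT_ARG runs and the rest GTEST_SKIP():

    MIOPEN_TEST_ALL unset                                -> all six suites run
    MIOPEN_TEST_ALL=1  MIOPEN_TEST_FLOAT_ARG=--float     -> FP32 suites only
    MIOPEN_TEST_ALL=1  MIOPEN_TEST_FLOAT_ARG=--half      -> FP16 suites only
    MIOPEN_TEST_ALL=1  MIOPEN_TEST_FLOAT_ARG=--bfloat16  -> BFP16 suites only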
diff --git a/test/gtest/interpolate.hpp b/test/gtest/interpolate.hpp
new file mode 100644
index 0000000000..a6f369507f
--- /dev/null
+++ b/test/gtest/interpolate.hpp
@@ -0,0 +1,390 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "../driver/tensor_driver.hpp"
+#include "cpu_interpolate.hpp"
+#include "get_handle.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/interpolate.hpp>
+#include <cmath>
+#include <limits>
+
+template <class T>
+inline std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    os << '{';
+    for(int i = 0; i < v.size(); ++i)
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    os << '}';
+    return os;
+}
+
+struct InterpolateTestCase
+{
+    std::vector<size_t> input;
+    std::vector<size_t> size;
+    std::vector<float> scale_factors;
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+
+    friend std::ostream& operator<<(std::ostream& os, const InterpolateTestCase& tc)
+    {
+        return os << " input:" << tc.input << " size:" << tc.size
+                  << " scale_factors:" << tc.scale_factors << " mode:" << tc.mode
+                  << " align_corners:" << tc.align_corners;
+    }
+
+    std::vector<size_t> GetInput() const { return input; }
+};
+
+inline std::vector<InterpolateTestCase> InterpolateTestFwdConfigs()
+{
+    return {
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, true},
+        {{16, 256, 20, 20}, {40, 40}, {2, 2}, MIOPEN_INTERPOLATE_MODE_BICUBIC, false},
+        {{16, 256, 20, 20}, {40, 40}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BICUBIC, true},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, true},
+    };
+}
+
+inline std::vector<InterpolateTestCase> InterpolateTestBwdConfigs()
+{
+    return {
+        {{16, 256, 1, 1, 1}, {32, 32, 32}, {32, 32, 32}, MIOPEN_INTERPOLATE_MODE_TRILINEAR, false},
+        {{16, 256, 1, 1, 1}, {32, 32, 32}, {0, 0, 0}, MIOPEN_INTERPOLATE_MODE_TRILINEAR, true},
+        {{16, 256, 1, 1, 1}, {32, 32, 32}, {0, 0, 0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, true},
+        {{16, 256, 20, 20}, {40, 40}, {2, 2}, MIOPEN_INTERPOLATE_MODE_BICUBIC, false},
+        {{16, 256, 20, 20}, {40, 40}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BICUBIC, true},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, true},
+    };
+}
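+
+// How a config expands to an output shape (see SetUp below): a zero scale
+// factor defers to `size`, a non-zero one wins over it. For example:
+//   {{16, 256, 20, 20}, {40, 40}, {2, 2}, BICUBIC, false}
+//     -> out = {16, 256, ceil(20 * 2), ceil(20 * 2)} = {16, 256, 40, 40}
+//   {{16, 256, 1, 1}, {32, 32}, {0, 0}, BILINEAR, false}
+//     -> out = {16, 256, 32, 32}  (sizes taken verbatim)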
+
+inline std::vector<size_t> GetStrides(std::vector<size_t> input, bool contiguous)
+{
+    if(!contiguous)
+        std::swap(input.front(), input.back());
+    std::vector<size_t> strides(input.size());
+    strides.back() = 1;
+    for(int i = input.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * input[i + 1];
+    if(!contiguous)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+// FORWARD TEST
+template <typename T = float>
+struct InterpolateTestFwd : public ::testing::TestWithParam<InterpolateTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle      = get_handle();
+        interpolate_config = GetParam();
+
+        auto in_dim   = interpolate_config.GetInput();
+        auto size     = interpolate_config.size;
+        mode          = interpolate_config.mode;
+        align_corners = interpolate_config.align_corners;
+
+        if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            scale_factors = tensor<float>{size.size()};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+        }
+        else
+        {
+            scale_factors = tensor<float>{3};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+            for(int i = size.size(); i < 3; i++)
+                scale_factors[i] = 0;
+        }
+
+        auto out_dim = std::vector<size_t>({in_dim[0], in_dim[1]});
+        for(int i = 0; i < size.size(); i++)
+        {
+            if(scale_factors[i] != 0)
+                out_dim.push_back(static_cast<size_t>(ceil(in_dim[i + 2] * scale_factors[i])));
+            else
+            {
+                out_dim.push_back(size[i]);
+            }
+        }
+
+        auto gen_input_value = [](auto...) {
+            return prng::gen_A_to_B(static_cast<T>(-5.0f), static_cast<T>(1.0f));
+        };
+
+        auto in_strides = GetStrides(in_dim, true);
+        input           = tensor<T>{in_dim, in_strides}.generate(gen_input_value);
+
+        auto out_strides = GetStrides(out_dim, true);
+        output           = tensor<T>{out_dim, out_strides};
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{out_dim, out_strides};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        input_dev         = handle.Write(input.data);
+        output_dev        = handle.Write(output.data);
+        scale_factors_dev = handle.Write(scale_factors.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        size_t nelems = output.desc.GetElementSize();
+
+        cpu_interpolate_forward(input, ref_output, nelems, scale_factors, align_corners, mode);
+
+        if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            status = miopen::InterpolateNearestForward(handle,
+                                                       input.desc,
+                                                       input_dev.get(),
+                                                       output.desc,
+                                                       output_dev.get(),
+                                                       scale_factors.desc,
+                                                       scale_factors_dev.get(),
+                                                       mode);
+        }
+        else
+        {
+            status = miopen::InterpolateLinearCubicForward(handle,
+                                                           input.desc,
+                                                           input_dev.get(),
+                                                           output.desc,
+                                                           output_dev.get(),
+                                                           scale_factors.desc,
+                                                           scale_factors_dev.get(),
+                                                           mode,
+                                                           align_corners);
+        }
+        fflush(stdout);
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        double threshold = std::numeric_limits<T>::epsilon();
+
+        auto error = miopen::rms_range(ref_output, output);
+
+        ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        EXPECT_LT(error, threshold * 10) << "Error output beyond tolerance Error:" << error
+                                         << ", Thresholdx10: " << threshold * 10;
+    }
+    InterpolateTestCase interpolate_config;
+
+    tensor<T> input;
+    tensor<T> output;
+    tensor<T> ref_output;
+    tensor<float> scale_factors;
+
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr scale_factors_dev;
+};
+
+// BACKWARD TEST
+template <typename T = float>
+struct InterpolateTestBwd : public ::testing::TestWithParam<InterpolateTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle      = get_handle();
+        interpolate_config = GetParam();
+
+        auto in_dim      = interpolate_config.GetInput();
+        auto in_grad_dim = in_dim;
+        auto size        = interpolate_config.size;
+        mode             = interpolate_config.mode;
+        align_corners    = interpolate_config.align_corners;
+
+        if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            scale_factors = tensor<float>{size.size()};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+        }
+        else
+        {
+            scale_factors = tensor<float>{3};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+            for(int i = size.size(); i < 3; i++)
+                scale_factors[i] = 0;
+        }
+
+        auto out_grad_dim = std::vector<size_t>({in_dim[0], in_dim[1]});
+        for(int i = 0; i < size.size(); i++)
+        {
+            if(scale_factors[i] != 0)
+                out_grad_dim.push_back(
+                    static_cast<size_t>(ceil(in_dim[i + 2] * scale_factors[i])));
+            else
+                out_grad_dim.push_back(size[i]);
+        }
+
+        auto gen_output_grad_value = [](auto...) {
+            return prng::gen_A_to_B(static_cast<T>(-5.0f), static_cast<T>(5.0f));
+        };
+
+        auto out_grad_strides = GetStrides(out_grad_dim, true);
+        output_grad = tensor<T>{out_grad_dim, out_grad_strides}.generate(gen_output_grad_value);
+
+        auto in_strides = GetStrides(in_grad_dim, true);
+        input_grad      = tensor<T>{in_grad_dim, in_strides};
+        std::fill(input_grad.begin(), input_grad.end(), static_cast<T>(0.f));
+
+        ref_input_grad = tensor<T>{in_grad_dim, in_strides};
+        std::fill(ref_input_grad.begin(), ref_input_grad.end(), static_cast<T>(0.f));
+
+        if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+        {
+            ws_sizeInBytes = miopen::GetInterpolateBicubicBackwardWorkspaceSize(
+                handle, output_grad.desc, input_grad.desc, scale_factors.desc, mode, align_corners);
+            if(ws_sizeInBytes == static_cast<size_t>(-1))
+                GTEST_SKIP();
+
+            workspace = tensor<float>{in_grad_dim, in_strides};
+            std::fill(workspace.begin(), workspace.end(), 0.f);
+
+            workspace_dev = handle.Write(workspace.data);
+        }
+
+        output_grad_dev   = handle.Write(output_grad.data);
+        input_grad_dev    = handle.Write(input_grad.data);
+        scale_factors_dev = handle.Write(scale_factors.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        size_t nelems = input_grad.desc.GetElementSize();
+
+        cpu_interpolate_backward(
+            ref_input_grad, output_grad, nelems, scale_factors, align_corners, mode);
+
+        if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            status = miopen::InterpolateNearestBackward(handle,
+                                                        input_grad.desc,
+                                                        input_grad_dev.get(),
+                                                        output_grad.desc,
+                                                        output_grad_dev.get(),
+                                                        scale_factors.desc,
+                                                        scale_factors_dev.get(),
+                                                        mode);
+        }
+        else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+        {
+            status = miopen::InterpolateBicubicBackward(handle,
+                                                        workspace_dev.get(),
+                                                        ws_sizeInBytes,
+                                                        input_grad.desc,
+                                                        input_grad_dev.get(),
+                                                        output_grad.desc,
+                                                        output_grad_dev.get(),
+                                                        scale_factors.desc,
+                                                        scale_factors_dev.get(),
+                                                        mode,
+                                                        align_corners);
+        }
+        else
+        {
+            status = miopen::InterpolateLinearBackward(handle,
+                                                       input_grad.desc,
+                                                       input_grad_dev.get(),
+                                                       output_grad.desc,
+                                                       output_grad_dev.get(),
+                                                       scale_factors.desc,
+                                                       scale_factors_dev.get(),
+                                                       mode,
+                                                       align_corners);
+        }
+        fflush(stdout);
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        input_grad.data = handle.Read<T>(input_grad_dev, input_grad.data.size());
+    }
+
+    void Verify()
+    {
+        double threshold = std::numeric_limits<T>::epsilon();
+
+        auto error = miopen::rms_range(ref_input_grad, input_grad);
+
+        ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad));
+        EXPECT_LT(error, threshold * 10) << "Error input grad beyond tolerance Error:" << error
+                                         << ", Thresholdx10: " << threshold * 10;
+    }
+    InterpolateTestCase interpolate_config;
+
+    tensor<float> workspace;
+    tensor<T> input_grad;
+    tensor<T> output_grad;
+    tensor<T> ref_input_grad;
+    tensor<float> scale_factors;
+
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+
+    miopen::Allocator::ManageDataPtr input_grad_dev;
+    miopen::Allocator::ManageDataPtr output_grad_dev;
+    miopen::Allocator::ManageDataPtr scale_factors_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+
+    size_t ws_sizeInBytes;
+};