diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02bcb88622..5a74c95eaa 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -32,3 +32,4 @@ The MIOpen API library is structured as follows: * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`Argmax<./argmax>` (experimental) + * :doc:`Interpolate <../doxygen/html/group__interpolate>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 224e550fed..a824ecd45d 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -43,6 +43,7 @@ add_executable(MIOpenDriver dm_fusion.cpp dm_gemm.cpp dm_groupnorm.cpp + dm_interpolate.cpp dm_layernorm.cpp dm_lrn.cpp dm_pool.cpp diff --git a/driver/dm_interpolate.cpp b/driver/dm_interpolate.cpp new file mode 100644 index 0000000000..d3959a7415 --- /dev/null +++ b/driver/dm_interpolate.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#include "registry_driver_maker.hpp"
+#include "interpolate_driver.hpp"
+
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "interpolate")
+        return new InterpolateDriver<float, float>();
+    if(base_arg == "interpolatefp16")
+        return new InterpolateDriver<float16, float>();
+    if(base_arg == "interpolatebfp16")
+        return new InterpolateDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index 4cfc2b544e..40aa59cfa5 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -151,7 +151,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "pool[fp16], lrn[fp16], "
         "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
         "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
-        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16]\n");
+        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], interpolate[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -176,7 +176,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" &&
        arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
        arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
-       arg != "catfp16" && arg != "catbfp16" && arg != "--version")
+       arg != "catfp16" && arg != "catbfp16" && arg != "interpolate" && arg != "interpolatefp16" &&
+       arg != "interpolatebfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/interpolate_driver.hpp b/driver/interpolate_driver.hpp
new file mode 100644
index 0000000000..69d0bff864
--- /dev/null
+++ b/driver/interpolate_driver.hpp
@@ -0,0 +1,554 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_INTERPOLATE_DRIVER_HPP
+#define GUARD_MIOPEN_INTERPOLATE_DRIVER_HPP
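+
+// Illustrative command lines for this driver (the flags are defined in
+// AddCmdLineArgs() below; give either -S <output size> or -s <scale factors>
+// and leave the other at its -1 default):
+//   MIOpenDriver interpolate     -D 16,256,32,32 -s 2,2 -m 2   (bilinear, 2x upscale)
+//   MIOpenDriver interpolatefp16 -D 16,256,100 -S 512 -m 1     (linear, explicit output size)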
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloInterpolateHost.hpp"
+#include "random.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "util_driver.hpp"
+
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+inline std::vector<int> GetStrides(std::vector<int> lengths, int contiguous)
+{
+    if(contiguous != 0 && contiguous != 1)
+        std::cerr << "Error Tensor Contiguous should be 0 or 1" << std::endl;
+    if(contiguous == 0)
+        std::swap(lengths.front(), lengths.back());
+    std::vector<int> strides(lengths.size());
+    strides.back() = 1;
+    for(int i = lengths.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * lengths[i + 1];
+    if(contiguous == 0)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+template <typename Tgpu, typename Tref>
+class InterpolateDriver : public Driver
+{
+public:
+    InterpolateDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&outputGradDesc);
+        miopenCreateTensorDescriptor(&inputGradDesc);
+        miopenCreateTensorDescriptor(&scaleFactorsDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    template <typename T>
+    std::vector<T> GetTensorFromCmd(const char* param);
+    int GetandSetData() override;
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~InterpolateDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(outputGradDesc);
+        miopenDestroyTensorDescriptor(inputGradDesc);
+        miopenDestroyTensorDescriptor(scaleFactorsDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+    miopenTensorDescriptor_t outputGradDesc;
+    miopenTensorDescriptor_t inputGradDesc;
+    miopenTensorDescriptor_t scaleFactorsDesc;
+
+    std::unique_ptr<GPUMem> in_dev;
+    std::unique_ptr<GPUMem> out_dev;
+    std::unique_ptr<GPUMem> out_grad_dev;
+    std::unique_ptr<GPUMem> in_grad_dev;
+    std::unique_ptr<GPUMem> scale_factors_dev;
+    std::unique_ptr<GPUMem> workspace_dev;
+
+    std::vector<Tgpu> in;
+    std::vector<Tgpu> out;
+    std::vector<Tref> out_host;
+
+    std::vector<float> scale_factors;
+
+    std::vector<Tgpu> out_grad;
+    std::vector<Tgpu> in_grad;
+    std::vector<Tref> in_grad_host;
+    std::vector<float> workspace;
+
+    std::vector<int> in_len;
+    std::vector<int> size;
+    std::vector<float> config_scale_factors;
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+    size_t ws_sizeInBytes = 0;
+};
+
+template <typename Tgpu, typename Tref>
+int InterpolateDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+template <typename T>
+std::vector<T> InterpolateDriver<Tgpu, Tref>::GetTensorFromCmd(const char* param)
+{
+    std::string lengthsStr = inflags.GetValueStr(param);
+
+    std::vector<T> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        T len =
static_cast(std::stof(sliceStr)); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + T len = static_cast(std::stof(sliceStr)); + + lengths.push_back(len); + + return (lengths); +} + +template +int InterpolateDriver::GetandSetData() +{ + in_len = GetTensorFromCmd("input_dims"); + size = GetTensorFromCmd("size"); + config_scale_factors = GetTensorFromCmd("scale_factors"); + mode = static_cast(inflags.GetValueInt("mode")); + align_corners = static_cast(inflags.GetValueInt("align_corners")); + + if(config_scale_factors[0] == -1 && size[0] == -1) + { + config_scale_factors[0] = 1; + for(int i = 1; i < in_len.size() - 2; i++) + { + config_scale_factors.push_back(1); + } + } + + if(config_scale_factors[0] != -1) + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST) + { + for(int i = 0; i < in_len.size() - 2; i++) + { + scale_factors.push_back(config_scale_factors[i]); + } + } + else + { + for(int i = 0; i < in_len.size() - 2; i++) + { + scale_factors.push_back(config_scale_factors[i]); + } + for(int i = in_len.size() - 2; i < 3; i++) + { + scale_factors.push_back(0); + } + } + } + + auto out_len = std::vector({in_len[0], in_len[1]}); + if(size[0] != -1) + { + for(int i = 0; i < size.size(); i++) + { + if(size[i] == 0) + out_len.push_back(static_cast(ceil(in_len[i + 2] * scale_factors[i]))); + else + { + if(config_scale_factors[0] == -1) + { + scale_factors.push_back(static_cast(size[i]) / in_len[i + 2]); + } + else + { + scale_factors[i] = static_cast(size[i]) / in_len[i + 2]; + } + out_len.push_back(size[i]); + } + } + } + else + { + for(int i = 0; i < in_len.size() - 2; i++) + { + out_len.push_back(static_cast(ceil(in_len[i + 2] * scale_factors[i]))); + scale_factors[i] = static_cast(out_len[i + 2]) / in_len[i + 2]; + } + } + + auto in_strides = GetStrides(in_len, inflags.GetValueInt("contiguous")); + auto output_strides = GetStrides(out_len, 1); + + SetTensorNd(inputDesc, in_len, in_strides, data_type); + SetTensorNd(outputDesc, out_len, output_strides, data_type); + + std::vector scale_length = std::vector({scale_factors.size()}); + SetTensorNd(scaleFactorsDesc, scale_length, miopen_type{}); + + SetTensorNd(outputGradDesc, out_len, output_strides, data_type); + SetTensorNd(inputGradDesc, in_len, in_strides, data_type); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Interpolate (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "16,256,1", + "The dimensional lengths of the input tensor (>=3 and <=5 dimensions): N,C,D,H,W. " + "Example: 16,256,1.", + "string"); + inflags.AddInputFlag("size", + 'S', + "-1", + "Output Spatial Size: D,H,W. " + "Default: -1 - Use scale factors instead", + "string"); + inflags.AddInputFlag("scale_factors", + 's', + "-1", + "Multiplier for spatial size: factor_D,factor_H,factor_W. " + "Default: -1 - Use size instead", + "string"); + inflags.AddInputFlag("mode", + 'm', + "0", + "algorithm used for upsampling: 'nearest' | 'linear' | 'bilinear' | " + "'bicubic' | 'trilinear'. Default: 0 - 'nearest'", + "int"); + inflags.AddInputFlag("align_corners", + 'A', + "0", + "This only has an effect when mode is 'linear', 'bilinear', 'bicubic' or " + "'trilinear'. Default: False", + "int"); + inflags.AddInputFlag("contiguous", + 'c', + "1", + "Is input tensor contiguous? 
(Default=1 for contiguous tensor)", + "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(inputDesc); + size_t out_sz = GetTensorSize(outputDesc); + size_t scale_factors_sz = GetTensorSize(scaleFactorsDesc); + size_t out_grad_sz = GetTensorSize(outputGradDesc); + size_t in_grad_sz = GetTensorSize(inputGradDesc); + + if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + miopenGetInterpolateBackwardWorkspaceSize(GetHandle(), + outputGradDesc, + inputGradDesc, + scaleFactorsDesc, + mode, + align_corners, + &ws_sizeInBytes); + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + } + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + scale_factors_dev = std::unique_ptr(new GPUMem(ctx, scale_factors_sz, sizeof(float))); + out_grad_dev = std::unique_ptr(new GPUMem(ctx, out_grad_sz, sizeof(Tgpu))); + in_grad_dev = std::unique_ptr(new GPUMem(ctx, in_grad_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + in = std::vector(in_sz, static_cast(0)); + out = std::vector(out_sz, static_cast(0)); + out_host = std::vector(out_sz, static_cast(0)); + + out_grad = std::vector(out_grad_sz, static_cast(0)); + in_grad = std::vector(in_grad_sz, static_cast(0)); + in_grad_host = std::vector(in_grad_sz, static_cast(0)); + workspace = std::vector(ws_sizeInBytes / sizeof(float), static_cast(0)); + + int status; + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(static_cast(-5.0f), static_cast(1.0f)); + } + status = in_dev->ToGPU(q, in.data()); + + status |= out_dev->ToGPU(q, out.data()); + + status |= scale_factors_dev->ToGPU(q, scale_factors.data()); + + status |= in_grad_dev->ToGPU(q, in_grad.data()); + + status |= workspace_dev->ToGPU(q, workspace.data()); + + for(int i = 0; i < out_grad_sz; i++) + { + out_grad[i] = prng::gen_A_to_B(static_cast(-10.0), static_cast(10.0)); + } + status |= out_grad_dev->ToGPU(q, out_grad.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenInterpolateForward(GetHandle(), + inputDesc, + in_dev->GetMem(), + outputDesc, + out_dev->GetMem(), + scaleFactorsDesc, + scale_factors_dev->GetMem(), + mode, + align_corners); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward Interpolate Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward Interpolate Elapsed: %f ms\n", kernel_average_time); + } + + out_dev->FromGPU(GetStream(), out.data()); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunForwardCPU() +{ + size_t nelems = out_host.size(); + mlo_interpolate_forward(inputDesc, + outputDesc, + in.data(), + out_host.data(), + nelems, + scale_factors.data(), + align_corners, + mode); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + in_grad_dev->ToGPU(q, in_grad.data()); + workspace_dev->ToGPU(q, workspace.data()); + miopenInterpolateBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputGradDesc, + in_grad_dev->GetMem(), + outputGradDesc, + out_grad_dev->GetMem(), + scaleFactorsDesc, + scale_factors_dev->GetMem(), + mode, + align_corners); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward Interpolate Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward Interpolate Elapsed: %f ms\n", kernel_average_time); + } + + in_grad_dev->FromGPU(GetStream(), in_grad.data()); + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::RunBackwardCPU() +{ + size_t nelems = in_grad_host.size(); + mlo_interpolate_backward(inputGradDesc, + outputGradDesc, + in_grad_host.data(), + out_grad.data(), + nelems, + scale_factors.data(), + align_corners, + mode); + return miopenStatusSuccess; +} + +template +int InterpolateDriver::VerifyForward() +{ + RunForwardCPU(); + auto tolerance = std::numeric_limits::epsilon() * 10; + + auto error = miopen::rms_range(out_host, out); + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Output Forward Interpolate FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Output Forward Interpolate Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int InterpolateDriver::VerifyBackward() +{ + RunBackwardCPU(); + auto tolerance = std::numeric_limits::epsilon() * 10; + auto error = miopen::rms_range(in_grad_host, in_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward Interpolate in Input Grad FAILED: " << error + << " while tolerance: " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward Interpolate Verifies in Input Grad on CPU and GPU " + "(err=%f)\n", + error); + } + + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_INTERPOLATE_DRIVER_HPP diff --git a/driver/mloInterpolateHost.hpp b/driver/mloInterpolateHost.hpp new file mode 100644 index 0000000000..94cf4fdd96 --- /dev/null +++ b/driver/mloInterpolateHost.hpp @@ -0,0 +1,1071 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef MLO_INTERPOLATE_H_
+#define MLO_INTERPOLATE_H_
+
+#include "driver.hpp"
+#include <miopen/tensor.hpp>
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <miopen/interpolate/utils.hpp>
+
+inline float compute_linear_scale_factor(float scale_factor,
+                                         long input_size,
+                                         long output_size,
+                                         bool align_corners)
+{
+    if(align_corners)
+    {
+        if(input_size == 1)
+        {
+            return static_cast<float>(output_size);
+        }
+        return static_cast<float>(output_size - 1) / (input_size - 1);
+    }
+    else if(scale_factor == 0)
+    {
+        return static_cast<float>(output_size) / input_size;
+    }
+    else
+    {
+        return static_cast<float>(scale_factor);
+    }
+}
+
+inline float get_src_index(long dest_index, float scale_factor, bool align_corners)
+{
+    if(align_corners)
+    {
+        return dest_index / scale_factor;
+    }
+    else
+    {
+        // map the centre of the destination pixel back into the input grid
+        return (dest_index + 0.5f) / scale_factor - 0.5f;
+    }
+}
+
+inline long linear_back_index(long src, float scale_factor, bool align_corners)
+{
+    return static_cast<long>(std::ceil(get_src_index(src, 1.f / scale_factor, align_corners)));
+}
+
+inline void compute_linear_back_index_from_to(long src,
+                                              long input_size,
+                                              long output_size,
+                                              float scale_factor,
+                                              bool align_corners,
+                                              long* from,
+                                              long* to)
+{
+    if(src - 1 < 1)
+    {
+        *from = 0;
+    }
+    else
+    {
+        *from = linear_back_index(src - 1, scale_factor, align_corners);
+    }
+    if(src + 1 > input_size)
+    {
+        *to = output_size;
+    }
+    else
+    {
+        *to = std::min(output_size, linear_back_index(src + 1, scale_factor, align_corners));
+    }
+}
+
+inline void compute_source_index_and_lambda(long h,
+                                            float scale_factor,
+                                            long Hin,
+                                            long Hout,
+                                            bool align_corners,
+                                            long* hin_index0,
+                                            long* hin_index1,
+                                            float* lambda0,
+                                            float* lambda1)
+{
+    float hin_index_actual = static_cast<float>(
+        std::max(static_cast<float>(0.), get_src_index(h, scale_factor, align_corners)));
+    *hin_index0 = static_cast<long>(hin_index_actual);
+    *hin_index1 = std::min(*hin_index0 + 1, Hin - 1);
+    *lambda1    = hin_index_actual - *hin_index0;
+    *lambda0    = 1.f - *lambda1;
+}
+
+inline float get_back_lambda(long src, long src0, long src1, float lambda0, float lambda1)
+{
+    if(src == src0)
+    {
+        if(src0 == src1)
+        {
+            return 1; // lambda0 + lambda1 = 1
+        }
+        return lambda0;
+    }
+    if(src == src1)
+    {
+        return lambda1;
+    }
+    // This case can happen due to floating point multiplication.
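+    // (rounding can make the recomputed neighbour pair (src0, src1) miss src
+    // entirely, in which case src contributed nothing to this dest index.)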
+ // ex> 7 * (105/9) = 87 or 86.99999995 + return 0; +} + +inline float compute_back_lambda( + long dest, long src, float scale_factor, long Hin, long Hout, bool align_corners) +{ + if(Hin == Hout) + { + return 1; + } + long index0; + long index1; + float lambda0; + float lambda1; + compute_source_index_and_lambda( + dest, scale_factor, Hin, Hout, align_corners, &index0, &index1, &lambda0, &lambda1); + return get_back_lambda(src, index0, index1, lambda0, lambda1); +} + +template +int32_t mlo_interpolate_linear_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<3>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + + long Hin = input_tv.size[2]; + long Hout = output_tv.size[2]; + if(Hin == Hout || Hout == 1) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + scale_factor_h = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + long hin_index0; + long hin_index1; + float lambda1; + float lambda0; + compute_source_index_and_lambda(h, + scale_factor_h, + Hin, + Hout, + align_corners, + &hin_index0, + &hin_index1, + &lambda0, + &lambda1); + + tensor_layout_t<3> input_layout0(n, c, hin_index0); + tensor_layout_t<3> input_layout1(n, c, hin_index1); + + float input0 = input[input_tv.get_tensor_view_idx(input_layout0)]; + float input1 = input[input_tv.get_tensor_view_idx(input_layout1)]; + + output[output_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input0 * lambda0 + input1 * lambda1); + } + + return 0; +} + +template +int32_t mlo_interpolate_linear_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<3>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<3>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + + long Hin = input_grad_tv.size[2]; + long Hout = output_grad_tv.size[2]; + + if(Hin == Hout) + { + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + float scale_factor = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + long from, to; + compute_linear_back_index_from_to(h, Hin, Hout, scale_factor, align_corners, &from, &to); + + float output = 0; + for(long i = from; i < to; i++) + { + tensor_layout_t<3> output_layout(n, c, i); + output += + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + 
compute_back_lambda(i, h, scale_factor, Hin, Hout, align_corners); + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast(output); + } + + return 0; +} + +template +int32_t mlo_interpolate_bilinear_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + long w = tensor_layout.layout[3]; + + long Hin = input_tv.size[2]; + long Hout = output_tv.size[2]; + long Win = input_tv.size[3]; + long Wout = output_tv.size[3]; + + if(Hin == Hout && Win == Wout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + long hin_index0 = h; + long hin_index1 = h; + float hlambda0 = 1; + float hlambda1 = 0; + if(Hin != Hout && Hout != 1) + { + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + compute_source_index_and_lambda(h, + scale_factor_h_, + Hin, + Hout, + align_corners, + &hin_index0, + &hin_index1, + &hlambda0, + &hlambda1); + } + + long win_index0 = w; + long win_index1 = w; + float wlambda0 = 1; + float wlambda1 = 0; + if(Win != Wout && Wout != 1) + { + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + compute_source_index_and_lambda(w, + scale_factor_w_, + Win, + Wout, + align_corners, + &win_index0, + &win_index1, + &wlambda0, + &wlambda1); + } + + tensor_layout_t<4> input_layout00(n, c, hin_index0, win_index0); + tensor_layout_t<4> input_layout01(n, c, hin_index0, win_index1); + tensor_layout_t<4> input_layout10(n, c, hin_index1, win_index0); + tensor_layout_t<4> input_layout11(n, c, hin_index1, win_index1); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast( + (static_cast(input[input_tv.get_tensor_view_idx(input_layout00)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout01)]) * wlambda1) * + hlambda0 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout10)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout11)]) * wlambda1) * + hlambda1); + } + + return 0; +} + +template +int32_t mlo_interpolate_bilinear_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + long w = tensor_layout.layout[3]; + + long Hin = input_grad_tv.size[2]; 
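+        // Backward is organised as a gather rather than a scatter: each
+        // input_grad element visits the window of output_grad positions it
+        // contributed to (computed by compute_linear_back_index_from_to) and
+        // accumulates output_grad * h_lambda * w_lambda, so no atomics are needed.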
+ long Hout = output_grad_tv.size[2]; + long Win = input_grad_tv.size[3]; + long Wout = output_grad_tv.size[3]; + + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + long h_from, h_to; + if(Hin == Hout) + { + h_from = h; + h_to = h + 1; + } + else + { + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + } + long w_from, w_to; + if(Win == Wout) + { + w_from = w; + w_to = w + 1; + } + else + { + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + } + + float output = 0; + for(long i = h_from; i < h_to; i++) + { + float h_lambda = compute_back_lambda(i, h, scale_factor_h_, Hin, Hout, align_corners); + if(h_lambda == 0.) + continue; + for(long j = w_from; j < w_to; j++) + { + float w_lambda = + compute_back_lambda(j, w, scale_factor_w_, Win, Wout, align_corners); + + tensor_layout_t<4> output_layout(n, c, i, j); + + output += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + h_lambda * w_lambda; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast(output); + } + + return 0; +} + +template +int32_t mlo_interpolate_trilinear_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long d = tensor_layout.layout[2]; + long h = tensor_layout.layout[3]; + long w = tensor_layout.layout[4]; + + long Din = input_tv.size[2]; + long Dout = output_tv.size[2]; + long Hin = input_tv.size[3]; + long Hout = output_tv.size[3]; + long Win = input_tv.size[4]; + long Wout = output_tv.size[4]; + + if(Hin == Hout && Win == Wout && Din == Dout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + long din_index0 = d; + long din_index1 = d; + float dlambda0 = 1; + float dlambda1 = 0; + if(Din != Dout && Dout != 1) + { + float scale_factor_d = scale_factors[0]; + float scale_factor_d_ = + compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners); + compute_source_index_and_lambda(d, + scale_factor_d_, + Din, + Dout, + align_corners, + &din_index0, + &din_index1, + &dlambda0, + &dlambda1); + } + + long hin_index0 = h; + long hin_index1 = h; + float hlambda0 = 1; + float hlambda1 = 0; + if(Hin != Hout && Hout != 1) + { + float scale_factor_h = scale_factors[1]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + compute_source_index_and_lambda(h, + scale_factor_h_, + Hin, + Hout, + align_corners, + &hin_index0, + &hin_index1, + &hlambda0, + &hlambda1); + } + + long win_index0 = w; + long win_index1 = w; + float wlambda0 = 1; + float wlambda1 = 0; + if(Win != Wout && Wout != 1) + { + float scale_factor_w = scale_factors[2]; + float 
scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + compute_source_index_and_lambda(w, + scale_factor_w_, + Win, + Wout, + align_corners, + &win_index0, + &win_index1, + &wlambda0, + &wlambda1); + } + + tensor_layout_t<5> input_layout000(n, c, din_index0, hin_index0, win_index0); + tensor_layout_t<5> input_layout001(n, c, din_index0, hin_index0, win_index1); + tensor_layout_t<5> input_layout010(n, c, din_index0, hin_index1, win_index0); + tensor_layout_t<5> input_layout011(n, c, din_index0, hin_index1, win_index1); + tensor_layout_t<5> input_layout100(n, c, din_index1, hin_index0, win_index0); + tensor_layout_t<5> input_layout101(n, c, din_index1, hin_index0, win_index1); + tensor_layout_t<5> input_layout110(n, c, din_index1, hin_index1, win_index0); + tensor_layout_t<5> input_layout111(n, c, din_index1, hin_index1, win_index1); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast( + (static_cast(input[input_tv.get_tensor_view_idx(input_layout000)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout001)]) * wlambda1) * + hlambda0 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout010)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout011)]) * wlambda1) * + hlambda1 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout100)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout101)]) * wlambda1) * + dlambda0 + + (static_cast(input[input_tv.get_tensor_view_idx(input_layout110)]) * wlambda0 + + static_cast(input[input_tv.get_tensor_view_idx(input_layout111)]) * wlambda1) * + dlambda1); + } + + return 0; +} +template +int32_t mlo_interpolate_trilinear_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long d = tensor_layout.layout[2]; + long h = tensor_layout.layout[3]; + long w = tensor_layout.layout[4]; + + long Din = input_grad_tv.size[2]; + long Dout = output_grad_tv.size[2]; + long Hin = input_grad_tv.size[3]; + long Hout = output_grad_tv.size[3]; + long Win = input_grad_tv.size[4]; + long Wout = output_grad_tv.size[4]; + + float scale_factor_d = scale_factors[0]; + float scale_factor_d_ = + compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners); + + float scale_factor_h = scale_factors[1]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + float scale_factor_w = scale_factors[2]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + long d_from, d_to, h_from, h_to, w_from, w_to; + compute_linear_back_index_from_to( + d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to); + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + + float output = 0; + for(long i = d_from; i < d_to; 
i++) + { + float d_lambda = compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners); + for(long j = h_from; j < h_to; j++) + { + float h_lambda = + compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners); + for(long k = w_from; k < w_to; k++) + { + float w_lambda = + compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners); + tensor_layout_t<5> output_layout(n, c, i, j, k); + + output += output_grad[output_grad_tv.get_tensor_view_idx(output_layout)] * + d_lambda * h_lambda * w_lambda; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = output; + } + + return 0; +} + +inline float compute_scales_value(float scale, long input_size, long output_size) +{ + return (scale == 0.f) ? (static_cast(input_size) / output_size) : (1.0f / scale); +} + +inline long nearest_idx(long output_index, long input_size, long output_size, float scales) +{ + if(output_size == input_size) + { + return output_index; + } + else if(output_size == 2 * input_size) + { + return output_index / 2; + } + else + { + float scale = compute_scales_value(scales, input_size, output_size); + return std::min(static_cast((output_index * scale)), input_size); + } +} + +template +int32_t mlo_nearest_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long d = tensor_layout.layout[2]; + long h = tensor_layout.layout[3]; + long w = tensor_layout.layout[4]; + + long Dout = output_tv.size[2]; + long Hout = output_tv.size[3]; + long Wout = output_tv.size[4]; + long Din = input_tv.size[2]; + long Hin = input_tv.size[3]; + long Win = input_tv.size[4]; + + long x = nearest_idx(d, Din, Dout, scale_factors[0]); + long y = nearest_idx(h, Hin, Hout, scale_factors[1]); + long z = nearest_idx(w, Win, Wout, scale_factors[2]); + + tensor_layout_t<5> input_layout(n, c, x, y, z); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(input_layout)]; + } + + return 0; +} + +inline long nearest_idx_back(long input_index, long input_size, long output_size, float scales) +{ + if(output_size == input_size) + { + return input_index; + } + else if(output_size == 2 * input_size) + { + return input_index * 2; + } + else + { + float scale = compute_scales_value(scales, input_size, output_size); + return std::min(static_cast(std::ceil(input_index / scale)), output_size); + } +} + +template +int32_t mlo_nearest_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long x = 
tensor_layout.layout[2]; + long y = tensor_layout.layout[3]; + long z = tensor_layout.layout[4]; + + long Dout = output_grad_tv.size[2]; + long Hout = output_grad_tv.size[3]; + long Wout = output_grad_tv.size[4]; + long Din = input_grad_tv.size[2]; + long Hin = input_grad_tv.size[3]; + long Win = input_grad_tv.size[4]; + + float scale_factor_d = scale_factors[0]; + float scale_factor_h = scale_factors[1]; + float scale_factor_w = scale_factors[2]; + + long dstart = nearest_idx_back(x, Din, Dout, scale_factor_d); + long dlimit = nearest_idx_back(x + 1, Din, Dout, scale_factor_d); + long hstart = nearest_idx_back(y, Hin, Hout, scale_factor_h); + long hlimit = nearest_idx_back(y + 1, Hin, Hout, scale_factor_h); + long wstart = nearest_idx_back(z, Win, Wout, scale_factor_w); + long wlimit = nearest_idx_back(z + 1, Win, Wout, scale_factor_w); + + float grad = 0.f; + for(long d = dstart; d < dlimit; d++) + { + for(long h = hstart; h < hlimit; h++) + { + for(long w = wstart; w < wlimit; w++) + { + tensor_layout_t<5> output_grad_layout(n, c, d, h, w); + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast(grad); + } + + return 0; +} + +inline float +bicubic_idx(long output_index, long output_size, float scale_factor, bool align_corners) +{ + if(output_size == 1) + { + if(align_corners) + { + return 0; + } + return -0.5f; + } + return get_src_index(output_index, scale_factor, align_corners); +} + +inline float cubic_convolution1(float x, float A) { return ((A + 2) * x - (A + 3)) * x * x + 1; } + +inline float cubic_convolution2(float x, float A) +{ + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +inline void get_cubic_upsampling_coefficients(float coeffs[4], float t) +{ + float A = -0.75f; + + float x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0f, A); + coeffs[1] = cubic_convolution1(x1, A); + + float x2 = 1.0f - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0f, A); +} + +inline float cubic_interp1d(float x0, float x1, float x2, float x3, float t) +{ + float coeffs[4]; + get_cubic_upsampling_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +inline long bound(long p, long max_size) { return std::max(std::min(p, max_size - 1), 0L); } + +template +int32_t mlo_bicubic_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto input_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(unsigned long gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(output_tv, gid); + long n = tensor_layout.layout[0]; + long c = tensor_layout.layout[1]; + long h = tensor_layout.layout[2]; + long w = tensor_layout.layout[3]; + + long Hin = input_tv.size[2]; + long Win = input_tv.size[3]; + long Hout = output_tv.size[2]; + long Wout = output_tv.size[3]; + if(Hin == Hout && Win == Wout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + 
float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners); + long in_y = static_cast(std::floor(real_y)); + float t_y = real_y - in_y; + + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners); + long in_x = static_cast(std::floor(real_x)); + float t_x = real_x - in_x; + + float coefficients[4]; +#pragma unroll + for(int k = 0; k < 4; k++) + { + long y = bound(in_y - 1 + k, Hin); + tensor_layout_t<4> input_layout0(n, c, y, bound(in_x - 1, Win)); + tensor_layout_t<4> input_layout1(n, c, y, bound(in_x, Win)); + tensor_layout_t<4> input_layout2(n, c, y, bound(in_x + 1, Win)); + tensor_layout_t<4> input_layout3(n, c, y, bound(in_x + 2, Win)); + + coefficients[k] = cubic_interp1d( + static_cast(input[input_tv.get_tensor_view_idx(input_layout0)]), + static_cast(input[input_tv.get_tensor_view_idx(input_layout1)]), + static_cast(input[input_tv.get_tensor_view_idx(input_layout2)]), + static_cast(input[input_tv.get_tensor_view_idx(input_layout3)]), + t_x); + } + output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast(cubic_interp1d( + coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y)); + } + + return 0; +} + +template +int32_t mlo_bicubic_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + auto output_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = + miopen::solver::interpolate::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + std::vector workspace; + workspace.resize(nelems, 0); + + uint64_t Hin = input_grad_tv.size[2]; + uint64_t Hout = output_grad_tv.size[2]; + uint64_t Win = input_grad_tv.size[3]; + uint64_t Wout = output_grad_tv.size[3]; + + size_t out_elems = miopen::deref(outputGradDesc).GetElementSize(); + for(uint64_t gid = 0; gid < out_elems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(output_grad_tv, gid); + uint64_t n = tensor_layout.layout[0]; + uint64_t c = tensor_layout.layout[1]; + uint64_t h = tensor_layout.layout[2]; + uint64_t w = tensor_layout.layout[3]; + + if(Hin == Hout && Win == Wout) + { + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]; + continue; + } + + float scale_factor_h = scale_factors[0]; + float scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners); + int64_t in_y = static_cast(std::floor(real_y)); + float t_y = real_y - static_cast(in_y); + + float scale_factor_w = scale_factors[1]; + float scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners); + int64_t in_x = static_cast(std::floor(real_x)); + float t_x = real_x - static_cast(in_x); + + float y_coeffs[4]; + float x_coeffs[4]; + get_cubic_upsampling_coefficients(y_coeffs, t_y); + get_cubic_upsampling_coefficients(x_coeffs, t_x); + float out_value = + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + for(int i = 0; i < 4; i++) + { + int64_t input_h = bound(in_y - 1 + i, Hin); + for(int j = 0; j < 4; j++) + { + int64_t 
input_w = bound(in_x - 1 + j, Win); + tensor_layout_t<4> in_grad_layout(n, c, input_h, input_w); + + workspace[input_grad_tv.get_tensor_view_idx(in_grad_layout)] += + out_value * y_coeffs[i] * x_coeffs[j]; + } + } + } + + if(!(Hin == Hout && Win == Wout)) + { + for(uint64_t gid = 0; gid < nelems; ++gid) + { + auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid); + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)]); + } + } + + return 0; +} + +template +int32_t mlo_interpolate_forward(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + const Tgpu* input, + Tcheck* output, + const size_t nelems, + const float* scale_factors, + const bool align_corners, + const miopenInterpolateMode_t mode) +{ + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return mlo_nearest_forward(inputDesc, outputDesc, input, output, nelems, scale_factors); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + return mlo_interpolate_linear_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + return mlo_interpolate_bilinear_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + return mlo_interpolate_trilinear_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + return mlo_bicubic_forward( + inputDesc, outputDesc, input, output, nelems, scale_factors, align_corners); + } + + return 0; +} + +template +int32_t mlo_interpolate_backward(const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t outputGradDesc, + Tcheck* input_grad, + const Tgpu* output_grad, + const size_t nelems, + const float* scale_factors, + const bool align_corners, + const miopenInterpolateMode_t mode) +{ + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return mlo_nearest_backward( + inputGradDesc, outputGradDesc, input_grad, output_grad, nelems, scale_factors); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + return mlo_interpolate_linear_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + return mlo_interpolate_bilinear_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + return mlo_interpolate_trilinear_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + return mlo_bicubic_backward(inputGradDesc, + outputGradDesc, + input_grad, + output_grad, + nelems, + scale_factors, + align_corners); + } + + return 0; +} + +#endif // MLO_INTERPOLATE_H_ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index e768c7b349..5fce7be78e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -68,6 +68,7 @@ * @defgroup argmax * @defgroup groupnorm * @defgroup cat + * @defgroup interpolate * */ @@ -6582,6 +6583,120 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d // CLOSEOUT BackendAPI DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API + +/*! 
@ingroup interpolate + * @enum miopenInterpolateMode_t + * Modes for Interpolate + */ + +typedef enum +{ + MIOPEN_INTERPOLATE_MODE_NEAREST = 0, + MIOPEN_INTERPOLATE_MODE_LINEAR = 1, + MIOPEN_INTERPOLATE_MODE_BILINEAR = 2, + MIOPEN_INTERPOLATE_MODE_BICUBIC = 3, + MIOPEN_INTERPOLATE_MODE_TRILINEAR = 4, +} miopenInterpolateMode_t; + +// Interpolate APIs +/** @addtogroup interpolate + * + * @{ + */ + +/*! @brief Execute a interpolate forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param scaleFactorsDesc Tensor descriptor for scale factors tensor (input) + * @param scale_factors Data tensor scale factors - multiplier for spatial size (input) + * @param mode Interpolation mode (input) + * @param align_corners If set to True, the input and output tensors are aligned by the + * center points of their corner pixels, preserving the values at the corner pixels. If set to + * False, the input and output tensors are aligned by the corner points of their corner pixels, and + * the interpolation uses edge value padding for out-of-boundary values, making this operation + * independent of input size when scale_factor is kept the same. This only has an effect when mode + * is 'linear', 'bilinear', 'bicubic' or 'trilinear'. (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenInterpolateForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners); + +/*! @brief Helper function to query the minimum workspace size required by the Interpolate Bicubic + * Backward call + * + * @param handle MIOpen Handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param scaleFactorsDesc Tensor descriptor for scale factors tensor (input) + * @param mode Interpolation mode (input) + * @param align_corners If set to True, the input and output tensors are aligned by the + * center points of their corner pixels, preserving the values at the corner pixels. If set to + * False, the input and output tensors are aligned by the corner points of their corner pixels, and + * the interpolation uses edge value padding for out-of-boundary values, making this operation + * independent of input size when scale_factor is kept the same. This only has an effect when mode + * is 'linear', 'bilinear', 'bicubic' or 'trilinear'. (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetInterpolateBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t scaleFactorsDesc, + const miopenInterpolateMode_t mode, + const bool align_corners, + size_t* sizeInBytes); + +/*! 
@brief Execute a interpolate backward layer + * + * @param handle MIOpen handle (input) + * @param workspace Pointer to workspace (input) + * @param workspaceSizeInBytes Size of workspace buffer (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param scaleFactorsDesc Tensor descriptor for scale factors tensor (input) + * @param scale_factors Data tensor scale factors - multiplier for spatial size (input) + * @param mode Interpolation mode (input) + * @param align_corners If set to True, the input and output tensors are aligned by the + * center points of their corner pixels, preserving the values at the corner pixels. If set to + * False, the input and output tensors are aligned by the corner points of their corner pixels, and + * the interpolation uses edge value padding for out-of-boundary values, making this operation + * independent of input size when scale_factor is kept the same. This only has an effect when mode + * is 'linear', 'bilinear', 'bicubic' or 'trilinear'. (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenInterpolateBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners); + +/** @} */ +// CLOSEOUT Interpolate DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9671eed03c..4e7a0d0530 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -132,6 +132,8 @@ set( MIOpen_Source groupnorm/problem_description.cpp handle_api.cpp invoker_cache.cpp + interpolate_api.cpp + interpolate/problem_description.cpp kernel_build_params.cpp kernel_warnings.cpp layernorm_api.cpp @@ -260,6 +262,15 @@ set( MIOpen_Source solver/gemm_bwd.cpp solver/gemm_wrw.cpp solver/groupnorm/forward_groupnorm.cpp + solver/interpolate/bwd_bicubic_interpolate.cpp + solver/interpolate/bwd_bilinear_interpolate.cpp + solver/interpolate/bwd_linear_interpolate.cpp + solver/interpolate/bwd_nearest_interpolate.cpp + solver/interpolate/bwd_trilinear_interpolate.cpp + solver/interpolate/fwd_bicubic_interpolate.cpp + solver/interpolate/fwd_bilinear_interpolate.cpp + solver/interpolate/fwd_linear_interpolate.cpp + solver/interpolate/fwd_nearest_interpolate.cpp solver/layernorm/forward_layernorm.cpp solver/layernorm/forward_layernorm2d_ck.cpp solver/layernorm/forward_layernorm4d_ck.cpp @@ -421,6 +432,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/neuron.inc kernels/rocm_version.inc kernels/stride_array.hpp + kernels/tensor_view.hpp kernels/utilities.inc kernels/workaround_issue_1431.hpp kernels/xform_bidirect_winograd_code.inc @@ -455,6 +467,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenConvDirBatchNormActiv.cl kernels/MIOpenConvDirGenFwd.cl kernels/MIOpenGroupNorm.cpp + kernels/MIOpenInterpolate.cpp kernels/MIOpenLayerNorm.cpp kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl @@ -579,6 +592,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN argmax.cpp cat.cpp 
groupnorm.cpp + interpolate.cpp kernel_cache.cpp layer_norm.cpp lrn.cpp diff --git a/src/include/miopen/interpolate.hpp b/src/include/miopen/interpolate.hpp new file mode 100644 index 0000000000..d79bcaf35a --- /dev/null +++ b/src/include/miopen/interpolate.hpp @@ -0,0 +1,94 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_INTERPOLATE_HPP_ +#define MIOPEN_INTERPOLATE_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +miopenStatus_t InterpolateNearestForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode); + +miopenStatus_t InterpolateLinearCubicForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode, + bool align_corners); + +size_t GetInterpolateBicubicBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& outputGradDesc, + const TensorDescriptor& inputGradDesc, + const TensorDescriptor& scaleFactorsDesc, + miopenInterpolateMode_t mode, + bool align_corners); + +miopenStatus_t InterpolateBicubicBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode, + bool align_corners); + +miopenStatus_t InterpolateNearestBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t mode); + +miopenStatus_t InterpolateLinearBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + miopenInterpolateMode_t 
mode, + bool align_corners); + +} // namespace miopen +#endif // _MIOPEN_INTERPOLATE_HPP_ diff --git a/src/include/miopen/interpolate/invoke_params.hpp b/src/include/miopen/interpolate/invoke_params.hpp new file mode 100644 index 0000000000..66593cc4ad --- /dev/null +++ b/src/include/miopen/interpolate/invoke_params.hpp @@ -0,0 +1,81 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/miopen.h" +#include "miopen/common.hpp" +#include +#include + +namespace miopen { +namespace interpolate { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* scaleFactorsDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t scale_factors = nullptr; + + miopenInterpolateMode_t mode; + bool align_corners = false; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + + BwdInvokeParams() = default; + + const TensorDescriptor* inputGradDesc = nullptr; + const TensorDescriptor* outputGradDesc = nullptr; + const TensorDescriptor* scaleFactorsDesc = nullptr; + + Data_t input_grad = nullptr; + ConstData_t output_grad = nullptr; + ConstData_t scale_factors = nullptr; + + miopenInterpolateMode_t mode; + bool align_corners = false; + + std::size_t workspaceSizeInBytes = 0; + Data_t workspace = nullptr; + + std::size_t GetWorkspaceSize() const { return workspaceSizeInBytes; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace interpolate +} // namespace miopen diff --git a/src/include/miopen/interpolate/problem_description.hpp b/src/include/miopen/interpolate/problem_description.hpp new file mode 100644 index 0000000000..d5abbfd49b --- /dev/null +++ b/src/include/miopen/interpolate/problem_description.hpp @@ -0,0 +1,280 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/miopen.h" +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace interpolate { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& scaleFactorsDesc_, + const miopenInterpolateMode_t mode_, + const bool align_corners_) + : scaleFactorsDesc(scaleFactorsDesc_), mode(mode_), align_corners(align_corners_) + { + IsValidMode(); + IsValidType(); + } + + const TensorDescriptor& GetScaleFactorsDesc() const { return scaleFactorsDesc; } + miopenInterpolateMode_t GetMode() const { return mode; } + bool GetAlignCorners() const { return align_corners; } + + bool IsValidMode() const + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST && mode != MIOPEN_INTERPOLATE_MODE_LINEAR && + mode != MIOPEN_INTERPOLATE_MODE_BILINEAR && mode != MIOPEN_INTERPOLATE_MODE_TRILINEAR && + mode != MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Invalid mode."); + } + return true; + } + + bool IsValidType() const + { + if(scaleFactorsDesc.GetType() != miopenFloat) + { + std::cout << "scaleFactorsDesc.GetType() = " << scaleFactorsDesc.GetType() + << "miopenFloat type:" << miopenFloat << std::endl; + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Scale factor type should be miopenFloat."); + } + + return true; + } + +protected: + TensorDescriptor scaleFactorsDesc; + miopenInterpolateMode_t mode; + bool align_corners = false; +}; + +struct FwdProblemDescription : ProblemDescription +{ + FwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const TensorDescriptor& scaleFactorsDesc_, + const miopenInterpolateMode_t mode_, + const bool align_corners_) + : ProblemDescription(scaleFactorsDesc_, mode_, align_corners_), + inputDesc(inputDesc_), + outputDesc(outputDesc_) + { + IsValidDims(); + IsValidLength(); + IsSameType(); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + bool IsValidLength() const + { + if(inputDesc.GetSize() < 3 || inputDesc.GetSize() > 5) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input or output tensor size < 3 or > 5 is not valid."); + } + + if(outputDesc.GetSize() != inputDesc.GetSize()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: 
Input and output tensor size do not match."); + } + + if((outputDesc.GetSize() - 2) != scaleFactorsDesc.GetElementSize()) + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Interpolate: Output tensor size and scale factors length do not match."); + } + } + return true; + } + + bool IsValidDims() const + { + if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + if(inputDesc.GetSize() != 3) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Linear mode requires 3D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + if(inputDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Bilinear mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + if(inputDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Bicubic mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + if(inputDesc.GetSize() != 5) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Trilinear mode requires 5D tensors."); + } + } + return true; + } + + bool IsSameType() const + { + if(inputDesc.GetType() != outputDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input and output tensor type do not match."); + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; +}; + +struct BwdProblemDescription : ProblemDescription +{ + BwdProblemDescription(const TensorDescriptor& inputGradDesc_, + const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& scaleFactorsDesc_, + const miopenInterpolateMode_t mode_, + const bool align_corners_) + : ProblemDescription(scaleFactorsDesc_, mode_, align_corners_), + inputGradDesc(inputGradDesc_), + outputGradDesc(outputGradDesc_) + { + IsValidDims(); + IsValidLength(); + IsSameType(); + } + const TensorDescriptor& GetInputGradDesc() const { return inputGradDesc; } + const TensorDescriptor& GetOutputGradDesc() const { return outputGradDesc; } + + bool IsValidLength() const + { + if(inputGradDesc.GetSize() < 3 || inputGradDesc.GetSize() > 5) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Interpolate: Input grad or output grad tensor size < 3 or > 5 is not valid."); + } + + if(outputGradDesc.GetSize() != inputGradDesc.GetSize()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input grad and output grad tensor size do not match."); + } + + if((outputGradDesc.GetSize() - 2) != scaleFactorsDesc.GetElementSize()) + { + if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Tensor size and scale factors length do not match."); + } + } + return true; + } + + bool IsValidDims() const + { + if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR) + { + if(inputGradDesc.GetSize() != 3) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Linear mode requires 3D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR) + { + if(inputGradDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Bilinear mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + if(inputGradDesc.GetSize() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, "Interpolate: Bicubic mode requires 4D tensors."); + } + } + if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR) + { + if(inputGradDesc.GetSize() != 5) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Trilinear mode requires 5D tensors."); + } + } + return true; + } + + bool IsSameType() const 
+ { + if(inputGradDesc.GetType() != outputGradDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Interpolate: Input grad and output grad tensor type do not match."); + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputGradDesc; + TensorDescriptor outputGradDesc; +}; + +} // namespace interpolate + +} // namespace miopen diff --git a/src/include/miopen/interpolate/solvers.hpp b/src/include/miopen/interpolate/solvers.hpp new file mode 100644 index 0000000000..9469374385 --- /dev/null +++ b/src/include/miopen/interpolate/solvers.hpp @@ -0,0 +1,214 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <miopen/interpolate/problem_description.hpp> +#include <miopen/solver.hpp> +#include "miopen/kernel_build_params.hpp" + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector<size_t> localsize, + std::vector<size_t> gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace interpolate { + +using InterpolateFwdSolver = + NonTunableSolverBase<ExecutionContext, miopen::interpolate::FwdProblemDescription>; + +using InterpolateBwdSolver = + NonTunableSolverBase<ExecutionContext, miopen::interpolate::BwdProblemDescription>; + +// FORWARD NEAREST +struct InterpolateNearestForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateNearestForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// FORWARD LINEAR +struct InterpolateLinearForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateLinearForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// FORWARD BILINEAR +struct InterpolateBilinearForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateBilinearForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// FORWARD BICUBIC +struct InterpolateBicubicForward final : InterpolateFwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateBicubicForward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::FwdProblemDescription& problem) const override; +}; + +// BACKWARD NEAREST +struct InterpolateNearestBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateNearestBackward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD LINEAR +struct InterpolateLinearBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId<InterpolateLinearBackward>(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD
BILINEAR +struct InterpolateBilinearBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD TRILINEAR +struct InterpolateTrilinearBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; +}; + +// BACKWARD BICUBIC +struct InterpolateBicubicBackward final : InterpolateBwdSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + bool IsApplicable(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::interpolate::BwdProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return true; } +}; + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/interpolate/utils.hpp b/src/include/miopen/interpolate/utils.hpp new file mode 100644 index 0000000000..bc22491536 --- /dev/null +++ b/src/include/miopen/interpolate/utils.hpp @@ -0,0 +1,81 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include "../src/kernels/tensor_view.hpp" +#include <miopen/tensor.hpp> + +namespace miopen { + +namespace solver { + +namespace interpolate { + +template <int N> +inline tensor_view_t<N> get_inner_expanded_tv(const TensorDescriptor Desc) +{ + auto dims = Desc.GetLengths(); + auto strides = Desc.GetStrides(); + + tensor_view_t<N> tensor_view; + for(size_t i = 0; i < strides.size(); ++i) + { + tensor_view.stride[i] = strides[i]; + tensor_view.size[i] = dims[i]; + } + for(size_t i = strides.size(); i < N; ++i) + { + tensor_view.stride[i] = tensor_view.stride[i - 1]; + tensor_view.size[i] = 1; + } + return tensor_view; +} + +template <int N> +inline void slice_tv(tensor_view_t<N>& tensor_view, int32_t sliceCount, const int32_t* slices) +{ + for(int32_t i = 0; i < sliceCount; i++) + { + int32_t dim = slices[4 * i + 0]; + int32_t start = slices[4 * i + 1]; + int32_t end = slices[4 * i + 2]; + int32_t step = slices[4 * i + 3]; + + if(end > static_cast<int32_t>(tensor_view.size[dim])) + end = tensor_view.size[dim]; + + auto len = end - start; + + tensor_view.size[dim] = (len + step - 1) / step; + tensor_view.stride[dim] *= step; + } +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index c52dc020ac..c606de1dcf 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -56,7 +56,8 @@ enum class Primitive Reduce, Cat, Mha, - Softmax + Softmax, + Interpolate }; struct MIOPEN_EXPORT Id diff --git a/src/interpolate.cpp b/src/interpolate.cpp new file mode 100644 index 0000000000..0a1f07bd55 --- /dev/null +++ b/src/interpolate.cpp @@ -0,0 +1,243 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t InterpolateNearestForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode) +{ + const auto problem = + interpolate::FwdProblemDescription{inputDesc, outputDesc, scaleFactorsDesc, mode, false}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input = input; + tmp.output = output; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t InterpolateLinearCubicForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + const auto problem = interpolate::FwdProblemDescription{ + inputDesc, outputDesc, scaleFactorsDesc, mode, align_corners}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input = input; + tmp.output = output; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + tmp.align_corners = align_corners; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t InterpolateNearestBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode) +{ + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, mode, false}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::BwdInvokeParams{}; + tmp.inputGradDesc = &inputGradDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input_grad = input_grad; + tmp.output_grad = output_grad; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +size_t GetInterpolateBicubicBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& outputGradDesc, + const TensorDescriptor& inputGradDesc, + const TensorDescriptor& scaleFactorsDesc, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, 
mode, align_corners}; + + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t InterpolateBicubicBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, mode, align_corners}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::BwdInvokeParams{}; + tmp.inputGradDesc = &inputGradDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input_grad = input_grad; + tmp.output_grad = output_grad; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + tmp.align_corners = align_corners; + + tmp.workspace = workspace; + tmp.workspaceSizeInBytes = workspaceSizeInBytes; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t InterpolateLinearBackward(Handle& handle, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& scaleFactorsDesc, + ConstData_t scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + const auto problem = interpolate::BwdProblemDescription{ + inputGradDesc, outputGradDesc, scaleFactorsDesc, mode, align_corners}; + + const auto invoke_params = [&]() { + auto tmp = interpolate::BwdInvokeParams{}; + tmp.inputGradDesc = &inputGradDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.scaleFactorsDesc = &scaleFactorsDesc; + + tmp.input_grad = input_grad; + tmp.output_grad = output_grad; + tmp.scale_factors = scale_factors; + + tmp.mode = mode; + tmp.align_corners = align_corners; + + return tmp; + }(); + const auto algo = AlgorithmName{"InterpolateBackward"}; + const auto solvers = + solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/interpolate/problem_description.cpp b/src/interpolate/problem_description.cpp new file mode 100644 index 0000000000..4b6419a78d --- /dev/null +++ b/src/interpolate/problem_description.cpp @@ -0,0 +1,90 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include +#include + +namespace miopen { + +namespace interpolate { + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dims = inputDesc.GetLengths(); + auto output_dims = outputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + miopenInterpolateMode_t mode = GetMode(); + bool align_corners = GetAlignCorners(); + + std::ostringstream ss; + ss << "interpolate_fwd"; + ss << "mode" << mode; + ss << "align_corners" << align_corners; + ss << "input_dtype" << input_dtype; + ss << "input_dims" << input_dims; + ss << "output_dims" << output_dims; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + auto input_grad_dims = inputGradDesc.GetLengths(); + auto output_grad_dims = outputGradDesc.GetLengths(); + auto output_dtype = outputGradDesc.GetType(); + miopenInterpolateMode_t mode = GetMode(); + bool align_corners = GetAlignCorners(); + + std::ostringstream ss; + ss << "interpolate_bwd"; + ss << "mode" << mode; + ss << "align_corners" << align_corners; + ss << "output_grad_dtype" << output_dtype; + ss << "output_grad_dims" << output_grad_dims; + ss << "input_grad_dims" << input_grad_dims; + + return NetworkConfig{ss.str()}; +} + +} // namespace interpolate + +} // namespace miopen diff --git a/src/interpolate_api.cpp b/src/interpolate_api.cpp new file mode 100644 index 0000000000..5ff4b012b0 --- /dev/null +++ b/src/interpolate_api.cpp @@ -0,0 +1,215 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include +#include +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdInterpolate(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + bool is_fwd, + const miopenInterpolateMode_t mode) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "interpolatefp16"; + } + else if(dtype == miopenFloat) + { + ss << "interpolate"; + } + else if(dtype == miopenBFloat16) + { + ss << "interpolatebfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, mode); + ss << " -D " << miopen::deref(xDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + ss << " -R " << mode; + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenInterpolateForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + scaleFactorsDesc, + scale_factors, + mode, + align_corners); + + LogCmdInterpolate(inputDesc, outputDesc, true, mode); + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return miopen::try_([&] { + miopen::InterpolateNearestForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode); + }); + } + return miopen::try_([&] { + miopen::InterpolateLinearCubicForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode, + align_corners); + }); +} + +extern "C" miopenStatus_t +miopenGetInterpolateBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + const miopenTensorDescriptor_t scaleFactorsDesc, + const miopenInterpolateMode_t mode, + const bool align_corners, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION( + handle, outputGradDesc, inputGradDesc, scaleFactorsDesc, mode, align_corners, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = + miopen::GetInterpolateBicubicBackwardWorkspaceSize(miopen::deref(handle), + miopen::deref(outputGradDesc), + miopen::deref(inputGradDesc), + miopen::deref(scaleFactorsDesc), + mode, + align_corners); + }); +} + +extern "C" miopenStatus_t miopenInterpolateBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t scaleFactorsDesc, + const void* scale_factors, + const miopenInterpolateMode_t mode, + const bool align_corners) +{ + MIOPEN_LOG_FUNCTION(handle, + inputGradDesc, + input_grad, + outputGradDesc, + output_grad, + scaleFactorsDesc, + scale_factors, + mode, + align_corners); + + LogCmdInterpolate(inputGradDesc, outputGradDesc, false, mode); + if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST) + { + return miopen::try_([&] { + miopen::InterpolateNearestBackward(miopen::deref(handle), + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode); + }); + } + else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC) + { + return miopen::try_([&] { + miopen::InterpolateBicubicBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode, + align_corners); + }); + } + return miopen::try_([&] { + miopen::InterpolateLinearBackward(miopen::deref(handle), + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(outputGradDesc), + DataCast(output_grad), + 
miopen::deref(scaleFactorsDesc), + DataCast(scale_factors), + mode, + align_corners); + }); +} diff --git a/src/kernels/MIOpenInterpolate.cpp b/src/kernels/MIOpenInterpolate.cpp new file mode 100644 index 0000000000..4e3e5b2cf3 --- /dev/null +++ b/src/kernels/MIOpenInterpolate.cpp @@ -0,0 +1,967 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include <hip/hip_fp16.h> +#include <hip/hip_runtime.h> +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +__device__ inline FLOAT_ACCUM compute_linear_scale_factor(FLOAT_ACCUM scale_factor, + int64_t input_size, + int64_t output_size, + bool align_corners) +{ + if(align_corners) + { + if(input_size == 1) + { + return static_cast<FLOAT_ACCUM>(output_size); + } + return static_cast<FLOAT_ACCUM>(output_size - 1) / (input_size - 1); + } + else if(scale_factor == 0) + { + return static_cast<FLOAT_ACCUM>(output_size) / input_size; + } + else + { + return static_cast<FLOAT_ACCUM>(scale_factor); + } +} + +__device__ inline FLOAT_ACCUM +get_src_index(int64_t dest_index, FLOAT_ACCUM scale_factor, bool align_corners) +{ + if(align_corners) + { + return dest_index / scale_factor; + } + else + { + // Follow OpenCV resize logic.
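+ // Half-pixel convention: pixel i covers [i, i+1) with its center at i + 0.5, so the
+ // source coordinate is (dest + 0.5) / scale - 0.5. For example, upscaling H=2 to H=4
+ // (scale_factor = 2) maps dest 0,1,2,3 to src -0.25, 0.25, 0.75, 1.25; callers such as
+ // compute_source_index_and_lambda clamp negative values before splitting the result
+ // into an integer index and a fractional lambda weight.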
+ return (dest_index + 0.5f) / scale_factor - 0.5f; + } +} + +__device__ inline int64_t +linear_back_index(int64_t src, FLOAT_ACCUM scale_factor, bool align_corners) +{ + return static_cast<int64_t>(ceil(get_src_index(src, 1.f / scale_factor, align_corners))); +} + +__device__ inline void compute_linear_back_index_from_to(int64_t src, + int64_t input_size, + int64_t output_size, + FLOAT_ACCUM scale_factor, + bool align_corners, + int64_t* from, + int64_t* to) +{ + if(src - 1 < 1) + { + *from = 0; + } + else + { + *from = linear_back_index(src - 1, scale_factor, align_corners); + } + if(src + 1 > input_size) + { + *to = output_size; + } + else + { + *to = min(output_size, linear_back_index(src + 1, scale_factor, align_corners)); + } +} + +__device__ inline void compute_source_index_and_lambda(int64_t h, + FLOAT_ACCUM scale_factor, + int64_t Hin, + bool align_corners, + int64_t* hin_index0, + int64_t* hin_index1, + FLOAT_ACCUM* lambda0, + FLOAT_ACCUM* lambda1) +{ + FLOAT_ACCUM hin_index_actual = max(0., get_src_index(h, scale_factor, align_corners)); + *hin_index0 = static_cast<int64_t>(hin_index_actual); + *hin_index1 = min(*hin_index0 + 1, Hin - 1); + *lambda1 = hin_index_actual - *hin_index0; + *lambda0 = 1.f - *lambda1; +} + +__device__ inline FLOAT_ACCUM +get_back_lambda(int64_t src, int64_t src0, int64_t src1, FLOAT_ACCUM lambda0, FLOAT_ACCUM lambda1) +{ + if(src == src0) + { + if(src0 == src1) + { + return 1; // lambda0 + lambda1 = 1 + } + return lambda0; + } + if(src == src1) + { + return lambda1; + } + // This case can happen due to floating point multiplication. + // ex> 7 * (105/9) = 87 or 86.99999995 + return 0; +} + +__device__ inline FLOAT_ACCUM compute_back_lambda(int64_t dest, + int64_t src, + FLOAT_ACCUM scale_factor, + int64_t Hin, + int64_t Hout, + bool align_corners) +{ + if(Hin == Hout) + { + return 1; + } + int64_t index0; + int64_t index1; + FLOAT_ACCUM lambda0; + FLOAT_ACCUM lambda1; + compute_source_index_and_lambda( + dest, scale_factor, Hin, align_corners, &index0, &index1, &lambda0, &lambda1); + return get_back_lambda(src, index0, index1, lambda0, lambda1); +} + +template <typename TI, typename TO> +__device__ inline void interpolateLinearForward(const TI* __restrict__ input, + TO* __restrict__ output, + const tensor_view_t<3> input_tv, + const tensor_view_t<3> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<3>(output_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + + int64_t Hin = input_tv.size[2]; + int64_t Hout = output_tv.size[2]; + if(Hin == Hout || Hout == 1) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + return; + } + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + scale_factor_h = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + int64_t hin_index0; + int64_t hin_index1; + FLOAT_ACCUM lambda1; + FLOAT_ACCUM lambda0; + compute_source_index_and_lambda( + h, scale_factor_h, Hin, align_corners, &hin_index0, &hin_index1, &lambda0, &lambda1); + + tensor_layout_t<3> input_layout0(n, c, hin_index0); + + tensor_layout_t<3> input_layout1(n, c, hin_index1); + + FLOAT_ACCUM input0 = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout0)]); + FLOAT_ACCUM input1 = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout1)]); +
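+ // 1-D linear blend of the two neighboring source pixels: lambda1 is the fractional
+ // part of the source coordinate (its distance from hin_index0) and lambda0 = 1 - lambda1,
+ // e.g. src 1.25 gives 0.75 * input[1] + 0.25 * input[2].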
output[output_tv.get_tensor_view_idx(tensor_layout)] = + CVT_ACCUM2FLOAT(input0 * lambda0 + input1 * lambda1); +} + +extern "C" __global__ void InterpolateLinearForward(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + const tensor_view_t<3> input_tv, + const tensor_view_t<3> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateLinearForward( + input, output, input_tv, output_tv, nelems, scale_factors, align_corners); +} + +template +__device__ inline void interpolateLinearBackward(TO* __restrict__ input_grad, + const TI* __restrict__ output_grad, + const tensor_view_t<3> input_grad_tv, + const tensor_view_t<3> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<3>(input_grad_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + + int64_t Hin = input_grad_tv.size[2]; + int64_t Hout = output_grad_tv.size[2]; + + if(Hin == Hout) + { + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]; + return; + } + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + int64_t from, to; + compute_linear_back_index_from_to(h, Hin, Hout, scale_factor, align_corners, &from, &to); + + FLOAT_ACCUM output = 0; + for(int64_t i = from; i < to; i++) + { + tensor_layout_t<3> output_layout(n, c, i); + output += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + compute_back_lambda(i, h, scale_factor, Hin, Hout, align_corners); + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(output); +} + +extern "C" __global__ void InterpolateLinearBackward(OUTPUT_TYPE* __restrict__ input_grad, + const INPUT_TYPE* __restrict__ output_grad, + const tensor_view_t<3> input_grad_tv, + const tensor_view_t<3> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateLinearBackward(input_grad, + output_grad, + input_grad_tv, + output_grad_tv, + nelems, + scale_factors, + align_corners); +} + +template +__device__ inline void interpolateBilinearForward(const TI* __restrict__ input, + TO* __restrict__ output, + const tensor_view_t<4> input_tv, + const tensor_view_t<4> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<4>(output_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + int64_t w = tensor_layout.layout[3]; + + int64_t Hin = input_tv.size[2]; + int64_t Hout = output_tv.size[2]; + int64_t Win = input_tv.size[3]; + int64_t Wout = output_tv.size[3]; + + if(Hin == Hout && Win == Wout) + { + output[output_tv.get_tensor_view_idx(tensor_layout)] = + input[input_tv.get_tensor_view_idx(tensor_layout)]; + return; + } + + int64_t hin_index0 = h; + int64_t hin_index1 = h; + FLOAT_ACCUM hlambda0 = 1; + FLOAT_ACCUM hlambda1 = 0; + if(Hin != Hout && Hout != 1) + { + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor_h_ = + 
compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + compute_source_index_and_lambda( + h, scale_factor_h_, Hin, align_corners, &hin_index0, &hin_index1, &hlambda0, &hlambda1); + } + + int64_t win_index0 = w; + int64_t win_index1 = w; + FLOAT_ACCUM wlambda0 = 1; + FLOAT_ACCUM wlambda1 = 0; + if(Win != Wout && Wout != 1) + { + FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]); + FLOAT_ACCUM scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + compute_source_index_and_lambda( + w, scale_factor_w_, Win, align_corners, &win_index0, &win_index1, &wlambda0, &wlambda1); + } + + tensor_layout_t<4> input_layout00(n, c, hin_index0, win_index0); + tensor_layout_t<4> input_layout01(n, c, hin_index0, win_index1); + tensor_layout_t<4> input_layout10(n, c, hin_index1, win_index0); + tensor_layout_t<4> input_layout11(n, c, hin_index1, win_index1); + + output[output_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT( + (CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout00)]) * wlambda0 + + CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout01)]) * wlambda1) * + hlambda0 + + (CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout10)]) * wlambda0 + + CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout11)]) * wlambda1) * + hlambda1); +} + +extern "C" __global__ void InterpolateBilinearForward(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + const tensor_view_t<4> input_tv, + const tensor_view_t<4> output_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateBilinearForward( + input, output, input_tv, output_tv, nelems, scale_factors, align_corners); +} + +template +__device__ inline void interpolateBilinearBackward(TO* __restrict__ input_grad, + const TI* __restrict__ output_grad, + const tensor_view_t<4> input_grad_tv, + const tensor_view_t<4> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t h = tensor_layout.layout[2]; + int64_t w = tensor_layout.layout[3]; + + int64_t Hin = input_grad_tv.size[2]; + int64_t Hout = output_grad_tv.size[2]; + int64_t Win = input_grad_tv.size[3]; + int64_t Wout = output_grad_tv.size[3]; + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]); + FLOAT_ACCUM scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + int64_t h_from, h_to; + if(Hin == Hout) + { + h_from = h; + h_to = h + 1; + } + else + { + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + } + int64_t w_from, w_to; + if(Win == Wout) + { + w_from = w; + w_to = w + 1; + } + else + { + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + } + + FLOAT_ACCUM output = 0; + for(int64_t i = h_from; i < h_to; i++) + { + FLOAT_ACCUM h_lambda = compute_back_lambda(i, h, scale_factor_h_, Hin, Hout, align_corners); + if(h_lambda == 0.) 
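+ // compute_back_lambda returned a zero weight: output row i never sampled input
+ // row h in the forward pass, so it contributes nothing to the gradient; skip its columns.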
+ continue; + for(int64_t j = w_from; j < w_to; j++) + { + FLOAT_ACCUM w_lambda = + compute_back_lambda(j, w, scale_factor_w_, Win, Wout, align_corners); + + tensor_layout_t<4> output_layout(n, c, i, j); + + output += + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + h_lambda * w_lambda; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(output); +} + +extern "C" __global__ void InterpolateBilinearBackward(OUTPUT_TYPE* __restrict__ input_grad, + const INPUT_TYPE* __restrict__ output_grad, + const tensor_view_t<4> input_grad_tv, + const tensor_view_t<4> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + interpolateBilinearBackward(input_grad, + output_grad, + input_grad_tv, + output_grad_tv, + nelems, + scale_factors, + align_corners); +} + +template +__device__ inline void interpolateTrilinearBackward(TO* __restrict__ input_grad, + const TI* __restrict__ output_grad, + const tensor_view_t<5> input_grad_tv, + const tensor_view_t<5> output_grad_tv, + const size_t nelems, + const float* scale_factors, + const bool align_corners) +{ + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= nelems) + return; + + auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid); + int64_t n = tensor_layout.layout[0]; + int64_t c = tensor_layout.layout[1]; + int64_t d = tensor_layout.layout[2]; + int64_t h = tensor_layout.layout[3]; + int64_t w = tensor_layout.layout[4]; + + int64_t Din = input_grad_tv.size[2]; + int64_t Dout = output_grad_tv.size[2]; + int64_t Hin = input_grad_tv.size[3]; + int64_t Hout = output_grad_tv.size[3]; + int64_t Win = input_grad_tv.size[4]; + int64_t Wout = output_grad_tv.size[4]; + + FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]); + FLOAT_ACCUM scale_factor_d_ = + compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners); + + FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]); + FLOAT_ACCUM scale_factor_h_ = + compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners); + + FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]); + FLOAT_ACCUM scale_factor_w_ = + compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners); + + int64_t d_from, d_to; + if(Din == Dout) + { + d_from = d; + d_to = d + 1; + } + else + { + compute_linear_back_index_from_to( + d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to); + } + int64_t h_from, h_to; + if(Hin == Hout) + { + h_from = h; + h_to = h + 1; + } + else + { + compute_linear_back_index_from_to( + h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to); + } + int64_t w_from, w_to; + if(Win == Wout) + { + w_from = w; + w_to = w + 1; + } + else + { + compute_linear_back_index_from_to( + w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to); + } + + FLOAT_ACCUM output = 0; + for(int64_t i = d_from; i < d_to; i++) + { + FLOAT_ACCUM d_lambda = compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners); + if(d_lambda == 0.f) + continue; + for(int64_t j = h_from; j < h_to; j++) + { + FLOAT_ACCUM h_lambda = + compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners); + if(h_lambda == 0.f) + continue; + for(int64_t k = w_from; k < w_to; k++) + { + FLOAT_ACCUM w_lambda = + compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners); + tensor_layout_t<5> output_layout(n, c, i, j, k); + + output += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) * + d_lambda * 
+template <typename TI, typename TO>
+__device__ inline void interpolateTrilinearBackward(TO* __restrict__ input_grad,
+                                                    const TI* __restrict__ output_grad,
+                                                    const tensor_view_t<5> input_grad_tv,
+                                                    const tensor_view_t<5> output_grad_tv,
+                                                    const size_t nelems,
+                                                    const float* scale_factors,
+                                                    const bool align_corners)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t d = tensor_layout.layout[2];
+    int64_t h = tensor_layout.layout[3];
+    int64_t w = tensor_layout.layout[4];
+
+    int64_t Din = input_grad_tv.size[2];
+    int64_t Dout = output_grad_tv.size[2];
+    int64_t Hin = input_grad_tv.size[3];
+    int64_t Hout = output_grad_tv.size[3];
+    int64_t Win = input_grad_tv.size[4];
+    int64_t Wout = output_grad_tv.size[4];
+
+    FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_d_ =
+        compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners);
+
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_h_ =
+        compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]);
+    FLOAT_ACCUM scale_factor_w_ =
+        compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+
+    int64_t d_from, d_to;
+    if(Din == Dout)
+    {
+        d_from = d;
+        d_to = d + 1;
+    }
+    else
+    {
+        compute_linear_back_index_from_to(
+            d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to);
+    }
+    int64_t h_from, h_to;
+    if(Hin == Hout)
+    {
+        h_from = h;
+        h_to = h + 1;
+    }
+    else
+    {
+        compute_linear_back_index_from_to(
+            h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to);
+    }
+    int64_t w_from, w_to;
+    if(Win == Wout)
+    {
+        w_from = w;
+        w_to = w + 1;
+    }
+    else
+    {
+        compute_linear_back_index_from_to(
+            w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to);
+    }
+
+    FLOAT_ACCUM output = 0;
+    for(int64_t i = d_from; i < d_to; i++)
+    {
+        FLOAT_ACCUM d_lambda =
+            compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners);
+        if(d_lambda == 0.f)
+            continue;
+        for(int64_t j = h_from; j < h_to; j++)
+        {
+            FLOAT_ACCUM h_lambda =
+                compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners);
+            if(h_lambda == 0.f)
+                continue;
+            for(int64_t k = w_from; k < w_to; k++)
+            {
+                FLOAT_ACCUM w_lambda =
+                    compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners);
+                tensor_layout_t<5> output_layout(n, c, i, j, k);
+
+                output += CVT_FLOAT2ACCUM(
+                              output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                          d_lambda * h_lambda * w_lambda;
+            }
+        }
+    }
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(output);
+}
+
+extern "C" __global__ void InterpolateTrilinearBackward(OUTPUT_TYPE* __restrict__ input_grad,
+                                                        const INPUT_TYPE* __restrict__ output_grad,
+                                                        const tensor_view_t<5> input_grad_tv,
+                                                        const tensor_view_t<5> output_grad_tv,
+                                                        const size_t nelems,
+                                                        const float* scale_factors,
+                                                        const bool align_corners)
+{
+    interpolateTrilinearBackward(input_grad,
+                                 output_grad,
+                                 input_grad_tv,
+                                 output_grad_tv,
+                                 nelems,
+                                 scale_factors,
+                                 align_corners);
+}
+
+__device__ inline FLOAT_ACCUM
+compute_scales_value(FLOAT_ACCUM scale, int64_t input_size, int64_t output_size)
+{
+    return (scale == 0.f) ? (static_cast<FLOAT_ACCUM>(input_size) / output_size)
+                          : (1.0f / scale);
+}
+
+__device__ inline int64_t
+nearest_idx(int64_t output_index, int64_t input_size, int64_t output_size, FLOAT_ACCUM scales)
+{
+    if(output_size == input_size)
+    {
+        return output_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return output_index / 2;
+    }
+    else
+    {
+        FLOAT_ACCUM scale = compute_scales_value(scales, input_size, output_size);
+        return min(static_cast<int64_t>(output_index * scale), input_size);
+    }
+}
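+
+// Example: Win = 4 -> Wout = 6 with a zero scale factor gives scale = 4/6, so output
+// columns 0..5 read input columns {0, 0, 1, 2, 2, 3}. The output_size == 2 * input_size
+// branch keeps exact 2x upsampling free of floating-point rounding.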
+template <typename TI, typename TO>
+__device__ inline void interpolateNearestForward(const TI* __restrict__ input,
+                                                 TO* __restrict__ output,
+                                                 const tensor_view_t<5> input_tv,
+                                                 const tensor_view_t<5> output_tv,
+                                                 const size_t nelems,
+                                                 const float* scale_factors)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<5>(output_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t d = tensor_layout.layout[2];
+    int64_t h = tensor_layout.layout[3];
+    int64_t w = tensor_layout.layout[4];
+
+    int64_t Dout = output_tv.size[2];
+    int64_t Hout = output_tv.size[3];
+    int64_t Wout = output_tv.size[4];
+    int64_t Din = input_tv.size[2];
+    int64_t Hin = input_tv.size[3];
+    int64_t Win = input_tv.size[4];
+
+    FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]);
+
+    int64_t x = nearest_idx(d, Din, Dout, scale_factor_d);
+    int64_t y = nearest_idx(h, Hin, Hout, scale_factor_h);
+    int64_t z = nearest_idx(w, Win, Wout, scale_factor_w);
+
+    tensor_layout_t<5> input_layout(n, c, x, y, z);
+
+    output[output_tv.get_tensor_view_idx(tensor_layout)] =
+        input[input_tv.get_tensor_view_idx(input_layout)];
+}
+
+extern "C" __global__ void InterpolateNearestForward(const INPUT_TYPE* __restrict__ input,
+                                                     OUTPUT_TYPE* __restrict__ output,
+                                                     const tensor_view_t<5> input_tv,
+                                                     const tensor_view_t<5> output_tv,
+                                                     const size_t nelems,
+                                                     const float* scale_factors)
+{
+    interpolateNearestForward<INPUT_TYPE, OUTPUT_TYPE>(
+        input, output, input_tv, output_tv, nelems, scale_factors);
+}
+
+__device__ inline int64_t
+nearest_idx_back(int64_t input_index, int64_t input_size, int64_t output_size, FLOAT_ACCUM scales)
+{
+    if(output_size == input_size)
+    {
+        return input_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return input_index * 2;
+    }
+    else
+    {
+        FLOAT_ACCUM scale = compute_scales_value(scales, input_size, output_size);
+        return min(static_cast<int64_t>(ceil(input_index / scale)), output_size);
+    }
+}
+
+template <typename TI, typename TO>
+__device__ inline void interpolateNearestBackward(TO* __restrict__ input_grad,
+                                                  const TI* __restrict__ output_grad,
+                                                  const tensor_view_t<5> input_grad_tv,
+                                                  const tensor_view_t<5> output_grad_tv,
+                                                  const size_t nelems,
+                                                  const float* scale_factors)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t x = tensor_layout.layout[2];
+    int64_t y = tensor_layout.layout[3];
+    int64_t z = tensor_layout.layout[4];
+
+    int64_t Dout = output_grad_tv.size[2];
+    int64_t Hout = output_grad_tv.size[3];
+    int64_t Wout = output_grad_tv.size[4];
+    int64_t Din = input_grad_tv.size[2];
+    int64_t Hin = input_grad_tv.size[3];
+    int64_t Win = input_grad_tv.size[4];
+
+    FLOAT_ACCUM scale_factor_d = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[2]);
+
+    int64_t dstart = nearest_idx_back(x, Din, Dout, scale_factor_d);
+    int64_t dlimit = nearest_idx_back(x + 1, Din, Dout, scale_factor_d);
+    int64_t hstart = nearest_idx_back(y, Hin, Hout, scale_factor_h);
+    int64_t hlimit = nearest_idx_back(y + 1, Hin, Hout, scale_factor_h);
+    int64_t wstart = nearest_idx_back(z, Win, Wout, scale_factor_w);
+    int64_t wlimit = nearest_idx_back(z + 1, Win, Wout, scale_factor_w);
+
+    FLOAT_ACCUM grad = 0.f;
+    for(int64_t d = dstart; d < dlimit; d++)
+    {
+        for(int64_t h = hstart; h < hlimit; h++)
+        {
+            for(int64_t w = wstart; w < wlimit; w++)
+            {
+                tensor_layout_t<5> output_grad_layout(n, c, d, h, w);
+                grad += CVT_FLOAT2ACCUM(
+                    output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]);
+            }
+        }
+    }
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(grad);
+}
+
+extern "C" __global__ void InterpolateNearestBackward(OUTPUT_TYPE* __restrict__ input_grad,
+                                                      const INPUT_TYPE* __restrict__ output_grad,
+                                                      const tensor_view_t<5> input_grad_tv,
+                                                      const tensor_view_t<5> output_grad_tv,
+                                                      const size_t nelems,
+                                                      const float* scale_factors)
+{
+    interpolateNearestBackward<INPUT_TYPE, OUTPUT_TYPE>(
+        input_grad, output_grad, input_grad_tv, output_grad_tv, nelems, scale_factors);
+}
+
+__device__ inline FLOAT_ACCUM
+bicubic_idx(int64_t output_index, int64_t output_size, FLOAT_ACCUM scale_factor, bool align_corners)
+{
+    if(output_size == 1)
+    {
+        if(align_corners)
+        {
+            return 0;
+        }
+        return -0.5f;
+    }
+    return get_src_index(output_index, scale_factor, align_corners);
+}
+
+__device__ inline FLOAT_ACCUM cubic_convolution1(FLOAT_ACCUM x, FLOAT_ACCUM A)
+{
+    return ((A + 2) * x - (A + 3)) * x * x + 1;
+}
+
+__device__ inline FLOAT_ACCUM cubic_convolution2(FLOAT_ACCUM x, FLOAT_ACCUM A)
+{
+    return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+__device__ inline void get_cubic_upsampling_coefficients(FLOAT_ACCUM coeffs[4], FLOAT_ACCUM t)
+{
+    FLOAT_ACCUM A = -0.75f;
+
+    FLOAT_ACCUM x1 = t;
+    coeffs[0] = cubic_convolution2(x1 + 1.0f, A);
+    coeffs[1] = cubic_convolution1(x1, A);
+
+    FLOAT_ACCUM x2 = 1.0f - t;
+    coeffs[2] = cubic_convolution1(x2, A);
+    coeffs[3] = cubic_convolution2(x2 + 1.0f, A);
+}
+
+__device__ inline FLOAT_ACCUM
+cubic_interp1d(FLOAT_ACCUM x0, FLOAT_ACCUM x1, FLOAT_ACCUM x2, FLOAT_ACCUM x3, FLOAT_ACCUM t)
+{
+    FLOAT_ACCUM coeffs[4];
+    get_cubic_upsampling_coefficients(coeffs, t);
+
+    return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
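+
+// These two polynomials implement the Keys cubic convolution kernel with A = -0.75 (the
+// PyTorch/OpenCV choice): cubic_convolution1 covers |x| <= 1 and cubic_convolution2
+// covers 1 < |x| < 2. For any t the four weights sum to 1; t = 0 yields {0, 1, 0, 0}
+// (the sample is reproduced exactly) and t = 0.5 yields
+// {-0.09375, 0.59375, 0.59375, -0.09375}.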
+__device__ inline int64_t bound(int64_t p, int64_t max_size)
+{
+    return max(min(p, max_size - 1), 0l);
+}
+
+template <typename TI, typename TO>
+__device__ inline void interpolateBicubicForward(const TI* __restrict__ input,
+                                                 TO* __restrict__ output,
+                                                 const tensor_view_t<4> input_tv,
+                                                 const tensor_view_t<4> output_tv,
+                                                 const size_t nelems,
+                                                 const float* scale_factors,
+                                                 const bool align_corners)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<4>(output_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t h = tensor_layout.layout[2];
+    int64_t w = tensor_layout.layout[3];
+
+    int64_t Hin = input_tv.size[2];
+    int64_t Win = input_tv.size[3];
+    int64_t Hout = output_tv.size[2];
+    int64_t Wout = output_tv.size[3];
+    if(Hin == Hout && Win == Wout)
+    {
+        output[output_tv.get_tensor_view_idx(tensor_layout)] =
+            input[input_tv.get_tensor_view_idx(tensor_layout)];
+        return;
+    }
+
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h_ =
+        compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+    FLOAT_ACCUM real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+    int64_t in_y = static_cast<int64_t>(floor(real_y));
+    FLOAT_ACCUM t_y = real_y - static_cast<FLOAT_ACCUM>(in_y);
+
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w_ =
+        compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+    FLOAT_ACCUM real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+    int64_t in_x = static_cast<int64_t>(floor(real_x));
+    FLOAT_ACCUM t_x = real_x - static_cast<FLOAT_ACCUM>(in_x);
+
+    FLOAT_ACCUM coefficients[4];
+#pragma unroll
+    for(int k = 0; k < 4; k++)
+    {
+        int64_t y = bound(in_y - 1 + k, Hin);
+        tensor_layout_t<4> input_layout0(n, c, y, bound(in_x - 1, Win));
+        tensor_layout_t<4> input_layout1(n, c, y, bound(in_x, Win));
+        tensor_layout_t<4> input_layout2(n, c, y, bound(in_x + 1, Win));
+        tensor_layout_t<4> input_layout3(n, c, y, bound(in_x + 2, Win));
+
+        coefficients[k] =
+            cubic_interp1d(CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout0)]),
+                           CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout1)]),
+                           CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout2)]),
+                           CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(input_layout3)]),
+                           t_x);
+    }
+
+    output[output_tv.get_tensor_view_idx(tensor_layout)] = CVT_ACCUM2FLOAT(
+        cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y));
+}
+
+extern "C" __global__ void InterpolateBicubicForward(const INPUT_TYPE* __restrict__ input,
+                                                     OUTPUT_TYPE* __restrict__ output,
+                                                     const tensor_view_t<4> input_tv,
+                                                     const tensor_view_t<4> output_tv,
+                                                     const size_t nelems,
+                                                     const float* scale_factors,
+                                                     const bool align_corners)
+{
+    interpolateBicubicForward<INPUT_TYPE, OUTPUT_TYPE>(
+        input, output, input_tv, output_tv, nelems, scale_factors, align_corners);
+}
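+
+// bound() clamps out-of-range taps, so the 4x4 bicubic window degrades to border
+// replication at the edges, matching the usual border convention for bicubic resampling.
+// As in the other forward kernels, one thread produces exactly one output element.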
+template <typename TI, typename TD>
+__device__ inline void interpolateBicubicBackward(TD* __restrict__ workspace,
+                                                  const TI* __restrict__ output_grad,
+                                                  const tensor_view_t<4> input_grad_tv,
+                                                  const tensor_view_t<4> output_grad_tv,
+                                                  const size_t nelems,
+                                                  const float* scale_factors,
+                                                  const bool align_corners)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<4>(output_grad_tv, gid);
+    int64_t n = tensor_layout.layout[0];
+    int64_t c = tensor_layout.layout[1];
+    int64_t h = tensor_layout.layout[2];
+    int64_t w = tensor_layout.layout[3];
+
+    int64_t Hin = input_grad_tv.size[2];
+    int64_t Hout = output_grad_tv.size[2];
+    int64_t Win = input_grad_tv.size[3];
+    int64_t Wout = output_grad_tv.size[3];
+
+    if(Hin == Hout && Win == Wout)
+    {
+        workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+            CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]);
+        return;
+    }
+
+    FLOAT_ACCUM scale_factor_h = CVT_FP32_2ACCUM(scale_factors[0]);
+    FLOAT_ACCUM scale_factor_h_ =
+        compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+    FLOAT_ACCUM real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+    int64_t in_y = static_cast<int64_t>(floor(real_y));
+    FLOAT_ACCUM t_y = real_y - static_cast<FLOAT_ACCUM>(in_y);
+
+    FLOAT_ACCUM scale_factor_w = CVT_FP32_2ACCUM(scale_factors[1]);
+    FLOAT_ACCUM scale_factor_w_ =
+        compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+    FLOAT_ACCUM real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+    int64_t in_x = static_cast<int64_t>(floor(real_x));
+    FLOAT_ACCUM t_x = real_x - static_cast<FLOAT_ACCUM>(in_x);
+
+    FLOAT_ACCUM y_coeffs[4];
+    FLOAT_ACCUM x_coeffs[4];
+    get_cubic_upsampling_coefficients(y_coeffs, t_y);
+    get_cubic_upsampling_coefficients(x_coeffs, t_x);
+
+    FLOAT_ACCUM out_value =
+        CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]);
+#pragma unroll
+    for(int i = 0; i < 4; i++)
+    {
+        int64_t input_h = bound(in_y - 1 + i, Hin);
+#pragma unroll
+        for(int j = 0; j < 4; j++)
+        {
+            int64_t input_w = bound(in_x - 1 + j, Win);
+            tensor_layout_t<4> in_grad_layout(n, c, input_h, input_w);
+
+            atomicAdd(workspace + input_grad_tv.get_tensor_view_idx(in_grad_layout),
+                      out_value * y_coeffs[i] * x_coeffs[j]);
+        }
+    }
+}
+
+template <typename TD, typename TO>
+__device__ inline void interpolateBicubicBackward_paste(TO* __restrict__ input_grad,
+                                                        const TD* __restrict__ workspace,
+                                                        const tensor_view_t<4> input_grad_tv,
+                                                        const size_t nelems)
+{
+    int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if(gid >= nelems)
+        return;
+
+    auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid);
+    input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+        CVT_ACCUM2FLOAT(workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)]);
+}
+
+extern "C" __global__ void InterpolateBicubicBackward(DTYPE* __restrict__ workspace,
+                                                      const INPUT_TYPE* __restrict__ output_grad,
+                                                      const tensor_view_t<4> input_grad_tv,
+                                                      const tensor_view_t<4> output_grad_tv,
+                                                      const size_t nelems,
+                                                      const float* scale_factors,
+                                                      const bool align_corners)
+{
+    interpolateBicubicBackward(workspace,
+                               output_grad,
+                               input_grad_tv,
+                               output_grad_tv,
+                               nelems,
+                               scale_factors,
+                               align_corners);
+}
+
+extern "C" __global__ void InterpolateBicubicBackward_paste(OUTPUT_TYPE* __restrict__ input_grad,
+                                                            const DTYPE* __restrict__ workspace,
+                                                            const tensor_view_t<4> input_grad_tv,
+                                                            const size_t nelems)
+{
+    interpolateBicubicBackward_paste<DTYPE, OUTPUT_TYPE>(
+        input_grad, workspace, input_grad_tv, nelems);
+}
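+
+// Bicubic backward is the one two-pass path: each output-gradient element scatters its
+// sixteen weighted taps into a float workspace with atomicAdd (avoiding fp16/bf16
+// atomics), and the _paste kernel then converts the accumulated workspace into the
+// destination type. The solver skips the paste pass for miopenFloat, where the kernel
+// can write input_grad directly.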
diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp
new file mode 100644
index 0000000000..9d0a3d28e9
--- /dev/null
+++ b/src/kernels/tensor_view.hpp
@@ -0,0 +1,118 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef GUARD_TENSOR_VIEW_H
+#define GUARD_TENSOR_VIEW_H
+
+template <int N>
+struct tensor_layout_t;
+
+template <int N>
+struct tensor_view_t
+{
+    // Get index in tensor view at tensor layout
+    constexpr uint64_t get_tensor_view_idx(const tensor_layout_t<N>& tensor_layout) const
+    {
+        static_assert(N > 0);
+        uint64_t idx = 0;
+        for(auto i = 0; i < N; ++i)
+        {
+            idx += stride[i] * tensor_layout.layout[i];
+        }
+        return idx;
+    }
+    uint64_t stride[N];
+    uint64_t size[N];
+};
+
+template <int N>
+struct tensor_layout_t
+{
+    // Make tensor layout at index using tensor view
+    constexpr tensor_layout_t(const tensor_view_t<N>& tensor_view, uint64_t idx)
+    {
+        static_assert(N > 0);
+        uint64_t temp = idx;
+        if constexpr(N == 1)
+        {
+            layout[0] = idx;
+        }
+        else
+        {
+            for(auto i = N - 1; i > 1; --i)
+            {
+                layout[i] = temp % tensor_view.size[i];
+                temp = temp / tensor_view.size[i];
+            }
+            layout[1] = temp % tensor_view.size[1];
+            layout[0] = temp / tensor_view.size[1];
+        }
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w)
+    {
+        static_assert(N == 5);
+        layout[0] = n;
+        layout[1] = c;
+        layout[2] = d;
+        layout[3] = h;
+        layout[4] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w)
+    {
+        static_assert(N == 4);
+        layout[0] = n;
+        layout[1] = c;
+        layout[2] = h;
+        layout[3] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w)
+    {
+        static_assert(N == 3);
+        layout[0] = n;
+        layout[1] = h;
+        layout[2] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n, uint64_t w)
+    {
+        static_assert(N == 2);
+        layout[0] = n;
+        layout[1] = w;
+    }
+
+    constexpr tensor_layout_t(uint64_t n)
+    {
+        static_assert(N == 1);
+        layout[0] = n;
+    }
+
+    uint64_t layout[N];
+};
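+
+// Round-trip example: for a contiguous 4x3 view (size = {4, 3}, stride = {3, 1}),
+// tensor_layout_t<2>(view, 7) decomposes the flat index 7 into layout = {2, 1}, and
+// get_tensor_view_idx maps {2, 1} back to 2 * 3 + 1 * 1 = 7. Non-contiguous strides only
+// change the second mapping, which is how the kernels address permuted layouts.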
+
+#endif // GUARD_TENSOR_VIEW_H
diff --git a/src/solver.cpp b/src/solver.cpp
index f45f3058a6..9bc48f0423 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -649,6 +650,43 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId());
 
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateNearestForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateLinearForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBilinearForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBicubicForward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateNearestBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateLinearBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBilinearBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateTrilinearBackward{}.SolverDbId());
+    Register(registry,
+             ++id,
+             Primitive::Interpolate,
+             interpolate::InterpolateBicubicBackward{}.SolverDbId());
+
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/interpolate/bwd_bicubic_interpolate.cpp b/src/solver/interpolate/bwd_bicubic_interpolate.cpp
new file mode 100644
index 0000000000..3a534180a0
--- /dev/null
+++ b/src/solver/interpolate/bwd_bicubic_interpolate.cpp
@@ -0,0 +1,208 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_BICUBIC 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBicubicBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    float scale_h =
+        static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+    float scale_w =
+        static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+
+    if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(scale_h + scale_w < 8 && scale_h + scale_w > 1.4)
+            return true;
+        else
+            return false;
+    }
+    else
+    {
+        if(output_grad_desc.GetLengths()[2] + output_grad_desc.GetLengths()[3] <= 256 &&
+           (input_grad_desc.GetElementSize() >= 10000))
+            return true;
+        else
+            return false;
+    }
+}
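+
+// "IsOverRocm*" predicates gate each solver to the shapes where this HIP implementation
+// is expected to beat the existing ROCm path; the dtype- and scale-dependent cutoffs read
+// like empirical tuning thresholds rather than correctness constraints, so out-of-range
+// problems simply fall through to another path instead of failing.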
"ushort" : output_dtype}, + {"DTYPE", "float"}, + }; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_BICUBIC}, + {N_total}, + "MIOpenInterpolate.cpp", + "InterpolateBicubicBackward", + build_params)); + + if(dtype != miopenFloat) + { + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_BICUBIC}, + {N_total_paste}, + "MIOpenInterpolate.cpp", + "InterpolateBicubicBackward_paste", + build_params)); + } + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + auto dtype = deref(params.inputGradDesc).GetType(); + size_t nelems = params.outputGradDesc->GetElementSize(); + + int kernelCnt = 0; + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + + float elapsed = 0.0f; + HipEventPtr start; + HipEventPtr stop; + + const bool profiling = handle_.IsProfilingEnabled(); + if(kernels.size() > 1 && profiling) + { + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + handle_.EnableProfiling(false); + hipEventRecord(start.get(), handle_.GetStream()); + } + + if(dtype == miopenFloat) + { + kernel(params.input_grad, + params.output_grad, + input_grad_tv, + output_grad_tv, + nelems, + params.scale_factors, + params.align_corners); + } + else + { + kernel(params.workspace, + params.output_grad, + input_grad_tv, + output_grad_tv, + nelems, + params.scale_factors, + params.align_corners); + + nelems = params.inputGradDesc->GetElementSize(); + kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input_grad, params.workspace, input_grad_tv, nelems); + } + + if(kernels.size() > 1 && profiling) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + handle_.EnableProfiling(true); + }; + }; + }; + + return result; +} + +std::size_t InterpolateBicubicBackward::GetWorkspaceSize( + const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const +{ + return problem.GetInputGradDesc().GetElementSize() * sizeof(float); +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/interpolate/bwd_bilinear_interpolate.cpp b/src/solver/interpolate/bwd_bilinear_interpolate.cpp new file mode 100644 index 0000000000..36050a926f --- /dev/null +++ b/src/solver/interpolate/bwd_bilinear_interpolate.cpp @@ -0,0 +1,142 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+
+std::size_t InterpolateBicubicBackward::GetWorkspaceSize(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    return problem.GetInputGradDesc().GetElementSize() * sizeof(float);
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_bilinear_interpolate.cpp b/src/solver/interpolate/bwd_bilinear_interpolate.cpp
new file mode 100644
index 0000000000..36050a926f
--- /dev/null
+++ b/src/solver/interpolate/bwd_bilinear_interpolate.cpp
@@ -0,0 +1,142 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_BILINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBilinearBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    float scale_h =
+        static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+    float scale_w =
+        static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+
+    if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(scale_h + scale_w < 2)
+            return false;
+    }
+    else if(dtype == miopenFloat)
+    {
+        if(scale_h + scale_w < 14)
+            return false;
+    }
+
+    return true;
+}
+
+bool InterpolateBilinearBackward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_BILINEAR)
+        return false;
+    if(!IsOverRocmBilinearBwd(problem))
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateBilinearBackward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType());
+
+    {
+        auto dtype = problem.GetInputGradDesc().GetType();
+        size_t N_total = problem.GetInputGradDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_BILINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateBilinearBackward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::BwdInvokeParams>();
+
+            auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc));
+            auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc));
+            size_t nelems = params.inputGradDesc->GetElementSize();
+
+            kernel(params.input_grad,
+                   params.output_grad,
+                   input_grad_tv,
+                   output_grad_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_linear_interpolate.cpp b/src/solver/interpolate/bwd_linear_interpolate.cpp
new file mode 100644
index 0000000000..b105417853
--- /dev/null
+++ b/src/solver/interpolate/bwd_linear_interpolate.cpp
@@ -0,0 +1,134 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_LINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmLinearBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    if(dtype == miopenFloat)
+    {
+        if(input_grad_desc.GetElementSize() < 4000)
+            return false;
+    }
+    else if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(input_grad_desc.GetElementSize() < 960)
+            return false;
+    }
+
+    return true;
+}
+
+bool InterpolateLinearBackward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_LINEAR)
+        return false;
+    if(!IsOverRocmLinearBwd(problem))
+        return false;
+    return true;
+}
+
+ConvSolution InterpolateLinearBackward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType());
+
+    {
+        auto dtype = problem.GetInputGradDesc().GetType();
+        size_t N_total = problem.GetInputGradDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_LINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateLinearBackward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::BwdInvokeParams>();
+
+            auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.inputGradDesc));
+            auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.outputGradDesc));
+            size_t nelems = params.inputGradDesc->GetElementSize();
+
+            kernel(params.input_grad,
+                   params.output_grad,
+                   input_grad_tv,
+                   output_grad_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_nearest_interpolate.cpp b/src/solver/interpolate/bwd_nearest_interpolate.cpp
new file mode 100644
index 0000000000..d66fc78829
--- /dev/null
+++ b/src/solver/interpolate/bwd_nearest_interpolate.cpp
@@ -0,0 +1,150 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/activ.hpp"
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_NEAREST 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmNearestBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    if(input_grad_desc.GetLengths().size() == 3)
+    {
+        if(output_grad_desc.GetElementSize() < 8000 || input_grad_desc.GetLengths()[0] < 10)
+            return false;
+    }
+    else if(input_grad_desc.GetLengths().size() == 4)
+    {
+        float scale_h =
+            static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+        float scale_w =
+            static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+
+        if(input_grad_desc.GetLengths()[0] < 10 || (scale_h + scale_w <= 4))
+            return false;
+    }
+    else if(input_grad_desc.GetLengths().size() == 5)
+    {
+        float scale_h =
+            static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+        float scale_w =
+            static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+        float scale_d =
+            static_cast<float>(output_grad_desc.GetLengths()[4]) / input_grad_desc.GetLengths()[4];
+
+        if(scale_h + scale_w + scale_d < 6)
+            return false;
+    }
+
+    return true;
+}
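+
+// A single 5D kernel serves the 3D/4D/5D nearest cases; get_inner_expanded_tv<5> on the
+// host side presumably pads the missing spatial dimensions with size-1 axes, so this
+// heuristic is the only place that branches on tensor rank.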
+
+bool InterpolateNearestBackward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_NEAREST)
+        return false;
+    if(!IsOverRocmNearestBwd(problem))
+        return false;
+    return true;
+}
+
+ConvSolution InterpolateNearestBackward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::BwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType());
+
+    {
+        auto dtype = problem.GetInputGradDesc().GetType();
+        size_t N_total = problem.GetInputGradDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_NEAREST},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateNearestBackward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::BwdInvokeParams>();
+
+            auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc));
+            auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc));
+            size_t nelems = params.inputGradDesc->GetElementSize();
+
+            kernel(params.input_grad,
+                   params.output_grad,
+                   input_grad_tv,
+                   output_grad_tv,
+                   nelems,
+                   params.scale_factors);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/bwd_trilinear_interpolate.cpp b/src/solver/interpolate/bwd_trilinear_interpolate.cpp
new file mode 100644
index 0000000000..b153070dd5
--- /dev/null
+++ b/src/solver/interpolate/bwd_trilinear_interpolate.cpp
@@ -0,0 +1,145 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/activ.hpp"
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_BWD_TRILINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmTrilinearBwd(const miopen::interpolate::BwdProblemDescription& problem)
+{
+    TensorDescriptor input_grad_desc = problem.GetInputGradDesc();
+    TensorDescriptor output_grad_desc = problem.GetOutputGradDesc();
+    auto dtype = input_grad_desc.GetType();
+
+    float scale_h =
+        static_cast<float>(output_grad_desc.GetLengths()[2]) / input_grad_desc.GetLengths()[2];
+    float scale_w =
+        static_cast<float>(output_grad_desc.GetLengths()[3]) / input_grad_desc.GetLengths()[3];
+    float scale_d =
+        static_cast<float>(output_grad_desc.GetLengths()[4]) / input_grad_desc.GetLengths()[4];
+
+    if(dtype == miopenHalf || dtype == miopenBFloat16)
+    {
+        if(scale_h + scale_w + scale_d < 3.1f)
+            return false;
+    }
+    else if(dtype == miopenFloat)
+    {
+        if(scale_h + scale_w + scale_d <= 6.0f)
+            return false;
+    }
+
+    return true;
+}
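+
+// Naming caveat: for a 5D NCDHW tensor, GetLengths()[2] is depth, so the ratios labelled
+// scale_h / scale_w / scale_d above are really the D / H / W ratios respectively. Only
+// their sum feeds the threshold, so behaviour is unaffected, but the names should not be
+// read literally.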
"ushort" : output_dtype}, + {"DTYPE", "float"}, + }; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_BWD_TRILINEAR}, + {N_total}, + "MIOpenInterpolate.cpp", + "InterpolateTrilinearBackward", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); + size_t nelems = params.inputGradDesc->GetElementSize(); + + kernel(params.input_grad, + params.output_grad, + input_grad_tv, + output_grad_tv, + nelems, + params.scale_factors, + params.align_corners); + }; + }; + + return result; +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/interpolate/fwd_bicubic_interpolate.cpp b/src/solver/interpolate/fwd_bicubic_interpolate.cpp new file mode 100644 index 0000000000..22d8d594c1 --- /dev/null +++ b/src/solver/interpolate/fwd_bicubic_interpolate.cpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
diff --git a/src/solver/interpolate/fwd_bicubic_interpolate.cpp b/src/solver/interpolate/fwd_bicubic_interpolate.cpp
new file mode 100644
index 0000000000..22d8d594c1
--- /dev/null
+++ b/src/solver/interpolate/fwd_bicubic_interpolate.cpp
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_BICUBIC 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBicubicFwd(const miopen::interpolate::FwdProblemDescription& problem)
+{
+    TensorDescriptor output_desc = problem.GetOutputDesc();
+    TensorDescriptor input_desc = problem.GetInputDesc();
+
+    float scale_h = static_cast<float>(output_desc.GetLengths()[2]) / input_desc.GetLengths()[2];
+    float scale_w = static_cast<float>(output_desc.GetLengths()[3]) / input_desc.GetLengths()[3];
+
+    if((output_desc.GetLengths()[2] + output_desc.GetLengths()[3] > 256) &&
+       (scale_h + scale_w >= 2))
+        return false;
+
+    return true;
+}
+
+bool InterpolateBicubicForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_BICUBIC)
+        return false;
+    if(!IsOverRocmBicubicFwd(problem))
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateBicubicForward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType());
+
+    {
+        auto dtype = problem.GetOutputDesc().GetType();
+        size_t N_total = problem.GetOutputDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_BICUBIC},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateBicubicForward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::FwdInvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc));
+            size_t nelems = params.outputDesc->GetElementSize();
+
+            kernel(params.input,
+                   params.output,
+                   input_tv,
+                   output_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
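+
+// All forward solvers size their grid from the *output* descriptor (N_total here, nelems
+// in the invoker): one thread per output element, gathering from the input, which is why
+// none of the forward paths needs a workspace or atomics.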
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/fwd_bilinear_interpolate.cpp b/src/solver/interpolate/fwd_bilinear_interpolate.cpp
new file mode 100644
index 0000000000..dd93c3d9b2
--- /dev/null
+++ b/src/solver/interpolate/fwd_bilinear_interpolate.cpp
@@ -0,0 +1,128 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_BILINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmBilinearFwd(const miopen::interpolate::FwdProblemDescription& problem)
+{
+    TensorDescriptor output_desc = problem.GetOutputDesc();
+
+    if(output_desc.GetLengths()[2] + output_desc.GetLengths()[3] > 256)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+bool InterpolateBilinearForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_BILINEAR)
+        return false;
+    if(!IsOverRocmBilinearFwd(problem))
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateBilinearForward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType());
+
+    {
+        auto dtype = problem.GetOutputDesc().GetType();
+        size_t N_total = problem.GetOutputDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_BILINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateBilinearForward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::FwdInvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc));
+            size_t nelems = params.outputDesc->GetElementSize();
+
+            kernel(params.input,
+                   params.output,
+                   input_tv,
+                   output_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
+
+} // namespace interpolate
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/interpolate/fwd_linear_interpolate.cpp b/src/solver/interpolate/fwd_linear_interpolate.cpp
new file mode 100644
index 0000000000..2df80d058c
--- /dev/null
+++ b/src/solver/interpolate/fwd_linear_interpolate.cpp
@@ -0,0 +1,114 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_LINEAR 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool InterpolateLinearForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_LINEAR)
+        return false;
+
+    return true;
+}
+
+ConvSolution InterpolateLinearForward::GetSolution(
+    const ExecutionContext& context,
+    const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result = ConvSolution{miopenStatusSuccess};
+    auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType());
+
+    {
+        auto dtype = problem.GetOutputDesc().GetType();
+        size_t N_total = problem.GetOutputDesc().GetElementSize();
+
+        auto kernel = KernelInfo{};
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype},
+            {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype},
+            {"DTYPE", "float"},
+        };
+
+        result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_LINEAR},
+                                                             {N_total},
+                                                             "MIOpenInterpolate.cpp",
+                                                             "InterpolateLinearForward",
+                                                             build_params));
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::interpolate::FwdInvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc));
+            size_t nelems = params.outputDesc->GetElementSize();
+
+            kernel(params.input,
+                   params.output,
+                   input_tv,
+                   output_tv,
+                   nelems,
+                   params.scale_factors,
+                   params.align_corners);
+        };
+    };
+
+    return result;
+}
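+
+// The 1D (linear) path uses rank-3 views: tensor_view_t<3> spans (N, C, W) here, so the
+// same indexing machinery as the 4D/5D kernels applies with W as the only spatial axis.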
diff --git a/src/solver/interpolate/fwd_nearest_interpolate.cpp b/src/solver/interpolate/fwd_nearest_interpolate.cpp
new file mode 100644
index 0000000000..95250ef03b
--- /dev/null
+++ b/src/solver/interpolate/fwd_nearest_interpolate.cpp
@@ -0,0 +1,127 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include <miopen/interpolate/solvers.hpp>
+#include <miopen/interpolate/invoke_params.hpp>
+
+#include <miopen/interpolate.hpp>
+#include <miopen/datatype.hpp>
+#include <miopen/kernel_build_params.hpp>
+#include <miopen/target_properties.hpp>
+
+#define LOCAL_SIZE_FWD_NEAREST 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace interpolate {
+
+bool IsOverRocmNearestFwd(const miopen::interpolate::FwdProblemDescription& problem)
+{
+    TensorDescriptor input_desc = problem.GetInputDesc();
+    if(input_desc.GetLengths().size() == 3)
+    {
+        size_t nelems = problem.GetInputDesc().GetElementSize();
+        if(nelems < 4096)
+            return false;
+    }
+    else if(input_desc.GetLengths().size() == 4 || input_desc.GetLengths().size() == 5)
+    {
+        return false;
+    }
+
+    return true;
+}
+
+bool InterpolateNearestForward::IsApplicable(
+    const ExecutionContext&, const miopen::interpolate::FwdProblemDescription& problem) const
+{
+    if(problem.GetMode() != miopenInterpolateMode_t::MIOPEN_INTERPOLATE_MODE_NEAREST)
+        return false;
+    if(!IsOverRocmNearestFwd(problem))
+        return false;
+
+    return true;
+}
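+
+// Worked example of the applicability gate above:
+//   {16, 256, 1}      -> 3-D input with 16 * 256 * 1 = 4096 elements; 4096 is
+//                        not < 4096, so this solver is used.
+//   {16, 256, 20, 20} -> 4-D input; rejected here regardless of size and
+//                        handled by another path.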
"ushort" : output_dtype}, + {"DTYPE", "float"}, + }; + + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_FWD_NEAREST}, + {N_total}, + "MIOpenInterpolate.cpp", + "InterpolateNearestForward", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); + size_t nelems = params.outputDesc->GetElementSize(); + + kernel(params.input, params.output, input_tv, output_tv, nelems, params.scale_factors); + }; + }; + + return result; +} + +} // namespace interpolate + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_interpolate.hpp b/test/cpu_interpolate.hpp new file mode 100644 index 0000000000..98b4a53ddb --- /dev/null +++ b/test/cpu_interpolate.hpp @@ -0,0 +1,984 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_INTERPOLATE_HPP +#define GUARD_CPU_INTERPOLATE_HPP + +#include "miopen/miopen.h" +#include "tensor_holder.hpp" +#include + +inline float compute_linear_scale_factor(float scale_factor, + int64_t input_size, + int64_t output_size, + bool align_corners) +{ + if(align_corners) + { + if(input_size == 1) + { + return static_cast(output_size); + } + return static_cast(output_size - 1) / (input_size - 1); + } + else if(scale_factor == 0) + { + return static_cast(output_size) / input_size; + } + else + { + return static_cast(scale_factor); + } +} + +inline float get_src_index(int64_t dest_index, float scale_factor, bool align_corners) +{ + if(align_corners) + { + return dest_index / scale_factor; + } + else + { + return (dest_index + 0.5f) / scale_factor - 0.5f; + } +} + +inline int64_t linear_back_index(int64_t src, float scale_factor, bool align_corners) +{ + return static_cast(std::ceil(get_src_index(src, 1.f / scale_factor, align_corners))); +} + +inline void compute_linear_back_index_from_to(int64_t src, + int64_t input_isze, + int64_t output_size, + float scale_factor, + bool align_corners, + int64_t* from, + int64_t* to) +{ + if(src - 1 < 1) + { + *from = 0; + } + else + { + *from = linear_back_index(src - 1, scale_factor, align_corners); + } + if(src + 1 > input_isze) + { + *to = output_size; + } + else + { + *to = std::min(output_size, linear_back_index(src + 1, scale_factor, align_corners)); + } +} + +inline void compute_source_index_and_lambda(int64_t h, + float scale_factor, + int64_t Hin, + int64_t Hout, + bool align_corners, + int64_t* hin_index0, + int64_t* hin_index1, + float* lambda0, + float* lambda1) +{ + float hin_index_actual = static_cast( + std::max(static_cast(0.), get_src_index(h, scale_factor, align_corners))); + *hin_index0 = static_cast(hin_index_actual); + *hin_index1 = std::min(*hin_index0 + 1, Hin - 1); + *lambda1 = hin_index_actual - *hin_index0; + *lambda0 = 1.f - *lambda1; +} + +inline float get_back_lambda(int64_t src, int64_t src0, int64_t src1, float lambda0, float lambda1) +{ + if(src == src0) + { + if(src0 == src1) + { + return 1; // lambda0 + lambda1 = 1 + } + return lambda0; + } + if(src == src1) + { + return lambda1; + } + // This case can happen due to floating point mutiplification. 
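+
+// Numeric check of the mapping above (Hin = 2, Hout = 4, align_corners = false):
+//   scale = compute_linear_scale_factor(0, 2, 4, false) = 4 / 2 = 2
+//   h = 1: src = (1 + 0.5) / 2 - 0.5 = 0.25
+//          -> hin_index0 = 0, hin_index1 = 1, lambda1 = 0.25, lambda0 = 0.75
+// With align_corners = true the scale is (4 - 1) / (2 - 1) = 3, so h = 3 maps
+// exactly onto source index 1.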
+
+inline float get_back_lambda(int64_t src, int64_t src0, int64_t src1, float lambda0, float lambda1)
+{
+    if(src == src0)
+    {
+        if(src0 == src1)
+        {
+            return 1; // lambda0 + lambda1 = 1
+        }
+        return lambda0;
+    }
+    if(src == src1)
+    {
+        return lambda1;
+    }
+    // This case can happen due to floating-point multiplication: a product that
+    // is mathematically exact (e.g. 87) may be computed as 86.99999995, so
+    // neither index comparison above matches.
+    return 0;
+}
+
+inline float compute_back_lambda(
+    int64_t dest, int64_t src, float scale_factor, int64_t Hin, int64_t Hout, bool align_corners)
+{
+    if(Hin == Hout)
+    {
+        return 1;
+    }
+    int64_t index0;
+    int64_t index1;
+    float lambda0;
+    float lambda1;
+    compute_source_index_and_lambda(
+        dest, scale_factor, Hin, Hout, align_corners, &index0, &index1, &lambda0, &lambda1);
+    return get_back_lambda(src, index0, index1, lambda0, lambda1);
+}
+
+template <class T>
+void cpu_interpolate_linear_forward(const tensor<T> input,
+                                    tensor<T>& output,
+                                    const size_t nelems,
+                                    const tensor<float> scale_factors,
+                                    const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<3>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<3>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<3>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+
+        int64_t Hin  = input_tv.size[2];
+        int64_t Hout = output_tv.size[2];
+        if(Hin == Hout || Hout == 1)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        scale_factor_h = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        int64_t hin_index0;
+        int64_t hin_index1;
+        float lambda1;
+        float lambda0;
+        compute_source_index_and_lambda(h,
+                                        scale_factor_h,
+                                        Hin,
+                                        Hout,
+                                        align_corners,
+                                        &hin_index0,
+                                        &hin_index1,
+                                        &lambda0,
+                                        &lambda1);
+
+        tensor_layout_t<3> input_layout0(n, c, hin_index0);
+        tensor_layout_t<3> input_layout1(n, c, hin_index1);
+
+        float input0 = input[input_tv.get_tensor_view_idx(input_layout0)];
+        float input1 = input[input_tv.get_tensor_view_idx(input_layout1)];
+
+        output[output_tv.get_tensor_view_idx(tensor_layout)] =
+            static_cast<T>(input0 * lambda0 + input1 * lambda1);
+    }
+}
+
+template <class T>
+void cpu_interpolate_linear_backward(tensor<T>& input_grad,
+                                     const tensor<T> output_grad,
+                                     const size_t nelems,
+                                     const tensor<float> scale_factors,
+                                     const bool align_corners)
+{
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<3>(output_grad.desc);
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<3>(input_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<3>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+
+        int64_t Hin  = input_grad_tv.size[2];
+        int64_t Hout = output_grad_tv.size[2];
+
+        if(Hin == Hout)
+        {
+            input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+                output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor = compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        int64_t from, to;
+        compute_linear_back_index_from_to(h, Hin, Hout, scale_factor, align_corners, &from, &to);
+
+        float output = 0;
+        for(int64_t i = from; i < to; i++)
+        {
+            tensor_layout_t<3> output_layout(n, c, i);
+            output +=
+                static_cast<float>(output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                compute_back_lambda(i, h, scale_factor, Hin, Hout, align_corners);
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(output);
+    }
+}
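+
+// Note on the backward pass above: it is the transpose of the forward stencil.
+// For each input index h it gathers the output positions [from, to) whose
+// forward interpolation read from h, weighting each by the same lambda the
+// forward pass used (recomputed via compute_back_lambda).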
+
+template <class T>
+void cpu_interpolate_bilinear_forward(const tensor<T> input,
+                                      tensor<T>& output,
+                                      const size_t nelems,
+                                      const tensor<float> scale_factors,
+                                      const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        int64_t Hin  = input_tv.size[2];
+        int64_t Hout = output_tv.size[2];
+        int64_t Win  = input_tv.size[3];
+        int64_t Wout = output_tv.size[3];
+
+        if(Hin == Hout && Win == Wout)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        int64_t hin_index0 = h;
+        int64_t hin_index1 = h;
+        float hlambda0     = 1;
+        float hlambda1     = 0;
+        if(Hin != Hout && Hout != 1)
+        {
+            float scale_factor_h = scale_factors[0];
+            float scale_factor_h_ =
+                compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+            compute_source_index_and_lambda(h,
+                                            scale_factor_h_,
+                                            Hin,
+                                            Hout,
+                                            align_corners,
+                                            &hin_index0,
+                                            &hin_index1,
+                                            &hlambda0,
+                                            &hlambda1);
+        }
+
+        int64_t win_index0 = w;
+        int64_t win_index1 = w;
+        float wlambda0     = 1;
+        float wlambda1     = 0;
+        if(Win != Wout && Wout != 1)
+        {
+            float scale_factor_w = scale_factors[1];
+            float scale_factor_w_ =
+                compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+            compute_source_index_and_lambda(w,
+                                            scale_factor_w_,
+                                            Win,
+                                            Wout,
+                                            align_corners,
+                                            &win_index0,
+                                            &win_index1,
+                                            &wlambda0,
+                                            &wlambda1);
+        }
+
+        tensor_layout_t<4> input_layout00(n, c, hin_index0, win_index0);
+        tensor_layout_t<4> input_layout01(n, c, hin_index0, win_index1);
+        tensor_layout_t<4> input_layout10(n, c, hin_index1, win_index0);
+        tensor_layout_t<4> input_layout11(n, c, hin_index1, win_index1);
+
+        output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(
+            (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout00)]) * wlambda0 +
+             static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout01)]) * wlambda1) *
+                hlambda0 +
+            (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout10)]) * wlambda0 +
+             static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout11)]) * wlambda1) *
+                hlambda1);
+    }
+}
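+
+// The four bilinear weights above form a partition of unity:
+//   (wlambda0 + wlambda1) * (hlambda0 + hlambda1) = 1 * 1 = 1,
+// so a constant input reproduces itself exactly for any scale factor.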
+
+template <class T>
+void cpu_interpolate_bilinear_backward(tensor<T>& input_grad,
+                                       const tensor<T> output_grad,
+                                       const size_t nelems,
+                                       const tensor<float> scale_factors,
+                                       const bool align_corners)
+{
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output_grad.desc);
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        int64_t Hin  = input_grad_tv.size[2];
+        int64_t Hout = output_grad_tv.size[2];
+        int64_t Win  = input_grad_tv.size[3];
+        int64_t Wout = output_grad_tv.size[3];
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        float scale_factor_w = scale_factors[1];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+
+        int64_t h_from, h_to;
+        if(Hin == Hout)
+        {
+            h_from = h;
+            h_to   = h + 1;
+        }
+        else
+        {
+            compute_linear_back_index_from_to(
+                h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to);
+        }
+        int64_t w_from, w_to;
+        if(Win == Wout)
+        {
+            w_from = w;
+            w_to   = w + 1;
+        }
+        else
+        {
+            compute_linear_back_index_from_to(
+                w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to);
+        }
+
+        float output = 0;
+        for(int64_t i = h_from; i < h_to; i++)
+        {
+            float h_lambda = compute_back_lambda(i, h, scale_factor_h_, Hin, Hout, align_corners);
+            if(h_lambda == 0.)
+                continue;
+            for(int64_t j = w_from; j < w_to; j++)
+            {
+                float w_lambda =
+                    compute_back_lambda(j, w, scale_factor_w_, Win, Wout, align_corners);
+
+                tensor_layout_t<4> output_layout(n, c, i, j);
+                output += static_cast<float>(
+                              output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                          h_lambda * w_lambda;
+            }
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(output);
+    }
+}
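+
+// The h_lambda == 0 early-continue above prunes the inner w-loop for output
+// rows that cannot contribute; see get_back_lambda for when a zero weight
+// occurs.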
+
+template <class T>
+void cpu_interpolate_trilinear_forward(const tensor<T> input,
+                                       tensor<T>& output,
+                                       const size_t nelems,
+                                       const tensor<float> scale_factors,
+                                       const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t d = tensor_layout.layout[2];
+        int64_t h = tensor_layout.layout[3];
+        int64_t w = tensor_layout.layout[4];
+
+        int64_t Din  = input_tv.size[2];
+        int64_t Dout = output_tv.size[2];
+        int64_t Hin  = input_tv.size[3];
+        int64_t Hout = output_tv.size[3];
+        int64_t Win  = input_tv.size[4];
+        int64_t Wout = output_tv.size[4];
+
+        if(Hin == Hout && Win == Wout && Din == Dout)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        int64_t din_index0 = d;
+        int64_t din_index1 = d;
+        float dlambda0     = 1;
+        float dlambda1     = 0;
+        if(Din != Dout && Dout != 1)
+        {
+            float scale_factor_d = scale_factors[0];
+            float scale_factor_d_ =
+                compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners);
+            compute_source_index_and_lambda(d,
+                                            scale_factor_d_,
+                                            Din,
+                                            Dout,
+                                            align_corners,
+                                            &din_index0,
+                                            &din_index1,
+                                            &dlambda0,
+                                            &dlambda1);
+        }
+
+        int64_t hin_index0 = h;
+        int64_t hin_index1 = h;
+        float hlambda0     = 1;
+        float hlambda1     = 0;
+        if(Hin != Hout && Hout != 1)
+        {
+            float scale_factor_h = scale_factors[1];
+            float scale_factor_h_ =
+                compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+            compute_source_index_and_lambda(h,
+                                            scale_factor_h_,
+                                            Hin,
+                                            Hout,
+                                            align_corners,
+                                            &hin_index0,
+                                            &hin_index1,
+                                            &hlambda0,
+                                            &hlambda1);
+        }
+
+        int64_t win_index0 = w;
+        int64_t win_index1 = w;
+        float wlambda0     = 1;
+        float wlambda1     = 0;
+        if(Win != Wout && Wout != 1)
+        {
+            float scale_factor_w = scale_factors[2];
+            float scale_factor_w_ =
+                compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+            compute_source_index_and_lambda(w,
+                                            scale_factor_w_,
+                                            Win,
+                                            Wout,
+                                            align_corners,
+                                            &win_index0,
+                                            &win_index1,
+                                            &wlambda0,
+                                            &wlambda1);
+        }
+
+        tensor_layout_t<5> input_layout000(n, c, din_index0, hin_index0, win_index0);
+        tensor_layout_t<5> input_layout001(n, c, din_index0, hin_index0, win_index1);
+        tensor_layout_t<5> input_layout010(n, c, din_index0, hin_index1, win_index0);
+        tensor_layout_t<5> input_layout011(n, c, din_index0, hin_index1, win_index1);
+        tensor_layout_t<5> input_layout100(n, c, din_index1, hin_index0, win_index0);
+        tensor_layout_t<5> input_layout101(n, c, din_index1, hin_index0, win_index1);
+        tensor_layout_t<5> input_layout110(n, c, din_index1, hin_index1, win_index0);
+        tensor_layout_t<5> input_layout111(n, c, din_index1, hin_index1, win_index1);
+
+        // Nested lerp: along W, then H, then D.
+        output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(
+            ((static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout000)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout001)]) * wlambda1) *
+                 hlambda0 +
+             (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout010)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout011)]) * wlambda1) *
+                 hlambda1) *
+                dlambda0 +
+            ((static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout100)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout101)]) * wlambda1) *
+                 hlambda0 +
+             (static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout110)]) * wlambda0 +
+              static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout111)]) * wlambda1) *
+                 hlambda1) *
+                dlambda1);
+    }
+}
+
+template <class T>
+void cpu_interpolate_trilinear_backward(tensor<T>& input_grad,
+                                        const tensor<T> output_grad,
+                                        const size_t nelems,
+                                        const tensor<float> scale_factors,
+                                        const bool align_corners)
+{
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output_grad.desc);
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t d = tensor_layout.layout[2];
+        int64_t h = tensor_layout.layout[3];
+        int64_t w = tensor_layout.layout[4];
+
+        int64_t Din  = input_grad_tv.size[2];
+        int64_t Dout = output_grad_tv.size[2];
+        int64_t Hin  = input_grad_tv.size[3];
+        int64_t Hout = output_grad_tv.size[3];
+        int64_t Win  = input_grad_tv.size[4];
+        int64_t Wout = output_grad_tv.size[4];
+
+        float scale_factor_d = scale_factors[0];
+        float scale_factor_d_ =
+            compute_linear_scale_factor(scale_factor_d, Din, Dout, align_corners);
+
+        float scale_factor_h = scale_factors[1];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+
+        float scale_factor_w = scale_factors[2];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+
+        int64_t d_from, d_to, h_from, h_to, w_from, w_to;
+        compute_linear_back_index_from_to(
+            d, Din, Dout, scale_factor_d_, align_corners, &d_from, &d_to);
+        compute_linear_back_index_from_to(
+            h, Hin, Hout, scale_factor_h_, align_corners, &h_from, &h_to);
+        compute_linear_back_index_from_to(
+            w, Win, Wout, scale_factor_w_, align_corners, &w_from, &w_to);
+
+        float output = 0;
+        for(int64_t i = d_from; i < d_to; i++)
+        {
+            float d_lambda = compute_back_lambda(i, d, scale_factor_d_, Din, Dout, align_corners);
+            for(int64_t j = h_from; j < h_to; j++)
+            {
+                float h_lambda =
+                    compute_back_lambda(j, h, scale_factor_h_, Hin, Hout, align_corners);
+                for(int64_t k = w_from; k < w_to; k++)
+                {
+                    float w_lambda =
+                        compute_back_lambda(k, w, scale_factor_w_, Win, Wout, align_corners);
+
+                    tensor_layout_t<5> output_layout(n, c, i, j, k);
+                    output += static_cast<float>(
+                                  output_grad[output_grad_tv.get_tensor_view_idx(output_layout)]) *
+                              d_lambda * h_lambda * w_lambda;
+                }
+            }
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(output);
+    }
+}
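+
+// Nearest-neighbour helpers. Note the convention switch relative to the linear
+// helpers above: compute_scales_value returns the input/output step (1 / scale),
+// so nearest_idx multiplies an output index by it directly.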
+
+inline float compute_scales_value(float scale, int64_t input_size, int64_t output_size)
+{
+    return (scale == 0.f) ? (static_cast<float>(input_size) / output_size) : (1.0f / scale);
+}
+
+inline int64_t
+nearest_idx(int64_t output_index, int64_t input_size, int64_t output_size, float scales)
+{
+    if(output_size == input_size)
+    {
+        return output_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return output_index / 2;
+    }
+    else
+    {
+        float scale = compute_scales_value(scales, input_size, output_size);
+        return std::min(static_cast<int64_t>(output_index * scale), input_size);
+    }
+}
+
+template <class T>
+void cpu_nearest_forward(const tensor<T> input,
+                         tensor<T>& output,
+                         const size_t nelems,
+                         const tensor<float> scale_factors)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t d = tensor_layout.layout[2];
+        int64_t h = tensor_layout.layout[3];
+        int64_t w = tensor_layout.layout[4];
+
+        int64_t Dout = output_tv.size[2];
+        int64_t Hout = output_tv.size[3];
+        int64_t Wout = output_tv.size[4];
+        int64_t Din  = input_tv.size[2];
+        int64_t Hin  = input_tv.size[3];
+        int64_t Win  = input_tv.size[4];
+
+        int64_t x = nearest_idx(d, Din, Dout, scale_factors[0]);
+        int64_t y = nearest_idx(h, Hin, Hout, scale_factors[1]);
+        int64_t z = nearest_idx(w, Win, Wout, scale_factors[2]);
+
+        tensor_layout_t<5> input_layout(n, c, x, y, z);
+        output[output_tv.get_tensor_view_idx(tensor_layout)] =
+            input[input_tv.get_tensor_view_idx(input_layout)];
+    }
+}
+
+inline int64_t
+nearest_idx_back(int64_t input_index, int64_t input_size, int64_t output_size, float scales)
+{
+    if(output_size == input_size)
+    {
+        return input_index;
+    }
+    else if(output_size == 2 * input_size)
+    {
+        return input_index * 2;
+    }
+    else
+    {
+        float scale = compute_scales_value(scales, input_size, output_size);
+        return std::min(static_cast<int64_t>(std::ceil(input_index / scale)), output_size);
+    }
+}
+
+template <class T>
+void cpu_nearest_backward(tensor<T>& input_grad,
+                          const tensor<T> output_grad,
+                          const size_t nelems,
+                          const tensor<float> scale_factors)
+{
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<5>(input_grad.desc);
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<5>(output_grad.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<5>(input_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t x = tensor_layout.layout[2];
+        int64_t y = tensor_layout.layout[3];
+        int64_t z = tensor_layout.layout[4];
+
+        int64_t Dout = output_grad_tv.size[2];
+        int64_t Hout = output_grad_tv.size[3];
+        int64_t Wout = output_grad_tv.size[4];
+        int64_t Din  = input_grad_tv.size[2];
+        int64_t Hin  = input_grad_tv.size[3];
+        int64_t Win  = input_grad_tv.size[4];
+
+        float scale_factor_d = scale_factors[0];
+        float scale_factor_h = scale_factors[1];
+        float scale_factor_w = scale_factors[2];
+
+        int64_t dstart = nearest_idx_back(x, Din, Dout, scale_factor_d);
+        int64_t dlimit = nearest_idx_back(x + 1, Din, Dout, scale_factor_d);
+        int64_t hstart = nearest_idx_back(y, Hin, Hout, scale_factor_h);
+        int64_t hlimit = nearest_idx_back(y + 1, Hin, Hout, scale_factor_h);
+        int64_t wstart = nearest_idx_back(z, Win, Wout, scale_factor_w);
+        int64_t wlimit = nearest_idx_back(z + 1, Win, Wout, scale_factor_w);
+
+        float grad = 0.f;
+        for(int64_t d = dstart; d < dlimit; d++)
+        {
+            for(int64_t h = hstart; h < hlimit; h++)
+            {
+                for(int64_t w = wstart; w < wlimit; w++)
+                {
+                    tensor_layout_t<5> output_grad_layout(n, c, d, h, w);
+                    grad += static_cast<float>(
+                        output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]);
+                }
+            }
+        }
+        input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(grad);
+    }
+}
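+
+// Worked example for the gather ranges above (Din = 5, Dout = 10, scales = 0):
+//   input x = 2 accumulates output indices [4, 6), i.e. exactly the outputs
+//   that nearest_idx maps back to input index 2 in the forward pass.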
+
+inline float
+bicubic_idx(int64_t output_index, int64_t output_size, float scale_factor, bool align_corners)
+{
+    if(output_size == 1)
+    {
+        if(align_corners)
+        {
+            return 0;
+        }
+        return -0.5f;
+    }
+    return get_src_index(output_index, scale_factor, align_corners);
+}
+
+inline float cubic_convolution1(float x, float A) { return ((A + 2) * x - (A + 3)) * x * x + 1; }
+
+inline float cubic_convolution2(float x, float A)
+{
+    return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+inline void get_cubic_upsampling_coefficients(float coeffs[4], float t)
+{
+    float A = -0.75f;
+
+    float x1  = t;
+    coeffs[0] = cubic_convolution2(x1 + 1.0f, A);
+    coeffs[1] = cubic_convolution1(x1, A);
+
+    float x2  = 1.0f - t;
+    coeffs[2] = cubic_convolution1(x2, A);
+    coeffs[3] = cubic_convolution2(x2 + 1.0f, A);
+}
+
+inline float cubic_interp1d(float x0, float x1, float x2, float x3, float t)
+{
+    float coeffs[4];
+    get_cubic_upsampling_coefficients(coeffs, t);
+
+    return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
+
+inline int64_t bound(int64_t p, int64_t max_size)
+{
+    return std::max(std::min(p, max_size - 1), 0L);
+}
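+
+// The two polynomials above are the cubic convolution (Keys) kernel with
+// A = -0.75: cubic_convolution1 covers |x| <= 1 and cubic_convolution2 covers
+// 1 < |x| < 2. For any t the four weights sum to 1, e.g.:
+//   float c[4];
+//   get_cubic_upsampling_coefficients(c, 0.3f);
+//   // c[0] + c[1] + c[2] + c[3] == 1.0f (up to rounding)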
+
+template <class T>
+void cpu_bicubic_forward(const tensor<T> input,
+                         tensor<T>& output,
+                         const size_t nelems,
+                         const tensor<float> scale_factors,
+                         const bool align_corners)
+{
+    auto input_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input.desc);
+    auto output_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output.desc);
+
+    for(int64_t gid = 0; gid < nelems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(output_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        int64_t Hin  = input_tv.size[2];
+        int64_t Win  = input_tv.size[3];
+        int64_t Hout = output_tv.size[2];
+        int64_t Wout = output_tv.size[3];
+        if(Hin == Hout && Win == Wout)
+        {
+            output[output_tv.get_tensor_view_idx(tensor_layout)] =
+                input[input_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+        float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+        int64_t in_y = static_cast<int64_t>(std::floor(real_y));
+        float t_y    = real_y - in_y;
+
+        float scale_factor_w = scale_factors[1];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+        float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+        int64_t in_x = static_cast<int64_t>(std::floor(real_x));
+        float t_x    = real_x - in_x;
+
+        float coefficients[4];
+#pragma unroll
+        for(int k = 0; k < 4; k++)
+        {
+            int64_t y = bound(in_y - 1 + k, Hin);
+            tensor_layout_t<4> input_layout0(n, c, y, bound(in_x - 1, Win));
+            tensor_layout_t<4> input_layout1(n, c, y, bound(in_x, Win));
+            tensor_layout_t<4> input_layout2(n, c, y, bound(in_x + 1, Win));
+            tensor_layout_t<4> input_layout3(n, c, y, bound(in_x + 2, Win));
+
+            coefficients[k] = cubic_interp1d(
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout0)]),
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout1)]),
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout2)]),
+                static_cast<float>(input[input_tv.get_tensor_view_idx(input_layout3)]),
+                t_x);
+        }
+        output[output_tv.get_tensor_view_idx(tensor_layout)] = static_cast<T>(cubic_interp1d(
+            coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y));
+    }
+}
+
+template <class T>
+void cpu_bicubic_backward(tensor<T>& input_grad,
+                          const tensor<T> output_grad,
+                          const size_t nelems,
+                          const tensor<float> scale_factors,
+                          const bool align_corners)
+{
+    auto input_grad_tv  = miopen::solver::interpolate::get_inner_expanded_tv<4>(input_grad.desc);
+    auto output_grad_tv = miopen::solver::interpolate::get_inner_expanded_tv<4>(output_grad.desc);
+
+    std::vector<float> workspace;
+    workspace.resize(nelems, 0.f);
+
+    int64_t Hin  = input_grad_tv.size[2];
+    int64_t Hout = output_grad_tv.size[2];
+    int64_t Win  = input_grad_tv.size[3];
+    int64_t Wout = output_grad_tv.size[3];
+
+    size_t out_elems = output_grad.desc.GetElementSize();
+
+    for(int64_t gid = 0; gid < out_elems; ++gid)
+    {
+        auto tensor_layout = tensor_layout_t<4>(output_grad_tv, gid);
+        int64_t n = tensor_layout.layout[0];
+        int64_t c = tensor_layout.layout[1];
+        int64_t h = tensor_layout.layout[2];
+        int64_t w = tensor_layout.layout[3];
+
+        if(Hin == Hout && Win == Wout)
+        {
+            input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+                output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)];
+            continue;
+        }
+
+        float scale_factor_h = scale_factors[0];
+        float scale_factor_h_ =
+            compute_linear_scale_factor(scale_factor_h, Hin, Hout, align_corners);
+        float real_y = bicubic_idx(h, Hout, scale_factor_h_, align_corners);
+        int64_t in_y = static_cast<int64_t>(std::floor(real_y));
+        float t_y    = real_y - static_cast<float>(in_y);
+
+        float scale_factor_w = scale_factors[1];
+        float scale_factor_w_ =
+            compute_linear_scale_factor(scale_factor_w, Win, Wout, align_corners);
+        float real_x = bicubic_idx(w, Wout, scale_factor_w_, align_corners);
+        int64_t in_x = static_cast<int64_t>(std::floor(real_x));
+        float t_x    = real_x - static_cast<float>(in_x);
+
+        float y_coeffs[4];
+        float x_coeffs[4];
+        get_cubic_upsampling_coefficients(y_coeffs, t_y);
+        get_cubic_upsampling_coefficients(x_coeffs, t_x);
+
+        float out_value =
+            static_cast<float>(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]);
+
+#pragma unroll
+        for(int i = 0; i < 4; i++)
+        {
+            int64_t input_h = bound(in_y - 1 + i, Hin);
+#pragma unroll
+            for(int j = 0; j < 4; j++)
+            {
+                int64_t input_w = bound(in_x - 1 + j, Win);
+                tensor_layout_t<4> in_grad_layout(n, c, input_h, input_w);
+                workspace[input_grad_tv.get_tensor_view_idx(in_grad_layout)] +=
+                    out_value * y_coeffs[i] * x_coeffs[j];
+            }
+        }
+    }
+
+    if(!(Hin == Hout && Win == Wout))
+    {
+        for(int64_t gid = 0; gid < nelems; ++gid)
+        {
+            auto tensor_layout = tensor_layout_t<4>(input_grad_tv, gid);
+            input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] =
+                static_cast<T>(workspace[input_grad_tv.get_tensor_view_idx(tensor_layout)]);
+        }
+    }
+}
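+
+// The float workspace above is deliberate: bicubic backward scatters 16
+// weighted contributions per output element, and accumulating in the tensor's
+// native type (half/bfloat16) would lose precision. Gradients are summed in
+// float and cast to T once at the end.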
+
+template <class T>
+void cpu_interpolate_forward(const tensor<T> input,
+                             tensor<T>& output,
+                             const size_t nelems,
+                             const tensor<float> scale_factors,
+                             const bool align_corners,
+                             const miopenInterpolateMode_t mode)
+{
+    if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+    {
+        cpu_nearest_forward(input, output, nelems, scale_factors);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR)
+    {
+        cpu_interpolate_linear_forward(input, output, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR)
+    {
+        cpu_interpolate_bilinear_forward(input, output, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR)
+    {
+        cpu_interpolate_trilinear_forward(input, output, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+    {
+        cpu_bicubic_forward(input, output, nelems, scale_factors, align_corners);
+    }
+}
+
+template <class T>
+void cpu_interpolate_backward(tensor<T>& input_grad,
+                              const tensor<T> output_grad,
+                              const size_t nelems,
+                              const tensor<float> scale_factors,
+                              const bool align_corners,
+                              const miopenInterpolateMode_t mode)
+{
+    if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+    {
+        cpu_nearest_backward(input_grad, output_grad, nelems, scale_factors);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_LINEAR)
+    {
+        cpu_interpolate_linear_backward(
+            input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BILINEAR)
+    {
+        cpu_interpolate_bilinear_backward(
+            input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_TRILINEAR)
+    {
+        cpu_interpolate_trilinear_backward(
+            input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+    else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+    {
+        cpu_bicubic_backward(input_grad, output_grad, nelems, scale_factors, align_corners);
+    }
+}
+
+#endif // GUARD_CPU_INTERPOLATE_HPP
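For reference, the CPU checkers above are driven like this. A sketch only: the values are illustrative, and the variadic `tensor<T>` constructor is assumed to match the one used by the fixtures below.

    // Illustrative call: 2x bilinear upsample of an NCHW tensor on the CPU checker.
    tensor<float> in{2, 3, 8, 8};
    tensor<float> out{2, 3, 16, 16};
    tensor<float> scales{2};            // one scale per spatial dim (H, W)
    scales[0] = scales[1] = 2.f;
    cpu_interpolate_forward(in, out, out.desc.GetElementSize(), scales,
                            /*align_corners=*/false, MIOPEN_INTERPOLATE_MODE_BILINEAR);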
diff --git a/test/gtest/interpolate.cpp b/test/gtest/interpolate.cpp
new file mode 100644
index 0000000000..4502da88c3
--- /dev/null
+++ b/test/gtest/interpolate.cpp
@@ -0,0 +1,175 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include <miopen/env.hpp>
+#include "interpolate.hpp"
+
+MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG)
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+
+namespace interpolate {
+
+std::string GetFloatArg()
+{
+    const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG));
+    if(tmp.empty())
+    {
+        return "";
+    }
+    return tmp;
+}
+
+struct GPU_Interpolate_fwd_FP32 : InterpolateTestFwd<float>
+{
+};
+
+struct GPU_Interpolate_fwd_FP16 : InterpolateTestFwd<half_float::half>
+{
+};
+
+struct GPU_Interpolate_fwd_BFP16 : InterpolateTestFwd<bfloat16>
+{
+};
+
+struct GPU_Interpolate_bwd_FP32 : InterpolateTestBwd<float>
+{
+};
+
+struct GPU_Interpolate_bwd_FP16 : InterpolateTestBwd<half_float::half>
+{
+};
+
+struct GPU_Interpolate_bwd_BFP16 : InterpolateTestBwd<bfloat16>
+{
+};
+
+} // namespace interpolate
+using namespace interpolate;
+
+// FORWARD TEST
+TEST_P(GPU_Interpolate_fwd_FP32, InterpolateTest)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--float") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_fwd_FP16, InterpolateTest)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--half") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_fwd_BFP16, InterpolateTest)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--bfloat16") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_fwd_FP32,
+                         testing::ValuesIn(InterpolateTestFwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_fwd_FP16,
+                         testing::ValuesIn(InterpolateTestFwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_fwd_BFP16,
+                         testing::ValuesIn(InterpolateTestFwdConfigs()));
+
+// BACKWARD TEST
+TEST_P(GPU_Interpolate_bwd_FP32, InterpolateTestBwd)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--float") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_bwd_FP16, InterpolateTestBwd)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--half") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+TEST_P(GPU_Interpolate_bwd_BFP16, InterpolateTestBwd)
+{
+    if((miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && GetFloatArg() == "--bfloat16") ||
+       miopen::IsUnset(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_bwd_FP32,
+                         testing::ValuesIn(InterpolateTestBwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_bwd_FP16,
+                         testing::ValuesIn(InterpolateTestBwdConfigs()));
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_Interpolate_bwd_BFP16,
+                         testing::ValuesIn(InterpolateTestBwdConfigs()));
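The dtype gating in these tests composes as follows: with MIOPEN_TEST_ALL unset every suite runs; with it enabled, only the suite whose dtype matches MIOPEN_TEST_FLOAT_ARG runs and the rest GTEST_SKIP():

    MIOPEN_TEST_ALL unset                                -> all six suites run
    MIOPEN_TEST_ALL=1  MIOPEN_TEST_FLOAT_ARG=--float     -> FP32 suites only
    MIOPEN_TEST_ALL=1  MIOPEN_TEST_FLOAT_ARG=--half      -> FP16 suites only
    MIOPEN_TEST_ALL=1  MIOPEN_TEST_FLOAT_ARG=--bfloat16  -> BFP16 suites only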
diff --git a/test/gtest/interpolate.hpp b/test/gtest/interpolate.hpp
new file mode 100644
index 0000000000..a6f369507f
--- /dev/null
+++ b/test/gtest/interpolate.hpp
@@ -0,0 +1,390 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "../driver/tensor_driver.hpp"
+#include "cpu_interpolate.hpp"
+#include "get_handle.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include <gtest/gtest.h>
+#include <miopen/miopen.h>
+#include <miopen/interpolate.hpp>
+#include <cmath>
+#include <limits>
+
+template <class T>
+inline std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    os << '{';
+    for(int i = 0; i < v.size(); ++i)
+    {
+        if(i != 0)
+            os << ',';
+        os << v[i];
+    }
+    os << '}';
+    return os;
+}
+
+struct InterpolateTestCase
+{
+    std::vector<size_t> input;
+    std::vector<size_t> size;
+    std::vector<float> scale_factors;
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+
+    friend std::ostream& operator<<(std::ostream& os, const InterpolateTestCase& tc)
+    {
+        return os << " input:" << tc.input << " size:" << tc.size
+                  << " scale_factors:" << tc.scale_factors << " mode:" << tc.mode
+                  << " align_corners:" << tc.align_corners;
+    }
+
+    std::vector<size_t> GetInput() const { return input; }
+};
+
+inline std::vector<InterpolateTestCase> InterpolateTestFwdConfigs()
+{
+    return {
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, true},
+        {{16, 256, 20, 20}, {40, 40}, {2, 2}, MIOPEN_INTERPOLATE_MODE_BICUBIC, false},
+        {{16, 256, 20, 20}, {40, 40}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BICUBIC, true},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, true},
+    };
+}
+
+inline std::vector<InterpolateTestCase> InterpolateTestBwdConfigs()
+{
+    return {
+        {{16, 256, 1, 1, 1}, {32, 32, 32}, {32, 32, 32}, MIOPEN_INTERPOLATE_MODE_TRILINEAR, false},
+        {{16, 256, 1, 1, 1}, {32, 32, 32}, {0, 0, 0}, MIOPEN_INTERPOLATE_MODE_TRILINEAR, true},
+        {{16, 256, 1, 1, 1}, {32, 32, 32}, {0, 0, 0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, false},
+        {{16, 256, 1, 1}, {32, 32}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BILINEAR, true},
+        {{16, 256, 20, 20}, {40, 40}, {2, 2}, MIOPEN_INTERPOLATE_MODE_BICUBIC, false},
+        {{16, 256, 20, 20}, {40, 40}, {0, 0}, MIOPEN_INTERPOLATE_MODE_BICUBIC, true},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_NEAREST, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, false},
+        {{16, 256, 1}, {32}, {0}, MIOPEN_INTERPOLATE_MODE_LINEAR, true},
+    };
+}
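+
+// How a config expands to an output shape (see SetUp below): a zero scale
+// factor defers to `size`, a non-zero one wins over it. For example:
+//   {{16, 256, 20, 20}, {40, 40}, {2, 2}, BICUBIC, false}
+//     -> out = {16, 256, ceil(20 * 2), ceil(20 * 2)} = {16, 256, 40, 40}
+//   {{16, 256, 1, 1}, {32, 32}, {0, 0}, BILINEAR, false}
+//     -> out = {16, 256, 32, 32}  (sizes taken verbatim)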
+
+inline std::vector<size_t> GetStrides(std::vector<size_t> input, bool contiguous)
+{
+    if(!contiguous)
+        std::swap(input.front(), input.back());
+    std::vector<size_t> strides(input.size());
+    strides.back() = 1;
+    for(int i = input.size() - 2; i >= 0; --i)
+        strides[i] = strides[i + 1] * input[i + 1];
+    if(!contiguous)
+        std::swap(strides.front(), strides.back());
+    return strides;
+}
+
+// FORWARD TEST
+template <typename T = float>
+struct InterpolateTestFwd : public ::testing::TestWithParam<InterpolateTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle      = get_handle();
+        interpolate_config = GetParam();
+
+        auto in_dim   = interpolate_config.GetInput();
+        auto size     = interpolate_config.size;
+        mode          = interpolate_config.mode;
+        align_corners = interpolate_config.align_corners;
+
+        if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            scale_factors = tensor<float>{size.size()};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+        }
+        else
+        {
+            scale_factors = tensor<float>{3};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+            for(int i = size.size(); i < 3; i++)
+                scale_factors[i] = 0;
+        }
+
+        auto out_dim = std::vector<size_t>({in_dim[0], in_dim[1]});
+        for(int i = 0; i < size.size(); i++)
+        {
+            if(scale_factors[i] != 0)
+                out_dim.push_back(static_cast<size_t>(ceil(in_dim[i + 2] * scale_factors[i])));
+            else
+            {
+                out_dim.push_back(size[i]);
+            }
+        }
+
+        auto gen_input_value = [](auto...) {
+            return prng::gen_A_to_B(static_cast<T>(-5.0f), static_cast<T>(1.0f));
+        };
+
+        auto in_strides = GetStrides(in_dim, true);
+        input           = tensor<T>{in_dim, in_strides}.generate(gen_input_value);
+
+        auto out_strides = GetStrides(out_dim, true);
+        output           = tensor<T>{out_dim, out_strides};
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{out_dim, out_strides};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        input_dev         = handle.Write(input.data);
+        output_dev        = handle.Write(output.data);
+        scale_factors_dev = handle.Write(scale_factors.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        size_t nelems = output.desc.GetElementSize();
+
+        cpu_interpolate_forward(input, ref_output, nelems, scale_factors, align_corners, mode);
+
+        if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            status = miopen::InterpolateNearestForward(handle,
+                                                       input.desc,
+                                                       input_dev.get(),
+                                                       output.desc,
+                                                       output_dev.get(),
+                                                       scale_factors.desc,
+                                                       scale_factors_dev.get(),
+                                                       mode);
+        }
+        else
+        {
+            status = miopen::InterpolateLinearCubicForward(handle,
+                                                           input.desc,
+                                                           input_dev.get(),
+                                                           output.desc,
+                                                           output_dev.get(),
+                                                           scale_factors.desc,
+                                                           scale_factors_dev.get(),
+                                                           mode,
+                                                           align_corners);
+        }
+        fflush(stdout);
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        double threshold = std::numeric_limits<T>::epsilon();
+
+        auto error = miopen::rms_range(ref_output, output);
+
+        ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        EXPECT_LT(error, threshold * 10) << "Error output beyond tolerance Error:" << error
+                                         << ", Thresholdx10: " << threshold * 10;
+    }
+    InterpolateTestCase interpolate_config;
+
+    tensor<T> input;
+    tensor<T> output;
+    tensor<T> ref_output;
+    tensor<float> scale_factors;
+
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+    miopen::Allocator::ManageDataPtr scale_factors_dev;
+};
+
+// BACKWARD TEST
+template <typename T = float>
+struct InterpolateTestBwd : public ::testing::TestWithParam<InterpolateTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle      = get_handle();
+        interpolate_config = GetParam();
+
+        auto in_dim      = interpolate_config.GetInput();
+        auto in_grad_dim = in_dim;
+        auto size        = interpolate_config.size;
+        mode             = interpolate_config.mode;
+        align_corners    = interpolate_config.align_corners;
+
+        if(mode != MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            scale_factors = tensor<float>{size.size()};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+        }
+        else
+        {
+            scale_factors = tensor<float>{3};
+            for(int i = 0; i < size.size(); i++)
+                scale_factors[i] = interpolate_config.scale_factors[i];
+            for(int i = size.size(); i < 3; i++)
+                scale_factors[i] = 0;
+        }
+
+        auto out_grad_dim = std::vector<size_t>({in_dim[0], in_dim[1]});
+        for(int i = 0; i < size.size(); i++)
+        {
+            if(scale_factors[i] != 0)
+                out_grad_dim.push_back(
+                    static_cast<size_t>(ceil(in_dim[i + 2] * scale_factors[i])));
+            else
+                out_grad_dim.push_back(size[i]);
+        }
+
+        auto gen_output_grad_value = [](auto...) {
+            return prng::gen_A_to_B(static_cast<T>(-5.0f), static_cast<T>(5.0f));
+        };
+
+        auto out_grad_strides = GetStrides(out_grad_dim, true);
+        output_grad = tensor<T>{out_grad_dim, out_grad_strides}.generate(gen_output_grad_value);
+
+        auto in_strides = GetStrides(in_grad_dim, true);
+        input_grad      = tensor<T>{in_grad_dim, in_strides};
+        std::fill(input_grad.begin(), input_grad.end(), static_cast<T>(0.f));
+
+        ref_input_grad = tensor<T>{in_grad_dim, in_strides};
+        std::fill(ref_input_grad.begin(), ref_input_grad.end(), static_cast<T>(0.f));
+
+        if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+        {
+            ws_sizeInBytes = miopen::GetInterpolateBicubicBackwardWorkspaceSize(
+                handle, output_grad.desc, input_grad.desc, scale_factors.desc, mode, align_corners);
+            if(ws_sizeInBytes == static_cast<size_t>(-1))
+                GTEST_SKIP();
+
+            workspace = tensor<float>{in_grad_dim, in_strides};
+            std::fill(workspace.begin(), workspace.end(), 0.f);
+
+            workspace_dev = handle.Write(workspace.data);
+        }
+
+        output_grad_dev   = handle.Write(output_grad.data);
+        input_grad_dev    = handle.Write(input_grad.data);
+        scale_factors_dev = handle.Write(scale_factors.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+
+        miopenStatus_t status;
+
+        size_t nelems = input_grad.desc.GetElementSize();
+
+        cpu_interpolate_backward(
+            ref_input_grad, output_grad, nelems, scale_factors, align_corners, mode);
+
+        if(mode == MIOPEN_INTERPOLATE_MODE_NEAREST)
+        {
+            status = miopen::InterpolateNearestBackward(handle,
+                                                        input_grad.desc,
+                                                        input_grad_dev.get(),
+                                                        output_grad.desc,
+                                                        output_grad_dev.get(),
+                                                        scale_factors.desc,
+                                                        scale_factors_dev.get(),
+                                                        mode);
+        }
+        else if(mode == MIOPEN_INTERPOLATE_MODE_BICUBIC)
+        {
+            status = miopen::InterpolateBicubicBackward(handle,
+                                                        workspace_dev.get(),
+                                                        ws_sizeInBytes,
+                                                        input_grad.desc,
+                                                        input_grad_dev.get(),
+                                                        output_grad.desc,
+                                                        output_grad_dev.get(),
+                                                        scale_factors.desc,
+                                                        scale_factors_dev.get(),
+                                                        mode,
+                                                        align_corners);
+        }
+        else
+        {
+            status = miopen::InterpolateLinearBackward(handle,
+                                                       input_grad.desc,
+                                                       input_grad_dev.get(),
+                                                       output_grad.desc,
+                                                       output_grad_dev.get(),
+                                                       scale_factors.desc,
+                                                       scale_factors_dev.get(),
+                                                       mode,
+                                                       align_corners);
+        }
+        fflush(stdout);
+        ASSERT_EQ(status, miopenStatusSuccess);
+
+        input_grad.data = handle.Read<T>(input_grad_dev, input_grad.data.size());
+    }
+
+    void Verify()
+    {
+        double threshold = std::numeric_limits<T>::epsilon();
+
+        auto error = miopen::rms_range(ref_input_grad, input_grad);
+
+        ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad));
+        EXPECT_LT(error, threshold * 10) << "Error input grad beyond tolerance Error:" << error
+                                         << ", Thresholdx10: " << threshold * 10;
+    }
+    InterpolateTestCase interpolate_config;
+
+    tensor<float> workspace;
+    tensor<T> input_grad;
+    tensor<T> output_grad;
+    tensor<T> ref_input_grad;
+    tensor<float> scale_factors;
+
+    miopenInterpolateMode_t mode;
+    bool align_corners;
+
+    miopen::Allocator::ManageDataPtr input_grad_dev;
+    miopen::Allocator::ManageDataPtr output_grad_dev;
+    miopen::Allocator::ManageDataPtr scale_factors_dev;
+    miopen::Allocator::ManageDataPtr workspace_dev;
+
+    size_t ws_sizeInBytes;
+};