diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02496548e8..43bc4d798a 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -33,3 +33,4 @@ The MIOpen API library is structured as follows: * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`SGD <../doxygen/html/group___s_g_d>` (experimental) * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) + * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index de1a0edd2d..afd10b6232 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -43,6 +43,7 @@ add_executable(MIOpenDriver dm_dropout.cpp dm_fusion.cpp dm_gemm.cpp + dm_getitem.cpp dm_groupnorm.cpp dm_layernorm.cpp dm_lrn.cpp diff --git a/driver/InputFlags.cpp b/driver/InputFlags.cpp index 41f872b0e8..12df05cfb5 100644 --- a/driver/InputFlags.cpp +++ b/driver/InputFlags.cpp @@ -292,6 +292,165 @@ TensorParameters InputFlags::GetValueTensor(const std::string& long_name) const MIOPEN_THROW("Too many tensor descriptor parameters."); } + +TensorParametersUint64 InputFlags::GetValueTensorUint64(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + const auto components = miopen::SplitDelim(input.value.c_str(), ','); + + if(components.size() < 1) + return {}; + + auto parse = [](auto line) { + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(line, 'x'); + for(auto&& str : strs) + { + auto elem = uint64_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + line + "."); + + ret.push_back(elem); + } + return ret; + }; + + auto lens = parse(components[0]); + + if(components.size() == 1) + return {lens}; + + auto layout = std::string{}; + auto strides = std::vector{}; + + if(std::isdigit(components[1][0])) + strides = parse(components[1]); + else + layout = 
components[1]; + + if(components.size() == 2) + return {lens, strides, layout}; + + MIOPEN_THROW("Too many tensor descriptor parameters."); +} + +std::vector InputFlags::GetValueVectorInt(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(input.value.c_str(), ','); + + for(auto&& str : strs) + { + auto elem = int32_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + input.value.c_str() + "."); + + ret.push_back(elem); + } + + return ret; +} + +std::vector InputFlags::GetValueVectorUint64(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(input.value.c_str(), ','); + + for(auto&& str : strs) + { + auto elem = uint64_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + input.value.c_str() + "."); + + ret.push_back(elem); + } + + return ret; +} + +std::vector> +InputFlags::GetValue2dVectorInt(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + const auto components = miopen::SplitDelim(input.value.c_str(), ','); + auto output = std::vector>{}; + + if(components.size() < 1) + return {}; + + auto parse = [](auto line) { + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(line, 'x'); + for(auto&& str : strs) + { + auto elem = int32_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + line + "."); + + ret.push_back(elem); + } + return ret; + }; + + for(auto&& component : components) + { + output.push_back(parse(component)); + } + + return output; +} + +std::vector> 
+InputFlags::GetValue2dVectorUint64(const std::string& long_name) const +{ + const auto& input = MapInputs.at(FindShortName(long_name)); + const auto components = miopen::SplitDelim(input.value.c_str(), ','); + auto output = std::vector>{}; + + if(components.size() < 1) + return {}; + + auto parse = [](auto line) { + auto ret = std::vector{}; + const auto strs = miopen::SplitDelim(line, 'x'); + for(auto&& str : strs) + { + auto elem = uint64_t{}; + auto ss = std::istringstream{str}; + ss >> elem; + + if(ss.bad() || ss.fail()) + MIOPEN_THROW("Invalid tensor component " + str + " in " + line + "."); + + ret.push_back(elem); + } + return ret; + }; + + for(auto&& component : components) + { + output.push_back(parse(component)); + } + + return output; +} + void InputFlags::SetValue(const std::string& long_name, const std::string& new_value) { char short_name = FindShortName(long_name); diff --git a/driver/InputFlags.hpp b/driver/InputFlags.hpp index 557a895b11..43f7c3a206 100644 --- a/driver/InputFlags.hpp +++ b/driver/InputFlags.hpp @@ -63,6 +63,25 @@ struct TensorParameters void CalculateStrides(); }; +struct TensorParametersUint64 +{ + std::vector lengths = {}; + std::vector strides = {}; + std::string layout = ""; + + TensorParametersUint64 FillMissing(const TensorParametersUint64& other) const + { + return { + (lengths.empty() ? other.lengths : lengths), + (strides.empty() ? other.strides : strides), + (layout.empty() ? 
other.layout : layout), + }; + } + + uint64_t SetTensordDescriptor(miopenTensorDescriptor_t result, miopenDataType_t data_type); + void CalculateStrides(); +}; + class InputFlags { std::map MapInputs; @@ -90,6 +109,11 @@ class InputFlags uint64_t GetValueUint64(const std::string& _long_name) const; double GetValueDouble(const std::string& _long_name) const; TensorParameters GetValueTensor(const std::string& long_name) const; + TensorParametersUint64 GetValueTensorUint64(const std::string& long_name) const; + std::vector GetValueVectorInt(const std::string& long_name) const; + std::vector GetValueVectorUint64(const std::string& long_name) const; + std::vector> GetValue2dVectorInt(const std::string& long_name) const; + std::vector> GetValue2dVectorUint64(const std::string& long_name) const; void SetValue(const std::string& long_name, const std::string& new_value); void StoreOptionalFlagValue(char short_name, const std::string& input_value); diff --git a/driver/addlayernorm_driver.hpp b/driver/addlayernorm_driver.hpp index 4741d2d820..e74a1548e6 100644 --- a/driver/addlayernorm_driver.hpp +++ b/driver/addlayernorm_driver.hpp @@ -176,8 +176,8 @@ class AddLayerNormDriver : public Driver std::vector weight; std::vector bias; std::vector out; - std::vector mean; - std::vector rstd; + std::vector mean; + std::vector rstd; std::vector outhost; std::vector meanhost; std::vector rstdhost; @@ -259,7 +259,7 @@ int AddLayerNormDriver::AddCmdLineArgs() inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); inflags.AddInputFlag("normalized_dim", 'o', "3", "Nomalized Dim (Default=3)", "int"); inflags.AddInputFlag( - "mode", 'm', "0", "elemwise affine mode (0), weight and bias mode (1) (Default=0)", "int"); + "mode", 'm', "2", "elemwise affine mode (2), weight and bias mode (3) (Default=2)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", 
"int"); @@ -291,16 +291,16 @@ int AddLayerNormDriver::AllocateBuffersAndCopy() weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); bias_dev = std::unique_ptr(new GPUMem(ctx, bias_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); - rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); + mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tgpu))); + rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tgpu))); in = std::vector(in_sz, Tgpu0val); in2 = std::vector(in2_sz, Tgpu0val); weight = std::vector(weight_sz, Tgpu0val); bias = std::vector(bias_sz, Tgpu0val); out = std::vector(out_sz, Tgpu0val); - mean = std::vector(mean_sz, Tref0val); - rstd = std::vector(rstd_sz, Tref0val); + mean = std::vector(mean_sz, Tgpu0val); + rstd = std::vector(rstd_sz, Tgpu0val); outhost = std::vector(out_sz, Tref0val); meanhost = std::vector(mean_sz, Tref0val); rstdhost = std::vector(rstd_sz, Tref0val); diff --git a/driver/cat_driver.hpp b/driver/cat_driver.hpp index 51eb16b1c7..3254b5f3bc 100644 --- a/driver/cat_driver.hpp +++ b/driver/cat_driver.hpp @@ -183,8 +183,8 @@ template int CatDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Cat (Default=1)", "int"); - inflags.AddTensorFlag("input1", '1', "", "input1 tensor descriptor"); - inflags.AddTensorFlag("input2", '2', "", "input2 tensor descriptor"); + inflags.AddTensorFlag("input1", '1', "2x32x128x128x128", "input1 tensor descriptor"); + inflags.AddTensorFlag("input2", '2', "2x32x128x128x128", "input2 tensor descriptor"); inflags.AddTensorFlag("input3", '3', "", "input3 tensor descriptor"); inflags.AddTensorFlag("input4", '4', "", "input4 tensor descriptor"); inflags.AddTensorFlag("input5", '5', "", "input5 tensor descriptor"); diff --git a/driver/dm_getitem.cpp b/driver/dm_getitem.cpp new file mode 100644 index 0000000000..bfb72be96a --- /dev/null 
+++ b/driver/dm_getitem.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "getitem_driver.hpp" +#include "registry_driver_maker.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "getitem") + return new GetitemDriver(); + if(base_arg == "getitemfp16") + return new GetitemDriver(); + if(base_arg == "getitembfp16") + return new GetitemDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index bf951a54ad..a36121f676 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,13 +169,13 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " + printf("Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw\n"); + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " + "getitem[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -190,22 +190,22 @@ inline std::string ParseBaseArg(int argc, char* argv[]) std::string arg = argv[1]; if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && - arg != "convfp8" && arg != "convbfp8" && arg != "CBAInfer" && arg != "CBAInferfp16" && arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && arg != "softmax" && arg != 
"softmaxfp16" && arg != "bnorm" && arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" && - arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" && - arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && - arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && - arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && - arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && - arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" && - arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" && - arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && + arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" && + arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" && + arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" && + arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && + arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" && + arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" && + arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" && + arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && - arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "--version") + arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && + arg != "getitemfp16" && arg != "getitembfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff 
--git a/driver/getitem_driver.hpp b/driver/getitem_driver.hpp new file mode 100644 index 0000000000..c48c9a0520 --- /dev/null +++ b/driver/getitem_driver.hpp @@ -0,0 +1,545 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_GETITEM_DRIVER_HPP +#define GUARD_MIOPEN_GETITEM_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include "random.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +template +int32_t mloGetitemBackwardRunHost(miopenTensorDescriptor_t dyDesc, + uint32_t indexCount, + miopenTensorDescriptor_t* indexDescs, + miopenTensorDescriptor_t dxDesc, + miopenTensorDescriptor_t errorDesc, + Tgpu* dy, + int32_t** indexs, + Tcheck* dxhost, + int32_t* errorhost, + uint32_t dimCount, + int32_t* dims, + uint32_t sliceCount, + int32_t* slices, + uint32_t offset) +{ + auto dy_dims = miopen::deref(dyDesc).GetLengths(); + auto dy_numel = std::accumulate(dy_dims.begin(), dy_dims.end(), 1L, std::multiplies()); + auto dx_dims = miopen::deref(dxDesc).GetLengths(); + auto index_dims = miopen::deref(indexDescs[0]).GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + auto element_index = std::vector(indexCount * index_numel + indexCount); + + std::vector output_dims; + for(int32_t i = 0; i < dimCount; i++) + { + output_dims.push_back(dx_dims[dims[i]]); + } + + auto dim_info_offset = indexCount > 0 ? 
indexCount * index_dims[0] : 0; + auto start_dim = dims[0]; + + auto dy_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dyDesc)); + auto dxhost_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dxDesc)); + miopen::slice_tv<5>(dxhost_tv, sliceCount, slices); + + int32_t ret = 0; + + // Get element index form indexs + for(size_t j = 0; j < indexCount; j++) + { + const auto& index_dim = dims[j]; + const auto& dim_size = output_dims[j]; + + for(size_t o = 0; o < index_numel; o++) + { + int32_t getitem_index = indexs[j][o]; + + if(getitem_index >= 0 && getitem_index < dim_size) + { + element_index[(o * indexCount) + j] = getitem_index; + } + else if(getitem_index >= -dim_size && getitem_index < 0) + { + element_index[(o * indexCount) + j] = getitem_index + dim_size; + } + else + { + errorhost[j] = -1; + } + + if(o == 0) + { + element_index[dim_info_offset + j] = index_dim; + } + } + } + + // GetItem + for(size_t o = 0; o < dy_numel; o++) + { + tensor_layout_t<5> ncdhw(dy_tv, o); + tensor_layout_t<5> idx(ncdhw); + + if(indexCount > 0) + { + size_t dim_cursor = ncdhw.layout[start_dim]; + size_t i = start_dim; + size_t j = 0; + + for(; i < start_dim + indexCount; ++i, ++j) + { + size_t dim_idx = element_index[dim_info_offset + j]; + idx.layout[dim_idx] = element_index[(dim_cursor * indexCount) + j]; + } + + i = element_index[dim_info_offset + indexCount - 1] + 1; + dim_cursor = start_dim + 1; + for(; i < 5; ++i, ++dim_cursor) + { + idx.layout[i] = ncdhw.layout[dim_cursor]; + } + } + + dxhost[dxhost_tv.get_tensor_view_idx(idx)] += dy[dy_tv.get_tensor_view_idx(ncdhw)]; + } + + return ret; +} + +template +class GetitemDriver : public Driver +{ +public: + GetitemDriver() : Driver() + { + miopenCreateTensorDescriptor(&dyDesc); + miopenCreateTensorDescriptor(&dxDesc); + miopenCreateTensorDescriptor(&errorDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() 
override { return inflags; } + + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + + int VerifyBackward() override; + int VerifyForward() override; + ~GetitemDriver() override + { + miopenDestroyTensorDescriptor(dyDesc); + for(auto indexDesc : indexDescs) + { + miopenDestroyTensorDescriptor(indexDesc); + } + miopenDestroyTensorDescriptor(dxDesc); + miopenDestroyTensorDescriptor(errorDesc); + } + +private: + InputFlags inflags; + + miopenTensorDescriptor_t dyDesc; + std::vector indexDescs; + miopenTensorDescriptor_t dxDesc; + miopenTensorDescriptor_t errorDesc; + + std::unique_ptr dy_dev; + std::vector> index_devs; + std::unique_ptr dx_dev; + std::unique_ptr error_dev; + std::unique_ptr workspace_dev; + + std::vector dy; + std::vector> indexs; + std::vector dx; + std::vector error; + std::vector workspace; + std::vector dxhost; + std::vector errorhost; + + size_t ws_sizeInBytes; + + std::vector dims; + std::vector> slices; + std::vector slices_flat; + uint32_t offset; + + std::vector output_dims; + std::vector index_devs_ptr; + std::vector indexs_ptr; +}; + +template +int GetitemDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + if(inflags.GetValueInt("indexcount") < 0) + MIOPEN_THROW("Index count is negative: " + inflags.GetValueStr("indexcount") + "."); + + if(inflags.GetValueInt("dimcount") < 0) + MIOPEN_THROW("Dim count is negative: " + inflags.GetValueStr("dimcount") + "."); + + if(inflags.GetValueInt("slicecount") < 0) + MIOPEN_THROW("Slice count is negative: " + inflags.GetValueStr("slicecount") + "."); + + return miopenStatusSuccess; +} + +template +int GetitemDriver::GetandSetData() +{ + auto dyTensorParam = inflags.GetValueTensorUint64("doutput"); + auto dxTensorParam = 
inflags.GetValueTensorUint64("dinput"); + auto indexCountParam = inflags.GetValueInt("indexcount"); + auto dimCountParam = inflags.GetValueInt("dimcount"); + auto sliceCountParam = inflags.GetValueInt("slicecount"); + offset = inflags.GetValueInt("offset"); + + auto indexTensorLengths = inflags.GetValue2dVectorInt("indexs"); + if(indexTensorLengths.size() != indexCountParam) + MIOPEN_THROW("Error parsing indexs tensor: " + inflags.GetValueStr("indexs") + "."); + + dims = inflags.GetValueVectorInt("dims"); + if(dims.size() != dimCountParam) + MIOPEN_THROW("Error parsing dims tensor: " + inflags.GetValueStr("dims") + "."); + + for(auto dim : dims) + { + output_dims.push_back(dxTensorParam.lengths[dim]); + } + + slices = inflags.GetValue2dVectorInt("slices"); + if(slices.size() != sliceCountParam) + MIOPEN_THROW("Error parsing slices: " + inflags.GetValueStr("slices") + "."); + + for(auto slice : slices) + { + for(int32_t i = 0; i < 4; i++) + { + slices_flat.push_back(slice[i]); + } + } + + if(SetTensorNd(dyDesc, dyTensorParam.lengths, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing doutput tensor: " + inflags.GetValueStr("doutput") + "."); + + for(auto indexTensorLength : indexTensorLengths) + { + miopenTensorDescriptor_t indexDesc; + miopenCreateTensorDescriptor(&indexDesc); + if(SetTensorNd(indexDesc, indexTensorLength, miopenInt32) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing indexs tensor: " + inflags.GetValueStr("indexs") + "."); + indexDescs.push_back(indexDesc); + } + + if(SetTensorNd(dxDesc, dxTensorParam.lengths, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing dinput tensor: " + inflags.GetValueStr("dinput") + "."); + + std::vector error_length; + error_length.push_back(indexCountParam); + if(SetTensorNd(errorDesc, error_length, miopen_type{}) != miopenStatusSuccess) + MIOPEN_THROW("Error making error tensor: " + inflags.GetValueStr("indexcount") + "."); + + return 0; +} + +template +int 
GetitemDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "0", "Run only Forward Getitem (Default=0)", "int"); + inflags.AddTensorFlag("doutput", 'O', "128x128", "doutput tensor descriptor"); + inflags.AddTensorFlag("indexs", 'D', "128", "indexs tensor descriptor"); + inflags.AddTensorFlag("dinput", 'N', "128x128", "dinput tensor descriptor"); + + inflags.AddInputFlag("indexcount", '1', "1", "the number of indexs tensor(Default=1)", "int"); + inflags.AddInputFlag("dimcount", '2', "1", "The dimensions(Default=1)", "int"); + inflags.AddInputFlag("dims", '3', "0", "The dimensions(Default=0)", "vector"); + inflags.AddInputFlag("slicecount", '4', "0", "The number of slices(Default=0)", "int"); + inflags.AddInputFlag("slices", + '5', + "", + "The slices(Default=\'\'" + ")", + "vector>"); + inflags.AddInputFlag("offset", '6', "0", "The offset of output(Default=0)", "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int GetitemDriver::AllocateBuffersAndCopy() +{ + size_t dy_sz = GetTensorSize(dyDesc); + size_t dx_sz = GetTensorSize(dxDesc); + size_t error_sz = GetTensorSize(errorDesc); + + miopenGetGetitemWorkspaceSize( + GetHandle(), indexDescs.size(), indexDescs.data(), &ws_sizeInBytes); + + uint32_t ctx = 0; + + dy_dev = std::unique_ptr(new GPUMem(ctx, dy_sz, sizeof(Tgpu))); + dx_dev = std::unique_ptr(new GPUMem(ctx, dx_sz, sizeof(Tgpu))); + error_dev = std::unique_ptr(new GPUMem(ctx, error_sz, sizeof(int32_t))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + dy = std::vector(dy_sz, static_cast(0)); + dx = std::vector(dx_sz, static_cast(0)); + error 
= std::vector(error_sz, static_cast(0)); + workspace = std::vector(ws_sizeInBytes / sizeof(int32_t), static_cast(0)); + dxhost = std::vector(dx_sz, static_cast(0)); + errorhost = std::vector(error_sz, static_cast(0)); + + for(int32_t i = 0; i < dy_sz; i++) + { + dy[i] = prng::gen_A_to_B(static_cast(-1), static_cast(1)); + } + + for(int32_t i = 0; i < indexDescs.size(); i++) + { + size_t index_sz = GetTensorSize(indexDescs[i]); + index_devs.push_back(std::unique_ptr(new GPUMem(ctx, index_sz, sizeof(int32_t)))); + indexs.push_back(std::vector(index_sz, static_cast(0))); + auto& index = indexs.back(); + auto index_dev = index_devs.back().get(); + + for(int j = 0; j < index_sz; j++) + { + index[j] = prng::gen_A_to_B(static_cast(0), + static_cast(output_dims[i])); + } + if(index_dev->ToGPU(GetStream(), index.data()) != 0) + std::cerr << "Error copying (index) to GPU, size: " << index_dev->GetSize() + << std::endl; + index_devs_ptr.push_back(index_dev->GetMem()); + indexs_ptr.push_back(index.data()); + } + + if(dy_dev->ToGPU(GetStream(), dy.data()) != 0) + std::cerr << "Error copying (dy) to GPU, size: " << dy_dev->GetSize() << std::endl; + + if(workspace_dev->ToGPU(GetStream(), workspace.data()) != 0) + std::cerr << "Error copying (workspace) to GPU, size: " << workspace_dev->GetSize() + << std::endl; + + if(dx_dev->ToGPU(GetStream(), dx.data()) != 0) + std::cerr << "Error copying (dx) to GPU, size: " << dx_dev->GetSize() << std::endl; + + if(error_dev->ToGPU(GetStream(), error.data()) != 0) + std::cerr << "Error copying (error) to GPU, size: " << error_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int GetitemDriver::RunForwardGPU() +{ + return miopenStatusSuccess; +} + +template +int GetitemDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int32_t i = 0; i < inflags.GetValueInt("iter"); i++) + { + + if(dx_dev->ToGPU(GetStream(), dx.data()) != 0) + std::cerr << 
"Error copying (dx) to GPU, size: " << dx_dev->GetSize() << std::endl; + + miopenGetitemBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + dyDesc, + dy_dev->GetMem(), + indexDescs.size(), + indexDescs.data(), + index_devs_ptr.data(), + dxDesc, + dx_dev->GetMem(), + errorDesc, + error_dev->GetMem(), + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + float time = 0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int32_t iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward Getitem Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward Getitem Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dx_dev->FromGPU(GetStream(), dx.data()) != 0) + std::cerr << "Error copying (dx_dev) from GPU, size: " << dx_dev->GetSize() << std::endl; + + if(error_dev->FromGPU(GetStream(), error.data()) != 0) + std::cerr << "Error copying (error_dev) from GPU, size: " << error_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int GetitemDriver::RunBackwardCPU() +{ + mloGetitemBackwardRunHost(dyDesc, + indexDescs.size(), + indexDescs.data(), + dxDesc, + errorDesc, + dy.data(), + indexs_ptr.data(), + dxhost.data(), + errorhost.data(), + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + return miopenStatusSuccess; +} + +template +Tref GetitemDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. 
+ // In the case of getitem, there is a cumulative sum operation, and in the case of + // floating point operation, the result value can change if the order of the summed values + // is changed. So apply a threshold that is 10 times larger than other operations. + auto tolerance = std::is_same::value ? 1.5e-4 : 8.2e-1; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + // If there is an atomic operation on the GPU kernel, a large error occurs depending on the + // calculation order, so a much larger factor is applied. + if(std::is_same::value) + tolerance *= 8000.0; + return tolerance; +} + +template +int GetitemDriver::VerifyForward() +{ + return miopenStatusSuccess; +} + +template +int GetitemDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + + auto error_dx = miopen::rms_range(dxhost, dx); + + if(!std::isfinite(error_dx) || error_dx > tolerance) + { + std::cout << "Backward Getitem FAILED: " << error_dx << " > " << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward Getitem Verifies OK on CPU reference (" << error_dx << " < " + << tolerance << ')' << std::endl; + } + + auto error_error = miopen::rms_range(errorhost, error); + + if(!std::isfinite(error_error) || std::abs(static_cast(error_error)) != 0.0f) + { + std::cout << "Backward Getitem FAILED: Result does not equal" << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward Getitem Verifies OK on CPU and GPU" << std::endl; + } + + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_GETITEM_DRIVER_HPP diff --git a/driver/groupnorm_driver.hpp b/driver/groupnorm_driver.hpp index c143496cdd..1e97f541a0 100644 --- a/driver/groupnorm_driver.hpp +++ b/driver/groupnorm_driver.hpp @@ -110,8 +110,8 @@ class GroupNormDriver : public Driver std::vector weight; std::vector bias; std::vector out; - std::vector mean; - std::vector rstd; + std::vector mean; + std::vector rstd; std::vector outhost; std::vector 
meanhost; std::vector rstdhost; @@ -158,14 +158,14 @@ template int GroupNormDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward GroupNorm (Default=1)", "int"); - inflags.AddInputFlag("batchsize", 'n', "100", "Mini-batch size (Default=100)", "int"); - inflags.AddInputFlag("in_channels", 'c', "6", "Number of Input Channels (Default=6)", "int"); - inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); - inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); - inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); + inflags.AddInputFlag("batchsize", 'n', "32", "Mini-batch size (Default=32)", "int"); + inflags.AddInputFlag("in_channels", 'c', "32", "Number of Input Channels (Default=32)", "int"); + inflags.AddInputFlag("in_d", 'D', "14", "Input Depth (Default=14)", "int"); + inflags.AddInputFlag("in_h", 'H', "14", "Input Height (Default=14)", "int"); + inflags.AddInputFlag("in_w", 'W', "14", "Input Width (Default=14)", "int"); inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); - inflags.AddInputFlag("num_groups", 'g', "3", "num_groups", "int"); + inflags.AddInputFlag("num_groups", 'g', "4", "num_groups", "int"); inflags.AddInputFlag( "mode", 'm', "0", "elemwise affine mode (0), weight and bias mode (1) (Default=0)", "int"); @@ -224,15 +224,15 @@ int GroupNormDriver::AllocateBuffersAndCopy() weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); bias_dev = std::unique_ptr(new GPUMem(ctx, bias_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); - rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); + mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tgpu))); + rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tgpu))); in = std::vector(in_sz, static_cast(0)); weight = std::vector(weight_sz, 
static_cast(0)); bias = std::vector(bias_sz, static_cast(0)); out = std::vector(out_sz, static_cast(0)); - mean = std::vector(mean_sz, static_cast(0)); - rstd = std::vector(rstd_sz, static_cast(0)); + mean = std::vector(mean_sz, static_cast(0)); + rstd = std::vector(rstd_sz, static_cast(0)); outhost = std::vector(out_sz, static_cast(0)); meanhost = std::vector(mean_sz, static_cast(0)); rstdhost = std::vector(rstd_sz, static_cast(0)); @@ -347,23 +347,14 @@ int GroupNormDriver::RunBackwardGPU() template Tref GroupNormDriver::GetTolerance() { - if(data_type == miopenHalf) - { - return 1e-3; - } - else if(data_type == miopenFloat) - { - return 5e-5; - } - else if(data_type == miopenDouble) - { - return 1e-10; - } - else if(data_type == miopenBFloat16) - { - return 5e-3; - } - return 0; + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; } template diff --git a/driver/layernorm_driver.hpp b/driver/layernorm_driver.hpp index ea5b841c08..5bdf82ce85 100644 --- a/driver/layernorm_driver.hpp +++ b/driver/layernorm_driver.hpp @@ -166,8 +166,8 @@ class LayerNormDriver : public Driver std::vector weight; std::vector bias; std::vector out; - std::vector mean; - std::vector rstd; + std::vector mean; + std::vector rstd; std::vector outhost; std::vector meanhost; std::vector rstdhost; @@ -276,15 +276,15 @@ int LayerNormDriver::AllocateBuffersAndCopy() weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); bias_dev = std::unique_ptr(new GPUMem(ctx, bias_sz, sizeof(Tgpu))); out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tref))); - rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tref))); + mean_dev = std::unique_ptr(new GPUMem(ctx, mean_sz, sizeof(Tgpu))); + rstd_dev = std::unique_ptr(new GPUMem(ctx, rstd_sz, sizeof(Tgpu))); in = std::vector(in_sz, Tgpu0val); weight = std::vector(weight_sz, Tgpu0val); bias = std::vector(bias_sz, Tgpu0val); out = std::vector(out_sz, Tgpu0val); - mean = std::vector(mean_sz, Tref0ref); - rstd = std::vector(rstd_sz, Tref0ref); + mean = std::vector(mean_sz, Tgpu0val); + rstd = std::vector(rstd_sz, Tgpu0val); outhost = std::vector(out_sz, Tref0ref); meanhost = std::vector(mean_sz, Tref0ref); rstdhost = std::vector(rstd_sz, Tref0ref); diff --git a/driver/t5layernorm_driver.hpp b/driver/t5layernorm_driver.hpp index 94a4f6b934..3d02a2c3f4 100644 --- a/driver/t5layernorm_driver.hpp +++ b/driver/t5layernorm_driver.hpp @@ -318,12 +318,12 @@ int T5LayerNormDriver::GetandSetData() template int T5LayerNormDriver::AddCmdLineArgs() { - inflags.AddInputFlag("forw", 'F', "1", "Run only Forward T5LayerNorm (Default=1)", "int"); + inflags.AddInputFlag("forw", 'F', "0", "Run only Forward T5LayerNorm (Default=1)", "int"); 
inflags.AddTensorFlag("input", 'X', "100x3x32x32", "input tensor descriptor"); inflags.AddInputFlag("eps", 'e', "0.00001", "Alpha (Default=0.00001)", "double"); inflags.AddInputFlag( - "mode", 'm', "0", "elemwise affine mode (0), weight mode (1) (Default=0)", "int"); + "mode", 'm', "5", "elemwise affine mode (5), weight mode (6) (Default=5)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); diff --git a/driver/tensor_driver.hpp b/driver/tensor_driver.hpp index f6868fab98..c353a6ee11 100644 --- a/driver/tensor_driver.hpp +++ b/driver/tensor_driver.hpp @@ -173,6 +173,13 @@ inline int SetTensorNd(miopenTensorDescriptor_t t, return miopenSetTensorDescriptor(t, data_type, len.size(), len.data(), nullptr); } +inline int SetTensorNd(miopenTensorDescriptor_t t, + std::vector& len, + miopenDataType_t data_type = miopenFloat) +{ + return miopenSetTensorDescriptorV2(t, data_type, len.size(), len.data(), nullptr); +} + inline int SetTensorNd(miopenTensorDescriptor_t t, std::vector& len, std::vector& strides, diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 0f2c2a5cb0..6b205fc99e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -69,6 +69,7 @@ * @defgroup groupnorm * @defgroup cat * @defgroup SGD + * @defgroup getitem * */ @@ -5922,6 +5923,8 @@ typedef enum 3, /*!< the operation is getting the minimum value and index of the reduced elements */ MIOPEN_REDUCE_EXTREME_MAX = 4, /*!< the operation is getting the maximum value and index of the reduced elements */ + MIOPEN_REDUCE_CALCULATION_SUM = + 5, /*!< the operation is multiplying the values of the reduced elements */ } miopenReduceExtremeOp_t; // ReduceExtreme APIs @@ -7538,6 +7541,72 @@ miopenTransformersAdamWWithOutput(miopenHandle_t handle, // CLOSEOUT SGD DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// GetItem APIs +/** @addtogroup getitem + 
* + * @{ + */ +/*! @brief Helper function to query the minimum workspace size required by the getitem call + * + * @param [in] handle MIOpen Handle + * @param [in] indexCount Number of input tensor indexs + * @param [in] indexDescs Tensor descriptor of input tensor indexs + * @param [out] sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetGetitemWorkspaceSize(miopenHandle_t handle, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + size_t* sizeInBytes); + +/*! @brief Execute a getitem backward layer + * + * Backward of getitem for tensor indexing, slicing, masking. + * + * @param [in] handle MIOpen handle + * @param [in] workspace Address of the allocated workspace data + * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data + * @param [in] dyDesc Tensor descriptor of input tensor dy + * @param [in] dy Source data tensor dy + * @param [in] indexCount Number of input tensor indexs + * @param [in] indexDescs Tensor descriptor of input tensor indexs(All indexs same + * size) + * @param [in] indexs Source data tensor indexs + * @param [in] dxDesc Tensor descriptor of output tensor dx + * @param [out] dx Data tensor dx(It must be initialized to 0) + * @param [in] errorDesc Tensor descriptor of output tensor error + * @param [out] error Data tensor error(It must be initialized to 0) + * @param [in] dimCount Number of dimensions + * @param [in] dims Dimensions + * @param [in] sliceCount Number of slices + * @param [in] slices Slices + * @param [in] offset Offset of output tensor dx + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + const void* const* indexs, + const miopenTensorDescriptor_t dxDesc, + void* dx, 
+ const miopenTensorDescriptor_t errorDesc, + void* error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset); + +/** @} */ +// CLOSEOUT GETITEM DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 14e7c954b1..ea6bbc77f0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -126,6 +126,7 @@ set( MIOpen_Source fusion.cpp fusion/problem_description.cpp generic_search.cpp + getitem_api.cpp graphapi/convolution.cpp graphapi/engine.cpp graphapi/enginecfg.cpp @@ -145,6 +146,7 @@ set( MIOpen_Source groupnorm/problem_description.cpp handle_api.cpp invoker_cache.cpp + getitem/problem_description.cpp kernel_build_params.cpp kernel_warnings.cpp layernorm_api.cpp @@ -278,6 +280,7 @@ set( MIOpen_Source solver/conv_ocl_dir2Dfwd_fused.cpp solver/conv_winoRxS_fused.cpp solver/groupnorm/forward_groupnorm.cpp + solver/getitem/backward_getitem.cpp solver/layernorm/backward_t5layernorm.cpp solver/layernorm/forward_addlayernorm.cpp solver/layernorm/forward_layernorm.cpp @@ -438,6 +441,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv_sizes.inc kernels/float_types.h kernels/gpr_alloc.inc + kernels/hip_atomic.hpp kernels/hip_f8_impl.hpp kernels/hip_float8.hpp kernels/inst_wrappers.inc @@ -449,6 +453,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/neuron.inc kernels/rocm_version.inc kernels/stride_array.hpp + kernels/tensor_view.hpp kernels/utilities.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c16_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1536vgprs_fp16_fp16acc_f2x3_c32_stride1.inc @@ -487,6 +492,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenConvDirBatchNormActiv.cl kernels/MIOpenConvDirGenFwd.cl kernels/MIOpenGroupNorm.cpp + 
kernels/MIOpenGetitem.cpp kernels/MIOpenLayerNorm.cpp kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl @@ -616,6 +622,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN addlayernorm.cpp cat.cpp groupnorm.cpp + getitem.cpp kernel_cache.cpp layernorm.cpp lrn.cpp diff --git a/src/getitem.cpp b/src/getitem.cpp new file mode 100644 index 0000000000..c3b1b0c3bc --- /dev/null +++ b/src/getitem.cpp @@ -0,0 +1,106 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +std::size_t GetGetitemWorkspaceSize(Handle& handle, + uint32_t indexCount, + const TensorDescriptor* const* indexDescs) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = getitem::ProblemDescription{indexCount, indexDescs}; + + const auto algo = AlgorithmName{"GetitemBackward"}; + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem, true); + + return pair_size_vector.empty() ? static_cast(0) : pair_size_vector.front().second; +} + +miopenStatus_t GetitemBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& dyDesc, + ConstData_t dy, + uint32_t indexCount, + const TensorDescriptor* const* indexDescs, + ConstData_t* indexs, + const TensorDescriptor& dxDesc, + Data_t dx, + const TensorDescriptor& errorDesc, + Data_t error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset) +{ + const auto problem = getitem::ProblemDescription{dyDesc, + indexCount, + indexDescs, + dxDesc, + errorDesc, + dimCount, + dims, + sliceCount, + slices, + offset}; + + const auto invoke_params = getitem::GetitemInvokeParams{workspace, + workspaceSizeInBytes, + dyDesc, + dy, + indexCount, + indexDescs, + indexs, + dxDesc, + dx, + errorDesc, + error, + dimCount, + dims, + sliceCount, + slices, + offset}; + + const auto algo = AlgorithmName{"GetitemBackward"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/getitem/problem_description.cpp b/src/getitem/problem_description.cpp new file mode 100644 index 0000000000..b8b32109d6 --- /dev/null +++ 
b/src/getitem/problem_description.cpp @@ -0,0 +1,69 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +#include + +namespace miopen { + +namespace getitem { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const +{ + auto dy_dims = dyDesc.GetLengths(); + auto input_dtype = dyDesc.GetType(); + auto error_dtype = errorDesc.GetType(); + + auto input_size = + std::accumulate(dy_dims.begin(), dy_dims.end(), 1ULL, std::multiplies()); + + std::ostringstream ss; + + ss << "getitembwd"; + ss << "input_size" << input_size; + ss << "input_dtype" << input_dtype; + ss << "error_dtype" << error_dtype; + ss << "indexCount" << indexCount; + + for(int i = 0; i < indexCount; ++i) + { + if(i == 0) + ss << "indexs_size"; + const auto& index_dims = (*indexDescs)[i].GetLengths(); + auto index_size = std::accumulate( + index_dims.begin(), index_dims.begin(), 1ULL, std::multiplies()); + ss << index_size << "_"; + } + + return NetworkConfig{ss.str()}; +} + +} // namespace getitem + +} // namespace miopen diff --git a/src/getitem_api.cpp b/src/getitem_api.cpp new file mode 100644 index 0000000000..094f44620f --- /dev/null +++ b/src/getitem_api.cpp @@ -0,0 +1,206 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include + +static void LogCmdGetitem(const miopenTensorDescriptor_t dyDesc, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + const miopenTensorDescriptor_t dxDesc, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset, + bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(dyDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "getitemfp16"; + } + else if(dtype == miopenFloat) + { + ss << "getitemfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "getitemf16"; + } + + std::string dy_s; + auto dy_dims = miopen::deref(dyDesc).GetLengths(); + for(int i = 0; i < dy_dims.size(); i++) + { + dy_s += std::to_string(dy_dims[i]); + if(i != dy_dims.size() - 2) + dy_s += ","; + } + ss << " -doutput " << dy_s; + + for(int i = 0; i < indexCount; i++) + { + std::string index_s; + auto index_dims = miopen::deref(indexDescs[i]).GetLengths(); + for(int j = 0; j < index_dims.size(); j++) + { + index_s += std::to_string(index_dims[j]); + if(j != index_dims.size() - 2) + index_s += ","; + } + ss << " -index" << i + 1 << " " << index_s; + } + + std::string dx_s; + auto dx_dims = miopen::deref(dxDesc).GetLengths(); + + for(int i = 0; i < dx_dims.size(); i++) + { + dx_s += std::to_string(dx_dims[i]); + if(i != dx_dims.size() - 2) + dx_s += ","; 
+ } + + ss << " -dx " << dx_s; + + std::string dims_s; + for(int i = 0; i < dimCount; i++) + { + dims_s += std::to_string(dims[i]); + if(i != dimCount - 2) + dims_s += ","; + } + ss << " -dims" << dims_s; + + std::string slices_s; + for(int i = 0; i < sliceCount; i++) + { + slices_s += std::to_string(slices[i]); + if(i != sliceCount - 2) + slices_s += ","; + } + ss << " -slice" << slices_s; + + ss << " -offset" << offset; + ss << " -F " << ((is_fwd) ? "1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenGetGetitemWorkspaceSize(miopenHandle_t handle, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + size_t* sizeInBytes) +{ + MIOPEN_LOG_FUNCTION(handle, indexCount, indexDescs); + + return miopen::try_([&] { + std::vector indexDescsCast; + std::transform(indexDescs, + indexDescs + indexCount, + std::back_inserter(indexDescsCast), + [](const auto& indexDesc) { return &miopen::deref(indexDesc); }); + miopen::deref(sizeInBytes) = miopen::GetGetitemWorkspaceSize( + miopen::deref(handle), indexCount, indexDescsCast.data()); + }); +}; + +extern "C" miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + uint32_t indexCount, + const miopenTensorDescriptor_t* indexDescs, + const void* const* indexs, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t errorDesc, + void* error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + dyDesc, + dy, + indexCount, + indexDescs, + indexs, + dxDesc, + dx, + errorDesc, + error, + dimCount, + dims, + sliceCount, + slices, + offset); + + LogCmdGetitem( + dyDesc, indexCount, indexDescs, dxDesc, dimCount, dims, sliceCount, slices, offset, true); + return miopen::try_([&] { + std::vector indexsCast; + 
std::vector indexDescsCast; + std::transform(indexDescs, + indexDescs + indexCount, + std::back_inserter(indexDescsCast), + [](const auto& indexDesc) { return &miopen::deref(indexDesc); }); + std::transform(indexs, + indexs + indexCount, + std::back_inserter(indexsCast), + [](const void* index) { return DataCast(index); }); + + miopen::GetitemBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(dyDesc), + DataCast(dy), + indexCount, + indexDescsCast.data(), + indexsCast.data(), + miopen::deref(dxDesc), + DataCast(dx), + miopen::deref(errorDesc), + DataCast(error), + dimCount, + dims, + sliceCount, + slices, + offset); + }); +} diff --git a/src/include/miopen/find_solution.hpp b/src/include/miopen/find_solution.hpp index ff6a2a1ca1..30735cd38a 100644 --- a/src/include/miopen/find_solution.hpp +++ b/src/include/miopen/find_solution.hpp @@ -395,25 +395,25 @@ struct SolverContainer } template - std::vector> - GetWorkspaceSizes(const Context& ctx, - const Problem& problem, - std::size_t limit = std::numeric_limits::max()) const + std::vector> GetWorkspaceSizes( + const Context& ctx, const Problem& problem, const bool simple_primitive = false) const { std::vector> res; const auto find_only = GetEnvFindOnlySolver(); - std::size_t count = 0; miopen::each_args( [&](auto solver) { - if(count >= limit) - return; - if(find_only && (std::find(find_only->begin(), find_only->end(), Id{solver.SolverDbId()}) == find_only->end())) { // Do nothing (and keep silence for the sake of Tuna), just skip. } - else if(!solver.MayNeedWorkspace()) + // The following optimization is required to avoid checks + // for solvers that have slow IsApplicable() and do not + // require workspace (like MLIR convolutions). 
However we + // do not want to use it for simple primitives, for example, + // the ones that ExecutePrimitive() which uses the first applicable + // solver: + else if(!simple_primitive && !solver.MayNeedWorkspace()) { MIOPEN_LOG_I2(solver.SolverDbId() << ": Skipped (no workspace required)"); } @@ -429,7 +429,6 @@ struct SolverContainer } else { - ++count; auto sz = solver.GetWorkspaceSize(ctx, problem); res.push_back(std::make_pair(solver.SolverDbId(), sz)); MIOPEN_LOG_I2(solver.SolverDbId() << ": " << sz); diff --git a/src/include/miopen/getitem.hpp b/src/include/miopen/getitem.hpp new file mode 100644 index 0000000000..191b1dba97 --- /dev/null +++ b/src/include/miopen/getitem.hpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_GETITEM_HPP_ +#define MIOPEN_GETITEM_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT std::size_t GetGetitemWorkspaceSize( + Handle& handle, uint32_t indexCount, const TensorDescriptor* const* indexDescs); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t GetitemBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& dyDesc, + ConstData_t dy, + uint32_t indexCount, + const TensorDescriptor* const* indexDescs, + ConstData_t* indexs, + const TensorDescriptor& dxDesc, + Data_t dx, + const TensorDescriptor& errorDesc, + Data_t error, + uint32_t dimCount, + const int32_t* dims, + uint32_t sliceCount, + const int32_t* slices, + uint32_t offset); + +} // namespace miopen +#endif // _MIOPEN_GETITEM_HPP_ diff --git a/src/include/miopen/getitem/invoke_params.hpp b/src/include/miopen/getitem/invoke_params.hpp new file mode 100644 index 0000000000..e663482271 --- /dev/null +++ b/src/include/miopen/getitem/invoke_params.hpp @@ -0,0 +1,97 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { +namespace getitem { + +struct GetitemInvokeParams : public miopen::InvokeParams +{ + + GetitemInvokeParams(Data_t workspace_, + std::size_t workspace_size_, + const TensorDescriptor& dyDesc_, + ConstData_t dy_, + uint32_t indexCount_, + const TensorDescriptor* const* indexDescs_, + ConstData_t* indexs_, + const TensorDescriptor& dxDesc_, + Data_t dx_, + const TensorDescriptor& errorDesc_, + Data_t error_, + uint32_t dimCount_, + const int32_t* dims_, + uint32_t sliceCount_, + const int32_t* slices_, + uint32_t offset_) + : workspace(workspace_), + workspace_size(workspace_size_), + dyDesc(dyDesc_), + dy(dy_), + indexCount(indexCount_), + indexDescs(indexDescs_), + indexs(indexs_), + dxDesc(dxDesc_), + dx(dx_), + errorDesc(errorDesc_), + error(error_), + dimCount(dimCount_), + dims(dims_), + sliceCount(sliceCount_), + slices(slices_), + offset(offset_) + { + } + + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + const TensorDescriptor dyDesc{}; + ConstData_t dy = nullptr; + uint32_t indexCount = 0; + const TensorDescriptor* const* indexDescs = nullptr; + ConstData_t* indexs = nullptr; + const TensorDescriptor dxDesc{}; + Data_t dx = nullptr; + const TensorDescriptor errorDesc{}; + Data_t error = nullptr; + + uint32_t dimCount = 0; + const int32_t* dims = nullptr; + uint32_t sliceCount = 0; + const int32_t* slices = 
nullptr; + uint32_t offset = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace getitem + +} // namespace miopen diff --git a/src/include/miopen/getitem/problem_description.hpp b/src/include/miopen/getitem/problem_description.hpp new file mode 100644 index 0000000000..fed4e78d22 --- /dev/null +++ b/src/include/miopen/getitem/problem_description.hpp @@ -0,0 +1,181 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace getitem { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& dyDesc_, + uint32_t indexCount_, + const TensorDescriptor* const* indexDescs_, + const TensorDescriptor& dxDesc_, + const TensorDescriptor& errorDesc_, + uint32_t dimCount_, + const int32_t* dims_, + uint32_t sliceCount_, + const int32_t* slices_, + uint32_t offset_) + : dyDesc(dyDesc_), + indexCount(indexCount_), + indexDescs(indexDescs_), + dxDesc(dxDesc_), + errorDesc(errorDesc_), + dimCount(dimCount_), + dims(dims_), + sliceCount(sliceCount_), + slices(slices_), + offset(offset_) + { + IsValidIndexsLength(); + IsValidIndexs(); + IsValidDims(); + IsValidSlices(); + } + + ProblemDescription(const int32_t indexCount_, const TensorDescriptor* const* indexDescs_) + : indexCount(indexCount_), indexDescs(indexDescs_) + { + IsValidIndexsLength(); + IsValidIndexs(); + } + + const TensorDescriptor& GetDYDesc() const { return dyDesc; } + int32_t GetIndexCount() const { return indexCount; } + const TensorDescriptor& GetIndexDesc(int i) const + { + if(i >= indexCount) + { + MIOPEN_THROW(miopenStatusInternalError, "Item: Invalid tensor index."); + } + return (*indexDescs)[i]; + } + const TensorDescriptor& GetDXDesc() const { return dxDesc; } + const TensorDescriptor& GetErrorDesc() const { return errorDesc; } + int32_t GetDimCount() const { return dimCount; } + int32_t GetDim(int i) const + { + if(i >= indexCount) + { + MIOPEN_THROW(miopenStatusInternalError, "Item: Invalid dim index."); + } + return dims[i]; + } + int32_t GetSliceCount() const { return sliceCount; } + int32_t GetSlice(int i) const + { + if(i >= sliceCount) + { + MIOPEN_THROW(miopenStatusInternalError, "Item: Invalid slice index."); + } + return slices[i]; + } + int32_t GetOffset() const { 
return offset; } + + bool IsValidIndexsLength() const + { + if(indexCount > 0) + { + auto firstlength = (*indexDescs)[0]; + for(int32_t i = 1; i < indexCount; ++i) + { + if(firstlength != (*indexDescs)[i]) + MIOPEN_THROW(miopenStatusBadParm, + "Getitem: Indexs dimension lengths do not match."); + } + } + return true; + } + + bool IsValidIndexs() const + { + if(indexCount > 0) + { + if(indexDescs == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Getitem: indexDesc is nullptr."); + } + return true; + } + + bool IsValidDims() const + { + if(dimCount > 0) + + if(dims == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Getitem: dims is nullptr."); + return true; + } + + bool IsValidSlices() const + { + if(sliceCount > 0) + { + if(slices == nullptr) + MIOPEN_THROW(miopenStatusBadParm, "Getitem: slices is nullptr."); + } + return true; + } + + bool IsSameType() const + { + if(dyDesc.GetType() != dxDesc.GetType()) + { + return false; + } + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor dyDesc{}; + uint32_t indexCount = 0; + const TensorDescriptor* const* indexDescs = nullptr; + TensorDescriptor dxDesc{}; + TensorDescriptor errorDesc{}; + + uint32_t dimCount = 0; + const int32_t* dims = nullptr; + uint32_t sliceCount = 0; + const int32_t* slices = nullptr; + uint32_t offset = 0; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace getitem + +} // namespace miopen diff --git a/src/include/miopen/getitem/solvers.hpp b/src/include/miopen/getitem/solvers.hpp new file mode 100644 index 0000000000..f2edcbe437 --- /dev/null +++ b/src/include/miopen/getitem/solvers.hpp @@ -0,0 +1,57 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace getitem { + +using ItemSolver = NonTunableSolverBase; + +struct GetitemBackward final : ItemSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::getitem::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::getitem::ProblemDescription& problem) const override; + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::getitem::ProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + +} // namespace getitem + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/mlo_internal.hpp b/src/include/miopen/mlo_internal.hpp index 43d2e1a067..6252df3087 100644 --- a/src/include/miopen/mlo_internal.hpp +++ b/src/include/miopen/mlo_internal.hpp @@ -119,6 +119,8 @@ inline int AlignUp(int val, unsigned step) return static_cast(((static_cast(val) + step - 1) / step) * step); } +inline size_t AlignUp(size_t val, size_t step) { return (val + step - 1) / step * step; } + namespace miopen { struct TensorDescriptor; diff --git a/src/include/miopen/reduce/problem_description.hpp b/src/include/miopen/reduce/problem_description.hpp index 03001a155b..b48bd3b3ce 100644 --- a/src/include/miopen/reduce/problem_description.hpp +++ b/src/include/miopen/reduce/problem_description.hpp @@ -200,7 +200,8 @@ struct ProblemDescription : ProblemDescriptionBase TensorDescriptor indiceDesc; int32_t dim; - miopenReduceExtremeOp_t reduceExtremeOp; + + miopenReduceExtremeOp_t reduceExtremeOp = MIOPEN_REDUCE_CALCULATION_SUM; NetworkConfig MakeForwardNetworkConfig() const; }; diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp 
index 6ddd83bcef..81c15f6bea 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -58,7 +58,8 @@ enum class Primitive Cat, Mha, Softmax, - Adam + Adam, + Item }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp new file mode 100644 index 0000000000..9f7430ba8a --- /dev/null +++ b/src/include/miopen/tensor_view_utils.hpp @@ -0,0 +1,80 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_ +#define MIOPEN_TENSOR_VIEW_UTIL_HPP_ + +#include +#include "../../kernels/tensor_view.hpp" + +namespace miopen { + +template +inline tensor_view_t get_inner_expanded_tv(const TensorDescriptor Desc) +{ + auto dims = Desc.GetLengths(); + auto strides = Desc.GetStrides(); + + tensor_view_t tensor_view; + for(size_t i = 0; i < N; ++i) + { + if(i < dims.size()) + { + tensor_view.stride[i] = strides[i]; + tensor_view.size[i] = dims[i]; + } + else + { + tensor_view.stride[i] = (i == 0 ? 1 : strides[i - 1]); + tensor_view.size[i] = 1; + } + } + return tensor_view; +} + +template +inline void slice_tv(tensor_view_t& tensor_view, int32_t sliceCount, const int32_t* slices) +{ + for(int32_t i = 0; i < sliceCount; i++) + { + int32_t dim = slices[4 * i + 0]; + int32_t start = slices[4 * i + 1]; + int32_t end = slices[4 * i + 2]; + int32_t step = slices[4 * i + 3]; + + if(end > static_cast(tensor_view.size[dim])) + end = tensor_view.size[dim]; + + auto len = end - start; + + tensor_view.size[dim] = (len + step - 1) / step; + tensor_view.stride[dim] *= step; + } +} + +} // namespace miopen + +#endif // MIOPEN_TENSOR_REORDER_UTIL_HPP_ diff --git a/src/kernels/MIOpenGetitem.cpp b/src/kernels/MIOpenGetitem.cpp new file mode 100644 index 0000000000..4daba996c8 --- /dev/null +++ b/src/kernels/MIOpenGetitem.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "hip_atomic.hpp" +#include "miopen_cstdint.hpp" +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void getitembuildindices(const IDX* __restrict__ index, + IDX* __restrict__ element_index, + E* __restrict__ error, + int32_t index_dim, + int32_t indexCount, + int32_t dim_size, + tensor_view_t<5> index_tv, + int32_t dim_offset, + int32_t dim_info_offset) +{ + const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> ncdhw(index_tv, gid); + + if(ncdhw.layout[0] >= index_tv.size[0]) + return; + + uint64_t idx = index_tv.get_tensor_view_idx(ncdhw); + IDX getitem_index = index[idx]; + + if(getitem_index >= 0 && getitem_index < dim_size) + { + element_index[(gid * indexCount) + dim_offset] = getitem_index; + } + else if(getitem_index >= -dim_size && getitem_index < 0) + { + element_index[(gid * indexCount) + dim_offset] = getitem_index + dim_size; + } + else + { + error[dim_offset] = -1; + } + + if(gid == 0) + { + element_index[dim_info_offset + dim_offset] = index_dim; + } +} + +template +__device__ void getitembwd(const TI* __restrict__ dy, + IDX* __restrict__ element_index, + TO* __restrict__ dx, + int32_t start_dim, + int32_t indexCount, + tensor_view_t<5> dy_tv, + tensor_view_t<5> dx_tv, + int32_t dim_info_offset, + int32_t offset) +{ + const uint64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + + tensor_layout_t<5> ncdhw(dy_tv, gid); + + if(ncdhw.layout[0] >= dy_tv.size[0]) + return; + + tensor_layout_t<5> idx = ncdhw; + + if(indexCount > 0) + { + int32_t dim_cursor = ncdhw.layout[start_dim]; + int32_t i = start_dim; + int32_t j = 0; + + for(; i < start_dim + indexCount; ++i, ++j) + { + uint64_t dim_idx = static_cast(element_index[dim_info_offset + j]); + idx.layout[dim_idx] = + static_cast(element_index[(dim_cursor * indexCount) + j]); + } + 
+ i = element_index[dim_info_offset + indexCount - 1] + 1; + dim_cursor = start_dim + 1; + for(; i < 5; ++i, ++dim_cursor) + { + idx.layout[i] = ncdhw.layout[dim_cursor]; + } + } + + idx.layout[0] += offset; + ncdhw.layout[0] += offset; + + atomic_add_g(&dx[dx_tv.get_tensor_view_idx(idx)], dy[dy_tv.get_tensor_view_idx(ncdhw)]); +} + +extern "C" __global__ void GetItemBuildIndices(const INDEX_TYPE* __restrict__ index, + INDEX_TYPE* __restrict__ element_index, + ERROR_TYPE* __restrict__ error, + int32_t index_dim, + int32_t indexCount, + int32_t dim_size, + tensor_view_t<5> index_tv, + int32_t dim_offset, + int32_t dim_info_offset) +{ + // instantiate the kernel + getitembuildindices(index, + element_index, + error, + index_dim, + indexCount, + dim_size, + index_tv, + dim_offset, + dim_info_offset); +} + +extern "C" __global__ void GetitemBwd(const INPUT_TYPE* __restrict__ dy, + INDEX_TYPE* __restrict__ element_index, + OUTPUT_TYPE* __restrict__ dx, + int32_t start_dim, + int32_t indexCount, + tensor_view_t<5> dy_tv, + tensor_view_t<5> dx_tv, + int32_t dim_info_offset, + int32_t offset) +{ + // instantiate the kernel + getitembwd( + dy, element_index, dx, start_dim, indexCount, dy_tv, dx_tv, dim_info_offset, offset); +} diff --git a/src/kernels/MIOpenGroupNorm.cpp b/src/kernels/MIOpenGroupNorm.cpp index 54d70d323b..1ddf58d232 100644 --- a/src/kernels/MIOpenGroupNorm.cpp +++ b/src/kernels/MIOpenGroupNorm.cpp @@ -30,17 +30,18 @@ #include "float_types.h" -extern "C" __global__ void GroupNormFwdContiguous(const FLOAT* __restrict__ x, - FLOAT* __restrict__ y, - const FLOAT* __restrict__ weight, - const FLOAT* __restrict__ bias, - FLOAT_ACCUM* __restrict__ mean, - FLOAT_ACCUM* __restrict__ rstd, - float eps, - uint64_t num_groups, - uint64_t num_channels, - uint64_t numel_per_channel, - bool mode) +template +__device__ void groupnormfwdcontiguous(const TI* __restrict__ x, + const TI* __restrict__ weight, + const TI* __restrict__ bias, + TO* __restrict__ y, + TO* 
__restrict__ mean, + TO* __restrict__ rstd, + float eps, + uint64_t num_groups, + uint64_t num_channels, + uint64_t numel_per_channel, + bool mode) { /* * Each group works on a single channel. @@ -98,9 +99,9 @@ extern "C" __global__ void GroupNormFwdContiguous(const FLOAT* __restrict__ x, if(lid == 0) { if(mean) - mean[gid] = pmean; + mean[gid] = CVT_ACCUM2FLOAT(pmean); if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -119,3 +120,20 @@ extern "C" __global__ void GroupNormFwdContiguous(const FLOAT* __restrict__ x, y[idx] = CVT_ACCUM2FLOAT(val); } } + +extern "C" __global__ void GroupNormFwdContiguous(const INPUT_TYPE* __restrict__ x, + const INPUT_TYPE* __restrict__ weight, + const INPUT_TYPE* __restrict__ bias, + OUTPUT_TYPE* __restrict__ y, + OUTPUT_TYPE* __restrict__ mean, + OUTPUT_TYPE* __restrict__ rstd, + float eps, + uint64_t num_groups, + uint64_t num_channels, + uint64_t numel_per_channel, + bool mode) +{ + // instantiate the kernel + groupnormfwdcontiguous( + x, weight, bias, y, mean, rstd, eps, num_groups, num_channels, numel_per_channel, mode); +} diff --git a/src/kernels/MIOpenLayerNorm.cpp b/src/kernels/MIOpenLayerNorm.cpp index 21880d4b66..9a5e736f94 100644 --- a/src/kernels/MIOpenLayerNorm.cpp +++ b/src/kernels/MIOpenLayerNorm.cpp @@ -40,7 +40,7 @@ __device__ void layernormfwdcontiguous(const TI* __restrict__ x, TO* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { /* * Each group works on a single channel. 
@@ -96,9 +96,9 @@ __device__ void layernormfwdcontiguous(const TI* __restrict__ x, if(lid == 0) { if(mean) - mean[gid] = pmean; + mean[gid] = CVT_ACCUM2FLOAT(pmean); if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -129,7 +129,7 @@ __device__ void addlayernormfwdcontiguous(const TI* __restrict__ x, TO* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { const uint64_t gid = blockIdx.x; const uint64_t lid = threadIdx.x; @@ -168,9 +168,9 @@ __device__ void addlayernormfwdcontiguous(const TI* __restrict__ x, if(lid == 0) { if(mean) - mean[gid] = pmean; + mean[gid] = CVT_ACCUM2FLOAT(pmean); if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -199,7 +199,7 @@ __device__ void t5layernormfwdcontiguous(const TI* __restrict__ x, TO* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { const uint64_t gid = blockIdx.x; const uint64_t lid = threadIdx.x; @@ -232,7 +232,7 @@ __device__ void t5layernormfwdcontiguous(const TI* __restrict__ x, if(lid == 0) { if(rstd) - rstd[gid] = prstd; + rstd[gid] = CVT_ACCUM2FLOAT(prstd); } // forward calculation @@ -257,7 +257,7 @@ __device__ void t5layernormbwdcontiguous(const TI* __restrict__ dy, const TI* __restrict__ rstd, TO* __restrict__ dx, uint64_t inner_size, - bool mode) + int32_t mode) { const uint64_t gid = blockIdx.x; const uint64_t lid = threadIdx.x; @@ -401,7 +401,7 @@ extern "C" __global__ void LayernormFwdContiguous(const INPUT_TYPE* __restrict__ OUTPUT_TYPE* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel layernormfwdcontiguous( @@ -417,7 +417,7 @@ extern "C" __global__ void AddLayernormFwdContiguous(const INPUT_TYPE* __restric OUTPUT_TYPE* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel addlayernormfwdcontiguous( @@ -430,7 +430,7 @@ extern "C" __global__ void 
T5LayernormFwdContiguous(const INPUT_TYPE* __restrict OUTPUT_TYPE* __restrict__ rstd, float eps, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel t5layernormfwdcontiguous(x, weight, y, rstd, eps, inner_size, mode); @@ -442,7 +442,7 @@ extern "C" __global__ void T5LayernormBwdContiguous(const INPUT_TYPE* __restrict const INPUT_TYPE* __restrict__ rstd, OUTPUT_TYPE* __restrict__ dx, uint64_t inner_size, - bool mode) + int32_t mode) { // instantiate the kernel t5layernormbwdcontiguous(dy, x, weight, rstd, dx, inner_size, mode); diff --git a/src/kernels/hip_atomic.hpp b/src/kernels/hip_atomic.hpp new file mode 100644 index 0000000000..aad6b0a63e --- /dev/null +++ b/src/kernels/hip_atomic.hpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +__device__ static inline __half __ushort_as___half(ushort x) +{ + static_assert(sizeof(ushort) == sizeof(__half), ""); + + __half tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; +} + +__device__ static inline ushort ____half_as_ushort(__half x) +{ + static_assert(sizeof(ushort) == sizeof(__half), ""); + + ushort tmp; + __builtin_memcpy(&tmp, &x, sizeof(tmp)); + + return tmp; +} + +__device__ inline void atomic_add_g(ushort* addr, const float val) +{ + size_t offset = reinterpret_cast(addr) & 0x2; + bool is_32_align = offset; + uint32_t* addr_as_uint32_t = + reinterpret_cast(reinterpret_cast(addr) - offset); + uint32_t current = *addr_as_uint32_t; + + uint32_t expected; + + do + { + expected = current; + ushort current_ushort = is_32_align ? current >> 16 : current & 0xffff; + + float next_float = __uint_as_float(static_cast(current_ushort) << 16) + val; + ushort next_ushort = static_cast(__float_as_uint(next_float) >> 16); + uint32_t next = is_32_align ? (current & 0xffff) | (next_ushort << 16) + : (current & 0xffff0000) | next_ushort; + + current = atomicCAS(addr_as_uint32_t, expected, next); + } while(current != expected); +} + +__device__ inline void atomic_add_g(__half* addr, const __half val) +{ + size_t offset = reinterpret_cast(addr) & 0x2; + bool is_32_align = offset; + uint32_t* addr_as_uint32_t = + reinterpret_cast(reinterpret_cast(addr) - offset); + uint32_t current = *addr_as_uint32_t; + + uint32_t expected; + + do + { + expected = current; + ushort current_ushort = is_32_align ? current >> 16 : current & 0xffff; + + ushort next_ushort = ____half_as_ushort(__ushort_as___half(current_ushort) + val); + uint32_t next = is_32_align ? 
(current & 0xffff) | (next_ushort << 16) + : (current & 0xffff0000) | next_ushort; + + current = atomicCAS(addr_as_uint32_t, expected, next); + } while(current != expected); +} + +__device__ inline void atomic_add_g(float* addr, const float val) { atomicAdd(addr, val); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp new file mode 100644 index 0000000000..d35bfd93fc --- /dev/null +++ b/src/kernels/tensor_view.hpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_TENSOR_VIEW_HPP +#define GUARD_TENSOR_VIEW_HPP + +template +struct tensor_layout_t; + +template +struct tensor_view_t +{ + // Get index in tensor view at tensor layout + constexpr uint64_t get_tensor_view_idx(const tensor_layout_t& tensor_layout) + { + static_assert(N > 0); + uint64_t idx = 0; + for(auto i = 0; i < N; ++i) + { + idx += stride[i] * tensor_layout.layout[i]; + } + return idx; + } + uint64_t stride[N]; + uint64_t size[N]; +}; + +template +struct tensor_layout_t +{ + // Make tensor layout at index using tensor view + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t idx) + { + static_assert(N > 0); + uint64_t temp = idx; + if constexpr(N == 1) + { + layout[0] = idx; + } + else + { + for(auto i = N - 1; i > 1; --i) + { + layout[i] = temp % tensor_view.size[i]; + temp = temp / tensor_view.size[i]; + } + layout[1] = temp % tensor_view.size[1]; + layout[0] = temp / tensor_view.size[1]; + } + } + + uint64_t layout[N]; +}; + +#endif // GUARD_TENSOR_VIEW_HPP diff --git a/src/reduce/problem_description.cpp b/src/reduce/problem_description.cpp index ac73d16a02..c50ca4f755 100644 --- a/src/reduce/problem_description.cpp +++ b/src/reduce/problem_description.cpp @@ -38,7 +38,8 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const auto xlength = xDesc.GetLengths(); std::vector outputlength; if((reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MIN) || - (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX)) + (reduceExtremeOp == MIOPEN_REDUCE_EXTREME_MAX) || + (reduceExtremeOp == MIOPEN_REDUCE_CALCULATION_SUM)) outputlength = yDesc.GetLengths(); else outputlength = indiceDesc.GetLengths(); diff --git a/src/solver.cpp b/src/solver.cpp index e468d38d0a..6b451ca498 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -662,6 +663,7 @@ inline 
SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Cat, cat::CatForward{}.SolverDbId()); Register(registry, ++id, Primitive::Adam, adam::Adam{}.SolverDbId()); + Register(registry, ++id, Primitive::Item, getitem::GetitemBackward{}.SolverDbId()); Register(registry, ++id, Primitive::Adam, adam::TransformersAdamW{}.SolverDbId()); diff --git a/src/solver/getitem/backward_getitem.cpp b/src/solver/getitem/backward_getitem.cpp new file mode 100644 index 0000000000..ab44832b8b --- /dev/null +++ b/src/solver/getitem/backward_getitem.cpp @@ -0,0 +1,283 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace getitem { + +bool IsLargeIndex(const miopen::getitem::ProblemDescription& problem) +{ + auto dy_dims = problem.GetDYDesc().GetLengths(); + auto dx_dims = problem.GetDXDesc().GetLengths(); + + for(int32_t i = 0; i < problem.GetDimCount(); i++) + { + if(dy_dims[problem.GetDim(i)] / dx_dims[problem.GetDim(i)] > 400) + return false; + } + + return true; +} + +bool GetitemBackward::IsApplicable(const ExecutionContext& /*context*/, + const miopen::getitem::ProblemDescription& problem) const +{ + if(!problem.IsSameType()) + return false; + if(!IsLargeIndex(problem)) + return false; + return true; +} + +ConvSolution GetitemBackward::GetSolution(const ExecutionContext& /*context*/, + const miopen::getitem::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + const auto& dtype = problem.GetDYDesc().GetType(); + const auto& input_dtype = miopen::GetDataType(problem.GetDYDesc().GetType()); + const auto& index_dtype = miopen::GetDataType(problem.GetIndexDesc(0).GetType()); + const auto& error_dtype = miopen::GetDataType(problem.GetErrorDesc().GetType()); + const auto& output_dtype = miopen::GetDataType(problem.GetDXDesc().GetType()); + const auto& dy_dims = problem.GetDYDesc().GetLengths(); + const auto& indexCount = problem.GetIndexCount(); + + auto dy_numel = + std::accumulate(dy_dims.begin(), dy_dims.end(), 1ULL, std::multiplies()); + + for(int32_t i = 0; i < indexCount; i++) + { + const auto& index_dims = problem.GetIndexDesc(i).GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(index_numel, xlocalsize); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t 
zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenGetitem.cpp"; + kernel.kernel_name = "GetItemBuildIndices"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"INDEX_TYPE", index_dtype}, + {"ERROR_TYPE", error_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + { + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(dy_numel, xlocalsize); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenGetitem.cpp"; + kernel.kernel_name = "GetitemBwd"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"INDEX_TYPE", index_dtype}, + {"ERROR_TYPE", error_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + {"LOCAL_SIZE", LOCAL_SIZE}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + const auto& start_dim = params.dims[0]; + const auto& dx_dims = params.dxDesc.GetLengths(); + + const auto& dims = params.dims; + const auto& dimCount = params.dimCount; + + std::vector output_dims(dimCount); + for(int32_t i = 0; i < dimCount; i++) + { + output_dims[i] = static_cast(dx_dims[dims[i]]); + } + + const auto& indexCount = params.indexCount; + const auto& index_dims = params.indexDescs[0]->GetLengths(); + const auto& sliceCount = params.sliceCount; + const auto& slices = params.slices; + auto dim_info_offset = + indexCount > 0 ? 
indexCount * static_cast(index_dims[0]) : 0; + + auto dy_tv = get_inner_expanded_tv<5>(params.dyDesc); + auto dx_tv = get_inner_expanded_tv<5>(params.dxDesc); + + slice_tv<5>(dx_tv, sliceCount, slices); + + auto elapsed = 0.f; + HipEventPtr start; + HipEventPtr stop; + bool reset_profiling_state = false; + + for(int32_t i = 0; i < indexCount; i++) + { + decltype(auto) build_index_kernel = handle_.Run(kernels[i]); + + const auto& index_dim = dims[i]; + const auto& dim_size = output_dims[i]; + auto index_tv = get_inner_expanded_tv<5>(*params.indexDescs[i]); + const auto& dim_offset = i; + + if((i == 0) && handle_.IsProfilingEnabled()) + { + handle_.EnableProfiling(false); + reset_profiling_state = true; + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + build_index_kernel(params.indexs[i], + params.workspace, + params.error, + index_dim, + indexCount, + dim_size, + index_tv, + dim_offset, + dim_info_offset); + } + + if((indexCount == 0) && handle_.IsProfilingEnabled()) + { + handle_.EnableProfiling(false); + reset_profiling_state = true; + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + decltype(auto) kernel = handle_.Run(kernels[indexCount]); + + kernel(params.dy, + params.workspace, + params.dx, + start_dim, + indexCount, + dy_tv, + dx_tv, + dim_info_offset, + params.offset); + + if(reset_profiling_state) + { + hipEventRecord(stop.get(), handle_.GetStream()); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.EnableProfiling(true); + }; + }; + }; + + return result; +} + +std::size_t +GetitemBackward::GetWorkspaceSize(const ExecutionContext& /*context*/, + const miopen::getitem::ProblemDescription& problem) const +{ + const auto& 
indexCount = problem.GetIndexCount(); + if(indexCount > 0) + { + const auto& index_dims = problem.GetIndexDesc(0).GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + return (indexCount * index_numel + problem.GetIndexCount()) * + get_data_size(problem.GetIndexDesc(0).GetType()); + } + + return 0; +} + +} // namespace getitem + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/groupnorm/forward_groupnorm.cpp b/src/solver/groupnorm/forward_groupnorm.cpp index e4018d16ab..11f66e2f83 100644 --- a/src/solver/groupnorm/forward_groupnorm.cpp +++ b/src/solver/groupnorm/forward_groupnorm.cpp @@ -75,8 +75,10 @@ GroupNormForward::GetSolution(const ExecutionContext& context, auto result = ConvSolution{miopenStatusSuccess}; { - auto dtype = problem.GetXDesc().GetType(); - auto dims = problem.GetXDesc().GetLengths(); + auto dtype = problem.GetXDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto dims = problem.GetXDesc().GetLengths(); size_t num_groups = problem.GetNumGroups(); size_t outer_size = dims[0] * num_groups; @@ -98,6 +100,8 @@ GroupNormForward::GetSolution(const ExecutionContext& context, {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, {"LOCAL_SIZE", LOCAL_SIZE}, }; @@ -125,9 +129,9 @@ GroupNormForward::GetSolution(const ExecutionContext& context, size_t num_channels = dims[1]; kernel(params.x, - params.y, params.weight, params.bias, + params.y, params.mean, params.rstd, params.epsilon, diff --git a/src/solver/layernorm/backward_t5layernorm.cpp b/src/solver/layernorm/backward_t5layernorm.cpp index c62a756b77..cf984f2e77 100644 --- a/src/solver/layernorm/backward_t5layernorm.cpp +++ b/src/solver/layernorm/backward_t5layernorm.cpp @@ -278,7 +278,7 @@ T5LayernormBackward::GetSolution(const ExecutionContext& context, params.rstd, params.dx, inner_size, - static_cast(params.mode % 2)); + static_cast(params.mode)); weight_parallel_kernel(params.dy, params.x, @@ -333,7 +333,7 @@ T5LayernormBackward::GetSolution(const ExecutionContext& context, params.rstd, params.dx, inner_size, - static_cast(params.mode % 2)); + static_cast(params.mode)); weight_kernel(params.dy, params.x, params.rstd, params.dw, outer_size, inner_size); diff --git a/src/solver/layernorm/forward_addlayernorm.cpp b/src/solver/layernorm/forward_addlayernorm.cpp index ba366b318d..98c2c7ca55 100644 --- a/src/solver/layernorm/forward_addlayernorm.cpp +++ b/src/solver/layernorm/forward_addlayernorm.cpp @@ -138,7 +138,7 @@ AddLayernormForward::GetSolution(const ExecutionContext& context, params.rstd, params.epsilon, inner_size, - static_cast(params.mode % 2)); + static_cast(params.mode)); }; }; diff --git a/src/solver/layernorm/forward_layernorm.cpp b/src/solver/layernorm/forward_layernorm.cpp index ffbe479f1f..81e5641836 100644 --- a/src/solver/layernorm/forward_layernorm.cpp +++ b/src/solver/layernorm/forward_layernorm.cpp @@ -137,7 +137,7 @@ LayernormForward::GetSolution(const ExecutionContext& context, params.rstd, params.epsilon, inner_size, - static_cast(params.mode)); + static_cast(params.mode)); }; }; diff --git a/src/solver/reduce/forward_argmax.cpp b/src/solver/reduce/forward_argmax.cpp index 
4a44887ea4..9c2f1203cd 100644 --- a/src/solver/reduce/forward_argmax.cpp +++ b/src/solver/reduce/forward_argmax.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t ArgmaxForward::XGridSize(std::vector indicedims) const { - auto indice_numel = + size_t indice_numel = std::accumulate(indicedims.begin(), indicedims.end(), 1ULL, std::multiplies()); return AlignUp(indice_numel, LOCAL_SIZE); } diff --git a/src/solver/reduce/forward_argmin.cpp b/src/solver/reduce/forward_argmin.cpp index c0b3d15aa0..51471c5466 100644 --- a/src/solver/reduce/forward_argmin.cpp +++ b/src/solver/reduce/forward_argmin.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t ArgminForward::XGridSize(std::vector indicedims) const { - auto indice_numel = + size_t indice_numel = std::accumulate(indicedims.begin(), indicedims.end(), 1ULL, std::multiplies()); return AlignUp(indice_numel, LOCAL_SIZE); } diff --git a/src/solver/reduce/forward_max.cpp b/src/solver/reduce/forward_max.cpp index 9537c300cf..a759d9bcfa 100644 --- a/src/solver/reduce/forward_max.cpp +++ b/src/solver/reduce/forward_max.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t MaxForward::XGridSize(std::vector ydims) const { - auto output_numel = + size_t output_numel = std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); return AlignUp(output_numel, LOCAL_SIZE); } diff --git a/src/solver/reduce/forward_min.cpp b/src/solver/reduce/forward_min.cpp index f7aae43779..abb0c4b5bd 100644 --- a/src/solver/reduce/forward_min.cpp +++ b/src/solver/reduce/forward_min.cpp @@ -40,7 +40,7 @@ namespace reduce { size_t MinForward::XGridSize(std::vector ydims) const { - auto output_numel = + size_t output_numel = std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); return AlignUp(output_numel, LOCAL_SIZE); } diff --git a/src/sum.cpp b/src/sum.cpp index 00caefa1a9..ddfb21917a 100644 --- a/src/sum.cpp +++ b/src/sum.cpp @@ -47,9 +47,9 @@ std::size_t GetSumWorkspaceSize(Handle& handle, const auto algo = AlgorithmName{"SumForward"}; 
const auto solvers = solver::SolverContainer{}; - auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem, true); - return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; + return pair_size_vector.empty() ? static_cast(0) : pair_size_vector.front().second; } miopenStatus_t SumForward(Handle& handle, diff --git a/src/t5layernorm.cpp b/src/t5layernorm.cpp index 680270c4b0..5978fdd677 100644 --- a/src/t5layernorm.cpp +++ b/src/t5layernorm.cpp @@ -86,9 +86,9 @@ std::size_t GetT5LayerNormBackwardWorkspaceSize(Handle& handle, const auto algo = AlgorithmName{"T5LayerNormBackward"}; const auto solvers = solver::SolverContainer{}; - auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem, true); - return pair_size_vector.empty() ? static_cast(-1) : pair_size_vector.front().second; + return pair_size_vector.empty() ? static_cast(0) : pair_size_vector.front().second; } miopenStatus_t T5LayerNormBackward(Handle& handle, diff --git a/test/gtest/addlayernorm.cpp b/test/gtest/addlayernorm.cpp index 015f427f90..ae39096b7a 100644 --- a/test/gtest/addlayernorm.cpp +++ b/test/gtest/addlayernorm.cpp @@ -59,8 +59,8 @@ using namespace addlayernorm; TEST_P(GPU_AddLayerNorm_FP32, AddLayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -73,8 +73,8 @@ TEST_P(GPU_AddLayerNorm_FP32, AddLayerNormTestFw) TEST_P(GPU_AddLayerNorm_FP16, AddLayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ 
-87,8 +87,8 @@ TEST_P(GPU_AddLayerNorm_FP16, AddLayerNormTestFw) TEST_P(GPU_AddLayerNorm_BFP16, AddLayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/addlayernorm.hpp b/test/gtest/addlayernorm.hpp index 0be011e683..da65ca93c6 100644 --- a/test/gtest/addlayernorm.hpp +++ b/test/gtest/addlayernorm.hpp @@ -78,8 +78,10 @@ void cpu_addlayernorm_forward(tensor input, ref_rstd[o] = static_cast(rstd_v); ford(inner_size)([&](int32_t i) { - float weight_v = mode ? static_cast(weight[i]) : 1; - float bias_v = mode ? static_cast(bias[i]) : 0; + float weight_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? 1 : static_cast(weight[i]); + float bias_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE_FUSED_ADD) ? 0 : static_cast(bias[i]); ref_output[o * inner_size + i] = static_cast((static_cast(input[o * inner_size + i]) + static_cast(input2[o * inner_size + i]) - mean_v) * diff --git a/test/gtest/cat.cpp b/test/gtest/cat.cpp index 7b394093f5..6bc405e2f0 100644 --- a/test/gtest/cat.cpp +++ b/test/gtest/cat.cpp @@ -52,7 +52,8 @@ using namespace cat; TEST_P(CatTestFloat, CatTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); diff --git a/test/gtest/getitem.cpp b/test/gtest/getitem.cpp new file mode 100644 index 0000000000..6db0c25fff --- /dev/null +++ b/test/gtest/getitem.cpp @@ -0,0 +1,110 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "getitem.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace getitem { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GetitemBwdTestFloat : GetitemBwdTest +{ +}; + +struct GetitemBwdTestHalf : GetitemBwdTest +{ +}; + +struct GetitemBwdTestBFloat16 : GetitemBwdTest +{ +}; + +} // namespace getitem +using namespace getitem; + +TEST_P(GetitemBwdTestFloat, GetitemBwdTest) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GetitemBwdTestHalf, GetitemBwdTest) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GetitemBwdTestBFloat16, GetitemBwdTest) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(GetitemTestSet, + GetitemBwdTestFloat, + testing::ValuesIn(GetitemTestConfigs())); +INSTANTIATE_TEST_SUITE_P(GetitemTestSet, + GetitemBwdTestHalf, + testing::ValuesIn(GetitemTestConfigs())); +INSTANTIATE_TEST_SUITE_P(GetitemTestSet, + GetitemBwdTestBFloat16, + testing::ValuesIn(GetitemTestConfigs())); diff --git a/test/gtest/getitem.hpp b/test/gtest/getitem.hpp new file mode 100644 index 0000000000..264f002985 --- /dev/null +++ b/test/gtest/getitem.hpp @@ -0,0 +1,407 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "../driver/tensor_driver.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include + +template +void cpu_getitem_backward(tensor dy, + uint32_t indexCount, + std::vector> indexs, + tensor& ref_dx, + tensor& ref_error, + uint32_t dimCount, + int32_t* dims, + uint32_t sliceCount, + int32_t* slices, + uint32_t offset) +{ + auto dy_dims = dy.desc.GetLengths(); + auto dy_numel = std::accumulate(dy_dims.begin(), dy_dims.end(), 1L, std::multiplies()); + auto dx_dims = ref_dx.desc.GetLengths(); + auto index_dims = indexs[0].desc.GetLengths(); + auto index_numel = + std::accumulate(index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + auto element_index = std::vector(indexCount * index_numel + indexCount); + + std::vector output_dims; + for(int32_t i = 0; i < dimCount; i++) + { + output_dims.push_back(dx_dims[dims[i]]); + } + + auto dim_info_offset = indexCount > 0 ? 
indexCount * index_dims[0] : 0; + auto start_dim = dims[0]; + + auto dy_tv = miopen::get_inner_expanded_tv<5>(dy.desc); + auto ref_dx_tv = miopen::get_inner_expanded_tv<5>(ref_dx.desc); + miopen::slice_tv<5>(ref_dx_tv, sliceCount, slices); + + // Get element index form indexs + for(int j = 0; j < indexCount; j++) + { + auto index_dim = dims[j]; + auto dim_size = output_dims[j]; + + par_ford(index_numel)([&](int32_t o) { + int32_t getitem_index = indexs[j][o]; + + if(getitem_index >= 0 && getitem_index < dim_size) + { + element_index[(o * indexCount) + j] = getitem_index; + } + else if(getitem_index >= -dim_size && getitem_index < 0) + { + element_index[(o * indexCount) + j] = getitem_index + dim_size; + } + else + { + ref_error[j] = -1; + } + + if(o == 0) + { + element_index[dim_info_offset + j] = index_dim; + } + }); + } + + // GetItem + par_ford(dy_numel)([&](int32_t o) { + tensor_layout_t<5> ncdhw(dy_tv, o); + tensor_layout_t<5> idx(ncdhw); + + if(indexCount > 0) + { + size_t dim_cursor = ncdhw.layout[start_dim]; + size_t i = start_dim; + size_t j = 0; + + for(; i < start_dim + indexCount; ++i, ++j) + { + size_t dim_idx = element_index[dim_info_offset + j]; + idx.layout[dim_idx] = element_index[(dim_cursor * indexCount) + j]; + } + + i = element_index[dim_info_offset + indexCount - 1] + 1; + dim_cursor = start_dim + 1; + for(; i < 5; ++i, ++dim_cursor) + { + idx.layout[i] = ncdhw.layout[dim_cursor]; + } + } + + ref_dx[ref_dx_tv.get_tensor_view_idx(idx)] += dy[dy_tv.get_tensor_view_idx(ncdhw)]; + }); +} + +struct GetitemTestCase +{ + std::vector dy; + std::vector> indexs; + std::vector dx; + std::vector dims; + std::vector> slices; + uint32_t offset; + + friend std::ostream& operator<<(std::ostream& os, const GetitemTestCase& tc) + { + + os << " dy:"; + auto dy_s = tc.dy; + os << dy_s[0]; + for(int32_t i = 1; i < dy_s.size(); i++) + { + os << "x" << dy_s[i]; + } + + os << " indexs:"; + for(int32_t i = 0; i < tc.indexs.size(); i++) + { + auto index_s = 
tc.indexs[i]; + if(i != 0) + os << ","; + os << index_s[0]; + for(int32_t j = 1; j < index_s.size(); j++) + { + os << "index" << index_s[j]; + } + } + + os << " dx:"; + auto dx_s = tc.dx; + os << dx_s[0]; + for(int32_t i = 1; i < dx_s.size(); i++) + { + os << "x" << dx_s[i]; + } + + os << " dims:"; + auto dims_s = tc.dims; + os << dims_s[0]; + for(int32_t i = 1; i < dims_s.size(); i++) + { + os << "," << dims_s[i]; + } + + os << " slices:"; + for(int32_t i = 0; i < tc.slices.size(); i++) + { + auto slice_s = tc.slices[i]; + if(i != 0) + os << ","; + os << slice_s[0]; + for(int32_t j = 1; j < slice_s.size(); j++) + { + os << "slice" << slice_s[j]; + } + } + + os << " offset:" << tc.offset; + + return os; + } + + std::vector GetDy() { return dy; } + + std::vector> GetIndexs() { return indexs; } + + std::vector GetDx() { return dx; } + + std::vector GetDims() { return dims; } + + std::vector> GetSlices() { return slices; } +}; + +std::vector GetitemTestConfigs() +{ // dy indexs dx dims slices offset + // clang-format off + return { + { {128, 128}, {{128}}, {128, 128}, {0}, {}, 0}, //llama2 + { {16, 4}, {{16}}, {3234, 4}, {0}, {}, 0}, //ssdlite + { {149, 128}, {{1490}}, {1490, 1128}, {0}, {}, 0}, //llama2_7b + { {10, 128}, {{10}}, {160, 128}, {0}, {}, 0}, + { {4260, 4}, {{4300}}, {4300, 4}, {0}, {}, 0}, //fasterrcnn + { {4260}, {{4300}}, {4300}, {0}, {}, 0} //maskrcnn + }; + // clang-format on +} + +template +struct GetitemBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + getitem_config = GetParam(); + auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; + + dims = getitem_config.GetDims(); + slices = getitem_config.GetSlices(); + offset = getitem_config.offset; + + for(auto slice : slices) + { + for(int32_t i = 0; i < 4; i++) + { + slices_flat.push_back(slice[i]); + } + } + + auto dy_dim = getitem_config.GetDy(); + auto indexs_dim = getitem_config.GetIndexs(); + auto dx_dim = getitem_config.GetDx(); + std::vector error_dim; + error_dim.push_back(indexs_dim.size()); + + dy = tensor{dy_dim}.generate(gen_value); + + auto output_dims = std::vector{}; + for(auto dim : dims) + { + output_dims.push_back(static_cast(dx_dim[dim])); + } + + for(int32_t i = 0; i < indexs_dim.size(); i++) + { + auto index = tensor{indexs_dim[i]}; + auto index_dims = index.desc.GetLengths(); + auto index_numel = std::accumulate( + index_dims.begin(), index_dims.end(), 1L, std::multiplies()); + for(int32_t j = 0; j < index_numel; j++) + { + index[j] = prng::gen_0_to_B(output_dims[i]); + } + indexs.push_back(index); + } + + dx = tensor{dx_dim}; + std::fill(dx.begin(), dx.end(), static_cast(0)); + + error = tensor{error_dim}; + std::fill(error.begin(), error.end(), static_cast(0)); + + ref_error = tensor{error_dim}; + std::fill(ref_error.begin(), ref_error.end(), static_cast(0)); + + ref_dx = tensor{dx_dim}; + std::fill(ref_dx.begin(), ref_dx.end(), static_cast(0)); + + std::vector indexDescs; + + std::transform(indexs.begin(), + indexs.end(), + std::back_inserter(indexDescs), + [](auto& index) { return &index.desc; }); + + std::vector workspace_dims; + ws_sizeInBytes = + miopen::GetGetitemWorkspaceSize(handle, indexDescs.size(), indexDescs.data()); + + workspace_dims.push_back(ws_sizeInBytes / sizeof(T)); + if(ws_sizeInBytes != 0) + { + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), static_cast(0)); + workspace_dev = handle.Write(workspace.data); + } + + dy_dev = handle.Write(dy.data); + + std::transform(indexs.begin(), + indexs.end(), + 
std::back_inserter(indexs_dev), + [&](auto& index) { return handle.Write(index.data); }); + + dx_dev = handle.Write(dx.data); + error_dev = handle.Write(error.data); + } + void RunTest() + { + auto&& handle = get_handle(); + cpu_getitem_backward(dy, + indexs.size(), + indexs, + ref_dx, + ref_error, + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + std::vector indexDescs; + std::vector indexData; + + std::transform(indexs.begin(), + indexs.end(), + std::back_inserter(indexDescs), + [](auto& index) { return &index.desc; }); + std::transform(indexs_dev.begin(), + indexs_dev.end(), + std::back_inserter(indexData), + [](auto& index_dev) { return index_dev.get(); }); + + miopenStatus_t status = miopen::GetitemBackward(handle, + workspace_dev.get(), + ws_sizeInBytes, + dy.desc, + dy_dev.get(), + indexDescs.size(), + indexDescs.data(), + indexData.data(), + dx.desc, + dx_dev.get(), + error.desc, + error_dev.get(), + dims.size(), + dims.data(), + slices.size(), + slices_flat.data(), + offset); + + EXPECT_EQ(status, miopenStatusSuccess); + + dx.data = handle.Read(dx_dev, dx.data.size()); + error.data = handle.Read(error_dev, error.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + // In the case of layernorm, there is a cumulative sum operation, and in the case of + // floating point operation, the result value can change if the order of the summed values + // is changed. So apply a threshold that is 10 times larger than other operations. + auto threshold = std::is_same::value ? 1.5e-4 : 8.2e-1; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + // If there is an atomic operation on the GPU kernel, a large error occurs depending on the + // calculation order, so it is multiplied by 10 times. 
+ if(std::is_same::value) + threshold *= 8000.0; + + auto error_dx = miopen::rms_range(ref_dx, dx); + EXPECT_TRUE(miopen::range_distance(ref_dx) == miopen::range_distance(dx)); + EXPECT_TRUE(error_dx < threshold * 10) << "Error dx beyond tolerance Error:" << error_dx + << ", Thresholdx10: " << threshold * 10; + + auto error_error = miopen::rms_range(ref_error, error); + EXPECT_TRUE(miopen::range_distance(ref_error) == miopen::range_distance(error)); + EXPECT_TRUE(std::abs(static_cast(error_error)) == 0.0f) << "Error dx is not equal"; + } + GetitemTestCase getitem_config; + + tensor dy; + std::vector> indexs; + tensor dx; + tensor workspace; + tensor error; + + tensor ref_dx; + tensor ref_error; + + miopen::Allocator::ManageDataPtr dy_dev; + std::vector indexs_dev; + miopen::Allocator::ManageDataPtr dx_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + miopen::Allocator::ManageDataPtr error_dev; + + size_t ws_sizeInBytes; + + std::vector dims; + std::vector> slices; + std::vector slices_flat; + uint32_t offset; +}; diff --git a/test/gtest/groupnorm.cpp b/test/gtest/groupnorm.cpp index 62a83e4081..d40f826c0f 100644 --- a/test/gtest/groupnorm.cpp +++ b/test/gtest/groupnorm.cpp @@ -52,12 +52,8 @@ using namespace groupnorm; TEST_P(GroupNormTestFloat, GroupNormTestFw) { - const auto& handle = get_handle(); - - if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || - miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || - miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); diff --git a/test/gtest/layernorm.cpp b/test/gtest/layernorm.cpp index e780ad1648..e1e669c6d3 100644 --- a/test/gtest/layernorm.cpp +++ b/test/gtest/layernorm.cpp @@ -30,8 +30,6 @@ MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) -namespace env = 
miopen::env; - namespace layernorm { std::string GetFloatArg() @@ -66,7 +64,8 @@ TEST_P(LayerNormTestFloat, LayerNormTestFw) if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + (!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float"))) { RunTest(); Verify(); @@ -79,12 +78,12 @@ TEST_P(LayerNormTestFloat, LayerNormTestFw) TEST_P(LayerNormTestHalf, LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); const auto& handle = get_handle(); if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + (!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half"))) { RunTest(); Verify(); @@ -97,12 +96,12 @@ TEST_P(LayerNormTestHalf, LayerNormTestFw) TEST_P(LayerNormTestBFloat16, LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); const auto& handle = get_handle(); if((miopen::StartsWith(handle.GetDeviceName(), "gfx908") || miopen::StartsWith(handle.GetDeviceName(), "gfx90a") || miopen::StartsWith(handle.GetDeviceName(), "gfx94")) && - env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + (!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16"))) { RunTest(); Verify(); diff --git a/test/gtest/layernorm.hpp b/test/gtest/layernorm.hpp index a50fe031de..cd8813e391 100644 --- a/test/gtest/layernorm.hpp +++ b/test/gtest/layernorm.hpp @@ -76,8 +76,9 @@ void cpu_layernorm_forward(tensor input, ref_rstd[o] = static_cast(rstd_v); ford(inner_size)([&](int32_t i) { - float weight_v = mode ? static_cast(weight[i]) : 1; - float bias_v = mode ? 
static_cast(bias[i]) : 0; + float weight_v = + (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 1 : static_cast(weight[i]); + float bias_v = (mode == MIOPEN_ELEMENTWISE_AFFINE) ? 0 : static_cast(bias[i]); ref_output[o * inner_size + i] = static_cast( (static_cast(input[o * inner_size + i]) - mean_v) * rstd_v * weight_v + bias_v); diff --git a/test/gtest/reduceextreme.cpp b/test/gtest/reduceextreme.cpp index 1d6637deb0..670ec24e1b 100644 --- a/test/gtest/reduceextreme.cpp +++ b/test/gtest/reduceextreme.cpp @@ -59,7 +59,8 @@ using namespace reduceextreme; TEST_P(ReduceExtremeTestFloat, ReduceExtremeTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -72,7 +73,8 @@ TEST_P(ReduceExtremeTestFloat, ReduceExtremeTestFw) TEST_P(ReduceExtremeTestHalf, ReduceExtremeTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -85,7 +87,8 @@ TEST_P(ReduceExtremeTestHalf, ReduceExtremeTestFw) TEST_P(ReduceExtremeTestBFloat16, ReduceExtremeTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/sum.cpp b/test/gtest/sum.cpp index 066c609dd5..fb2e7aefa8 100644 --- a/test/gtest/sum.cpp +++ b/test/gtest/sum.cpp @@ -53,7 +53,8 @@ using namespace sum; TEST_P(SumTestFloat, SumTestFw) { - if(env::enabled(MIOPEN_TEST_ALL) && (GetFloatArg() == "--float")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); diff --git a/test/gtest/t5layernorm.cpp b/test/gtest/t5layernorm.cpp index c062f1007e..49b45e88f9 100644 --- 
a/test/gtest/t5layernorm.cpp +++ b/test/gtest/t5layernorm.cpp @@ -71,8 +71,8 @@ using namespace t5layernorm; TEST_P(T5LayerNormTestFloat, T5LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -85,8 +85,8 @@ TEST_P(T5LayerNormTestFloat, T5LayerNormTestFw) TEST_P(T5LayerNormTestHalf, T5LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -99,8 +99,8 @@ TEST_P(T5LayerNormTestHalf, T5LayerNormTestFw) TEST_P(T5LayerNormTestBFloat16, T5LayerNormTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); @@ -113,8 +113,8 @@ TEST_P(T5LayerNormTestBFloat16, T5LayerNormTestFw) TEST_P(T5LayerNormBwdTestFloat, T5LayerNormBwdTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--float") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -127,8 +127,8 @@ TEST_P(T5LayerNormBwdTestFloat, T5LayerNormBwdTestFw) TEST_P(T5LayerNormBwdTestHalf, T5LayerNormBwdTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--half") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -141,8 +141,8 @@ TEST_P(T5LayerNormBwdTestHalf, T5LayerNormBwdTestFw) 
TEST_P(T5LayerNormBwdTestBFloat16, T5LayerNormBwdTestFw) { - auto TypeArg = env::value(MIOPEN_TEST_FLOAT_ARG); - if(env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == "--bfloat16") + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/t5layernorm.hpp b/test/gtest/t5layernorm.hpp index 505336a130..b82b543b2a 100644 --- a/test/gtest/t5layernorm.hpp +++ b/test/gtest/t5layernorm.hpp @@ -409,7 +409,7 @@ struct T5LayerNormBwdTest : public ::testing::TestWithParam std::fill(ref_dw.begin(), ref_dw.end(), std::numeric_limits::quiet_NaN()); std::vector workspace_dims; - printf("GetT5LayerNormBackwardWorkspaceSize\n"); + ws_sizeInBytes = miopen::GetT5LayerNormBackwardWorkspaceSize( handle, dy.desc, x.desc, weight.desc, rstd.desc, dx.desc, dw.desc, ln_mode); if(ws_sizeInBytes == static_cast(-1)) @@ -472,7 +472,7 @@ struct T5LayerNormBwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. if(std::is_same::value) - threshold *= 8.0; + threshold *= 80.0; auto error = miopen::rms_range(ref_dx, dx); EXPECT_TRUE(miopen::range_distance(ref_dx) == miopen::range_distance(dx));