diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp deleted file mode 100644 index 02e675203d..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp +++ /dev/null @@ -1,210 +0,0 @@ -#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP -#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" - -namespace ck { - -// OriginalTensorDesc : ConstantTensorDescriptor_deprecated<...> -// it's the tensor whose dimensions are to be merged -// OriginalDimMergeSeqs : Sequence<...>... -// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged -template -struct ConstantMergedTensorDescriptor_deprecated -{ - using Type = ConstantMergedTensorDescriptor_deprecated; - - static constexpr auto mOriginalDimMergeSeqs = std::tuple{}; - - static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); - static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension(); - - __host__ __device__ constexpr ConstantMergedTensorDescriptor_deprecated() - { - static_assert(nDim <= nOriginalDim, "wrong!"); - - // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most - // OriginalTensorDesc::nDim number of dimensions - - // TODO: check OriginalDimMergeSeqs contains all original dimensions - - // TODO: check there is no duplication in OriginalDimMergeSeqs - } - - __host__ __device__ static constexpr auto GetOriginalTensorDescriptor() - { - return OriginalTensorDesc{}; - } - - __host__ __device__ static constexpr auto GetNumOfDimension() { return Number{}; } - - template - __host__ __device__ static constexpr auto GetContainedOriginalDimensions(Number) - { - return std::get(mOriginalDimMergeSeqs); - } - - template - __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number) - { - return (std::get(mOriginalDimMergeSeqs).GetSize() > 1); - } - - template - __host__ __device__ static constexpr auto GetLength(Number) - { - constexpr auto original_dims_partial = std::get(mOriginalDimMergeSeqs); - - return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize(); - } - - template - __host__ __device__ static constexpr auto GetStride(Number) - { - static_assert(!ContainMultipleOriginalDimensions(Number{}), - "wrong! 
stride of a merged dimension is undefined"); - - constexpr auto idim_original = std::get(mOriginalDimMergeSeqs).Back(); - - return OriginalTensorDesc::GetStride(Number{}); - } - - // this is a hack to return the stride of the last original dimension of a merged dimension - // TODO: refactor this once the concept of "dimension" is used - template - __host__ __device__ static constexpr auto GetLastOriginalDimensionStride(Number) - { - constexpr auto idim_last_original = std::get(mOriginalDimMergeSeqs).Back(); - - return OriginalTensorDesc::GetStride(Number{}); - } - - __host__ __device__ static constexpr auto GetLengths() - { - return Sequence{}; - } - - __host__ __device__ static constexpr auto GetElementSize() - { - return OriginalTensorDesc::GetElementSize(); - } - - template - struct lambda_1_GetOriginalMultiIndexFromMultiIndex - { - const Array& original_multi_id_partial; - Array& original_multi_id; - - __host__ __device__ constexpr lambda_1_GetOriginalMultiIndexFromMultiIndex( - const Array& original_multi_id_partial_, - Array& original_multi_id_) - : original_multi_id_partial(original_multi_id_partial_), - original_multi_id(original_multi_id_) - { - } - - template - __host__ __device__ constexpr void operator()(Number) const - { - constexpr index_t idim_original = OriginalDimsPartial::Get(Number{}); - - index_t itmp = original_multi_id_partial[I]; - - original_multi_id(idim_original) = itmp; - } - }; - - struct lambda_0_GetOriginalMultiIndexFromMultiIndex - { - const Array& multi_id; - Array& original_multi_id; - - __host__ __device__ constexpr lambda_0_GetOriginalMultiIndexFromMultiIndex( - const Array& multi_id_, Array& original_multi_id_) - : multi_id(multi_id_), original_multi_id(original_multi_id_) - { - } - - template - __host__ __device__ constexpr void operator()(Number) const - { - constexpr auto original_dims_partial = std::get(Type::mOriginalDimMergeSeqs); - - // get partial original-multi-id corresponding to this merged dimension - const auto original_multi_id_partial = - OriginalTensorDesc::Extract(original_dims_partial) - .GetMultiIndexFrom1dIndex(multi_id[IDim]); - - static_for<0, original_dims_partial.GetSize(), 1>{}( - lambda_1_GetOriginalMultiIndexFromMultiIndex( - original_multi_id_partial, original_multi_id)); - } - }; - - // return type is Array<...> - __host__ __device__ static constexpr auto - GetOriginalMultiIndexFromMultiIndex(Array multi_id) - { - Array original_multi_id; - - static_for<0, nDim, 1>{}( - lambda_0_GetOriginalMultiIndexFromMultiIndex(multi_id, original_multi_id)); - - return original_multi_id; - } - - template - __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence) - { - constexpr auto multi_id = sequence2array(Sequence{}); - - constexpr auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); - - return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); - } - - __host__ __device__ static constexpr index_t - GetOffsetFromMultiIndex(Array multi_id) - { - auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); - - return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); - } - - template - __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... 
is) - { - return GetOffsetFromMultiIndex(Array{is...}); - } - - __host__ __device__ static constexpr Array GetMultiIndexFrom1dIndex(index_t id) - { - constexpr auto packed_desc = make_ConstantTensorDescriptor_packed(GetLengths()); - - return packed_desc.GetMultiIndexFrom1dIndex(id); - } - - __host__ __device__ static constexpr auto Pack() - { - constexpr auto lengths = GetLengths(); - constexpr auto strides = calculate_tensor_strides_packed(lengths); - return ConstantTensorDescriptor_deprecated{}; - } -}; - -template -__host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, - OriginalDimMergeSeqs...) -{ - return ConstantMergedTensorDescriptor_deprecated{}; -} - -template -__host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDesc) -{ - print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor()); -} - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp deleted file mode 100644 index 494ef1ddd8..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_deprecated.hpp +++ /dev/null @@ -1,348 +0,0 @@ -#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP -#define CK_TENSOR_COORDINATE_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp" - -namespace ck { - -// TensorDesc is ConstantTensorDescriptor_deprecated -template -struct NormalTensorCoordinate_deprecated -{ - using type = NormalTensorCoordinate_deprecated; - using tensor_desc_type = TensorDesc; - - static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); - - __host__ - __device__ constexpr NormalTensorCoordinate_deprecated(Array tensor_index) - : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)} - { - } - - template - __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs) - : NormalTensorCoordinate_deprecated(Array{xs...}) - { - } - - template - __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence) - : NormalTensorCoordinate_deprecated(Array{Xs...}) - { - } - - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } - - // T is Array or Sequence - template - __host__ __device__ type operator+=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); - - return *this; - } - - template - __host__ __device__ type operator-=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes); - - return *this; - } - - template - __host__ __device__ constexpr type operator+(T step_sizes) const - { - type coord = *this; - coord += step_sizes; - return coord; - } - - template - __host__ __device__ constexpr type operator-(T step_sizes) const - { - type coord = *this; - coord -= step_sizes; - return coord; - } - - // reposition point of origin, and return compensated offset. - // This is a hack to reduce index calculation during looping over - // a tensor whose origin is this TensorCoordinate. 
It does so, by spitting - // out the run-time offset to the pointer (to the tensor data) held by this - // TensorCoordiante, so the caller can add the offset into the run-time pointer of - // the data, so only 1 run-time variable (update pointer) is needed, instead - // of 2 run-time variables (old pointer and this offset) - // TODO: after introducing the concept of "run-time tensor view", which contains the - // run-time pointer to the data, always keep track of the pointer, instead of both - // offset and the pointer. This also bring additional benefit that we don't need to - // worry the offset might underflow (because offset is unsigned integer) when updating it. - __host__ __device__ constexpr index_t RepositionOrigin() - { - index_t offset_diff = mOffset; - mOffset = 0; - return offset_diff; - } - -private: - index_t mOffset; -}; - -// TensorDesc is ConstantMergedTensorDescriptor_deprecated -template -struct MergedTensorCoordinate_deprecated -{ - using type = MergedTensorCoordinate_deprecated; - using tensor_desc_type = TensorDesc; - - static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); - static constexpr index_t nOriginalDim = - tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension(); - - __host__ - __device__ constexpr MergedTensorCoordinate_deprecated(Array tensor_index) - : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)} - { - // partial offset on each dimension - static_for<0, nDim, 1>{}([&](auto idim) { - constexpr auto partial_original_dims = - tensor_desc_type::GetContainedOriginalDimensions(idim); - - constexpr auto partial_original_desc = - tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); - - mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mOriginalIndex, partial_original_dims)); - }); - - // complete offset - mOffset = - accumulate_on_array(mPartialOffsets, math::plus{}, static_cast(0)); - } - - template - __host__ __device__ constexpr MergedTensorCoordinate_deprecated(Xs... 
xs) - : MergedTensorCoordinate_deprecated(Array{xs...}) - { - } - - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } - - template - __host__ __device__ void - MoveOnDimension(IDim idim_, T step_size, integral_constant) - { - constexpr auto idim = idim_; - - // if step_size is known at compile time - static_if::value>{}( - [&](auto) { static_if{}([&](auto) { return; }); }); - - // update original index - static_if{}([&](auto) { - constexpr auto partial_original_dims = - tensor_desc_type::GetContainedOriginalDimensions(idim); - - constexpr index_t ndim_partial_original = partial_original_dims.GetSize(); - - constexpr auto partial_original_desc = - tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims); - - const auto partial_original_step_sizes = - partial_original_desc.GetMultiIndexFrom1dIndex(step_size); - - // update partial original multi-id - auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims); - - static_if{}([&](auto) { - partial_original_id += partial_original_step_sizes; - - bool carry = false; - - // do carry check in reversed order, starting from lowest dimension - // don't check the highest dimension - static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { - constexpr index_t i = ndim_partial_original - 1 - IReverse; - - if(carry) - { - ++partial_original_id(i); - } - - carry = false; - - if(partial_original_id[i] >= partial_original_desc.GetLength(i)) - { - partial_original_id(i) -= partial_original_desc.GetLength(i); - carry = true; - } - }); - - // highest dimension - if(carry) - { - ++partial_original_id(0); - } - }).Else([&](auto) { - // shift up multi-id to avoid unsigned integer underflow during intermediate - // calculations. After the shift, should have new_multi_id[...] >= 1 - partial_original_id += - partial_original_desc.GetLengths() - partial_original_step_sizes; - - bool borrow = false; - - // do borrow check in reversed order, starting from lowest dimension - // don't check the highest dimension - static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) { - constexpr index_t i = ndim_partial_original - 1 - IReverse; - - if(borrow) - { - --partial_original_id(i); - } - - borrow = false; - - if(partial_original_id[i] < partial_original_desc.GetLength(i)) - { - partial_original_id(i) += partial_original_desc.GetLength(i); - borrow = true; - } - }); - - // highest dimension - if(borrow) - { - --partial_original_id(0); - } - - // shift back down multi-id - // here, should have new_multi_id[...] 
>= GetLengths() - partial_original_id = partial_original_id - partial_original_desc.GetLengths(); - }); - - // update "mOriginalIndex" - static_for<0, ndim_partial_original, 1>{}([&](auto I) { - constexpr auto idim_original = partial_original_dims[I]; - - mOriginalIndex(idim_original) = partial_original_id[I]; - }); - - // calculate new partial offset on this merged dimension - const index_t old_partial_offset = mPartialOffsets[idim]; - - mPartialOffsets(idim) = - partial_original_desc.GetOffsetFromMultiIndex(partial_original_id); - - // update "mThreadSrcOffset", do "+" before "-" to avoid underflow - mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset; - }).Else([&](auto fwd) { - static_if{}([&](auto) { - mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim); - }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); }); - }); - } - - // T is Array or Sequence - template - __host__ __device__ type operator+=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - static_for<0, nDim, 1>{}([&](auto idim) { - // compiler should remove dead code path, because step_sizes is known at - // compile time - if(step_sizes[idim] != 0) - { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); - } - }); - - return *this; - } - - template - __host__ __device__ type operator-=(T step_sizes) - { - static_assert(is_same{} && T::GetSize() == nDim, "wrong!"); - - static_for<0, nDim, 1>{}([&](auto idim) { - // compiler should remove dead code path, because step_sizes is known at - // compile time - if(step_sizes[idim] != 0) - { - this->MoveOnDimension(idim, step_sizes[idim], integral_constant{}); - } - }); - - return *this; - } - - template - __host__ __device__ constexpr type operator+(T step_sizes) const - { - type coord = *this; - coord += step_sizes; - return coord; - } - - template - __host__ __device__ constexpr type operator-(T step_sizes) const - { - type coord = *this; - coord -= step_sizes; - return coord; - } - - __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; } - -private: - // Allocate register memory for all merged dimensions and normal dimensions. - // However, only those merged dimensions, whose index will be involved in arithmetic - // after the construction of this TensorCoordinate (e.g. when user move a slicing - // window on the merged dimension), will use these register memory. - // Let's hope compiler will optimize away those register memory allocated for normal - // dimensions, and those merged dimensions, that would never be involved in index - // arithmetic after construction of TensorCoordinate. 
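// A minimal standalone sketch of the positive-direction carry propagation that
// MoveOnDimension performs above, assuming each component of idx and step is
// already within [0, len[i]); the function name and the use of std::array are
// illustrative only and not part of this header.
#include <array>
#include <cstddef>

template <std::size_t N>
void move_on_merged_dimension(std::array<int, N>& idx,
                              const std::array<int, N>& step,
                              const std::array<int, N>& len)
{
    for(std::size_t i = 0; i < N; ++i)
        idx[i] += step[i]; // may temporarily reach len[i] or more

    bool carry = false;
    for(std::size_t r = 0; r + 1 < N; ++r) // lowest dimension first, skip the highest
    {
        const std::size_t i = N - 1 - r;
        if(carry)
            ++idx[i];
        carry = false;
        if(idx[i] >= len[i])
        {
            idx[i] -= len[i];
            carry = true;
        }
    }
    if(carry)
        ++idx[0]; // the highest dimension absorbs the final carry and never wraps
}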
- // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions" - // and simplify implementation of ConstantMergedTensorDescriptor_deprecated, so we don't need to - // count on compiler to optimize away those register memory for us - Array mOriginalIndex; - Array mPartialOffsets; - - // complete offset - index_t mOffset; -}; - -template -struct TensorCoordinate_deprecated -{ -private: - template - __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor_deprecated) - { - return NormalTensorCoordinate_deprecated>(); - } - - template - __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor_deprecated) - { - return MergedTensorCoordinate_deprecated< - ConstantMergedTensorDescriptor_deprecated>(); - } - -public: - using type = decltype(MakeDummyTensorCoordinate(TensorDesc{})); -}; - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_helper.hpp b/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_helper.hpp deleted file mode 100644 index 2cacb329cb..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_description/static_kernel_tensor_coordinate_helper.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef CK_TENSOR_COORDINATE_HELPER_HPP -#define CK_TENSOR_COORDINATE_HELPER_HPP - -#include "tensor_coordiante_hpp" - -namespace ck { - -template -__host__ __device__ constexpr auto -make_tensor_coordinate(TensorDesc, MultiIndex idx) -{ - return typename TensorCoordinate::type(idx); -} - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_blockwise_generic_tensor_slice_copy_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_blockwise_generic_tensor_slice_copy_deprecated.hpp deleted file mode 100644 index 806a38a0c7..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_blockwise_generic_tensor_slice_copy_deprecated.hpp +++ /dev/null @@ -1,613 +0,0 @@ -#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP -#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp" -#include "static_kernel_tensor_coordinate_deprecated.hpp" -#include "static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp" - -namespace ck { - -// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor -// memory layout (ordering of dimensions) can be different between src and dst. -// This functions assume each thread is reading and writing a normal (not merged) tensor, -// to simplify index calculations. To satisfy this assumption, the user need to make sure -// that, on a merged dimension that constains multiple original dimensions, the length of -// the last original dimension need to be evenly dividable by its sub-lengths. Also, the -// repeat-length on the merged dimension need to be 1. 
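// A compile-time sketch of the constraint just stated, with made-up numbers: the
// last original dimension contained in a merged dimension must be evenly divisible
// by the per-thread sub-length, and the repeat length on that merged dimension
// must be 1, so each thread effectively copies a normal (non-merged) tensor.
constexpr int last_original_length = 8; // length of the last original dim in the merge
constexpr int sub_length           = 2; // per-thread sub-length on the merged dim
constexpr int repeat_length        = 1; // SliceLength / (SubLength * ClusterLength)

static_assert(last_original_length % sub_length == 0,
              "per-thread window would straddle original dimensions");
static_assert(repeat_length == 1,
              "repeats on a merged dimension are not supported");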
These sanity checks are performed -// in constructor of BlockwiseGenericTensorSliceCopy_v1_deprecated -template -struct BlockwiseGenericTensorSliceCopy_v1_deprecated -{ - static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - static constexpr index_t nOriginalDimSrc = - SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); - static constexpr index_t nOriginalDimDst = - DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension(); - - // per-thread offset - index_t mThreadSrcOffset; - index_t mThreadDstOffset; - - // "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId", - // "mThreadDstPartialOffsets" are always calculated inside constructor, and would be - // updated if slicing-window is moved. However, they will not be used if you always move - // the slicing-window along a non-merged dimension. In that case, compiler should be - // able to remove these calculation. - // TODO: make sure compiler would actually remove them in that case - - // partial offset in each (merged) dimension - Array mThreadSrcPartialOffsets; - Array mThreadDstPartialOffsets; - - // multi-id of original tensor - Array mThreadSrcOriginalMultiId; - Array mThreadDstOriginalMultiId; - - __device__ - BlockwiseGenericTensorSliceCopy_v1_deprecated(Array src_block_data_id_begin, - Array dst_block_data_id_begin) - { - // check NDim consistency - static_assert( - nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && - nDim == ThreadClusterLengths::GetSize() && - nDim == ThreadClusterArrangeOrder::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), - "wrong"); - - // check thread arrange order and read/write access order are valid - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong!"); - - // thread cluster - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( - ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); - - // BlockSize - static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize"); - - // divide work - constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{}; - - static_for<0, nDim, 1>{}([&](auto IDim) { - static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0, - "wrong! cannot evenly divide sliced tensor into cluster"); - }); - - constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims; - - // additional check for merged dimension - static_for<0, nDim, 1>{}([&](auto IDim_) { - // src - static_if{}([&](auto) { - constexpr auto IDim = decltype(IDim_){}; - - // on a merged dimension that constains multiple original dimensions, - // the length of the last original dimension need to evenly dividable by its - // sub-length, - // so each thread is effectively reading a normal (not merged) tensor - constexpr auto idim_last_original_src = - SrcDesc::GetContainedOriginalDimensions(IDim).Back(); - static_assert( - SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) % - SubLengths::Get(IDim) == - 0, - "wrong!"); - - // merged dimension should have repeat_lengths = 1 - static_assert(repeat_lengths[IDim] == 1, - "wrong! 
repeat_lengths shoud be 1 on merged dimension"); - }); - - // dst - static_if{}([&](auto) { - constexpr auto IDim = decltype(IDim_){}; - - // on a merged dimension that constains multiple original dimensions, - // the length of the last original dimension need to evenly dividable by its - // sub-length, - // so each thread is effectively reading a normal (not merged) tensor - constexpr auto idim_last_original_dst = - DstDesc::GetContainedOriginalDimensions(IDim).Back(); - static_assert( - DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) % - SubLengths::Get(IDim) == - 0, - "wrong!"); - - // merged dimension should have repeat_lengths = 1 - static_assert(repeat_lengths[IDim] == 1, - "wrong! repeat_lengths shoud be 1 on merged dimension"); - }); - }); - - // calculate mThreadSrcOffset, mThreadDstOffset - const auto thread_cluster_id = - thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - - const auto data_cluster_id = - reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); - - const auto thread_data_id_begin = data_cluster_id * SubLengths{}; - - // original multi-id - mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex( - src_block_data_id_begin + thread_data_id_begin); - - mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex( - dst_block_data_id_begin + thread_data_id_begin); - - // partial offset on each dimension - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto src_partial_original_dims = - SrcDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto src_partial_original_desc = - SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - - mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims)); - }); - - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr auto dst_partial_original_dims = - DstDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto dst_partial_original_desc = - DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims); - - mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex( - extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims)); - }); - - // complete offset - mThreadSrcOffset = accumulate_on_array( - mThreadSrcPartialOffsets, math::plus{}, static_cast(0)); - - mThreadDstOffset = accumulate_on_array( - mThreadDstPartialOffsets, math::plus{}, static_cast(0)); - } - - __device__ static constexpr auto GetRegisterBufferDescriptor() - { - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); - } - - __device__ static constexpr index_t GetThreadBufferSize() - { - return GetRegisterBufferDescriptor().GetElementSpace(); - } - - template - __device__ void RunLoadThreadBuffer(const TData* __restrict__ p_src, - TData* __restrict__ p_buffer) const - { - constexpr auto thread_sub_tensor_lengths = SubLengths{}; - - constexpr auto data_per_cluster_per_dims = - thread_sub_tensor_lengths * ThreadClusterLengths{}; - - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - static_ford{}([&](auto repeat_id) { - constexpr auto src_thread_data_id_begin = repeat_id * 
data_per_cluster_per_dims; - - constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - constexpr index_t src_offset = - SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); - - constexpr index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); -#else - ford{}([&](auto repeat_id) { - const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims; - - const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin); - - const index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); -#endif - - // By position the origin of the per-thread window at the point, where multi-index - // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy - // is assuming each thread is copy a noraml (not merged) tensor. - // To satisfy this assumption, the user need to make sure that, on a merged dimension - // that constains multiple original dimensions, the length of the last original - // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on - // the merged dimension need to be 1. These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1_deprecated - ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), - make_zero_array()) - .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset); - }); - } - - template - __device__ void RunStoreThreadBuffer(const TData* __restrict__ p_buffer, - TData* __restrict__ p_dst) const - { - constexpr auto thread_sub_tensor_lengths = SubLengths{}; - - constexpr auto data_per_cluster_per_dims = - thread_sub_tensor_lengths * ThreadClusterLengths{}; - - constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{}); - - constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor(); - -#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 - static_ford{}([&](auto repeat_id) { - constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - constexpr auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; - - constexpr index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); - - constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); -#else - ford{}([&](auto repeat_id) { - const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths; - - const auto dst_data_id_begin = repeat_id * data_per_cluster_per_dims; - - const index_t buffer_offset = - thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin); - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin); -#endif - - // By position the origin of the per-thread window at the point, where multi-index - // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy - // is assuming each thread is copy a noraml (not merged) tensor. - // To satisfy this assumption, the user need to make sure that, on a merged dimension - // that constains multiple original dimensions, the length of the last original - // dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on - // the merged dimension need to be 1. 
These sanity checks are performed in constructor - // of BlockwiseGenericTensorSliceCopy_v1_deprecated - ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( - make_zero_array(), make_zero_array()) - .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset); - }); - } - - template - __device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const - { - TData p_buffer[GetThreadBufferSize()]; - - RunLoadThreadBuffer(p_src, p_buffer); - RunStoreThreadBuffer(p_buffer, p_dst); - } - - // When moving the slicing windows along a merged dimension, if the strides of the - // contained (by the merged dimension) original dimensions are not in descending order, - // then there is no guarantee that the new offset will be larger than the old offset - // for movement in positive direction (vice versue for movement in negative direction). - // As a result, there is the possiblity that the offset calculation may result in - // unsigned integer underflow (due to "-" operation). However, this hazard should not - // happen, as long as the users make sure the slicing window would not be moved out of - // the boundary of the tensor being sliced. This functions doesn't do runtime sanity - // check on out-of-bound slicing window, for performance reason - template - __device__ void MoveSlicingWindowOnSourceTensor( - Number, Number, integral_constant direction) - { - constexpr auto IDim = Number{}; - - static_if{}([&](auto) { - // logic for a merged dimension, also works for non-merged dimension, but its logic may - // be unncessarily complicated for compiler to remove calculations that are useless for - // a non-merged dimension - - // extract partial original dimensions - constexpr auto src_partial_original_dims = - SrcDesc::GetContainedOriginalDimensions(IDim); - - constexpr auto src_partial_original_desc = - SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - - // calculate new partial original multi-id - auto old_src_partial_original_id = - extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims); - - auto new_src_partial_original_id = - src_partial_original_desc.UpdateMultiIndexGivenStepSizeOf1dIndex( - old_src_partial_original_id, StepSize, direction); - - // update "mThreadSrcOriginalMultiId" - static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) { - constexpr auto IDimOriginal = src_partial_original_dims[I]; - - mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_id[I]; - }); - - // calculate new partial offset on this merged dimension - const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim]; - - const index_t new_src_partial_offset = - src_partial_original_desc.GetOffsetFromMultiIndex(new_src_partial_original_id); - - // update "mThreadSrcPartialOffsets" - mThreadSrcPartialOffsets(IDim) = new_src_partial_offset; - - // update "mThreadSrcOffset", do "+" before "-" to avoid underflow - mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset; - }).Else([&](auto) { - // Logic for non-merged dimension. If you are never going to move the slicing window on - // a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets", - // which are being calculated here, will never be used later. In this case, compiler - // should be able to remove these calculations. - // TODO: make sure compiler would actually remove them in this case. 
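// A minimal sketch of the partial-offset bookkeeping used above, with made-up
// values: the thread offset is the sum of per-dimension partial offsets, so a
// move on one (merged) dimension replaces only that dimension's partial, and
// doing the "+" before the "-" keeps the unsigned intermediate from wrapping.
#include <cassert>
#include <cstdint>

int main()
{
    uint32_t partial[2] = {12, 7}; // per-dimension partial offsets
    uint32_t offset     = partial[0] + partial[1];

    const uint32_t new_partial0 = 4; // dimension 0 moved to a smaller partial offset
    offset     = (offset + new_partial0) - partial[0];
    partial[0] = new_partial0;

    assert(offset == partial[0] + partial[1]);
    return 0;
}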
- - // It is the user's responsiblity to make sure the slicing window will not be moved out - // of the boundary of the tensor being sliced. Otherwise, there might be hazard like - // unsigned integer underflow. That is NO runtime sanity check to prevent the hazard - - constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front(); - - static_if{}([&](auto fwd) { - mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim); - - mThreadSrcOriginalMultiId(IDimOriginal) += StepSize; - - mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim); - }).Else([&](auto fwd) { - mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - - mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize; - - mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - }); - }); - } - - template - __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) - { - static_for<0, nDim, 1>{}([&](auto idim) { - if(step_sizes[idim] != 0) - { - MoveSlicingWindowOnSourceTensor(idim, step_sizes[idim], positive_direction); - } - }); - } -}; - -// This version use TensorCoordiante -// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor -// memory layout (ordering of dimensions) can be different between src and dst. -template -struct BlockwiseGenericTensorSliceCopy_v2_deprecated -{ - static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); - - using Index = MultiIndex; - - __device__ constexpr BlockwiseGenericTensorSliceCopy_v2_deprecated( - const Index& src_block_slice_origin, const Index& dst_block_slice_origin) - { - static_assert( - nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() && - nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() && - nDim == ThreadClusterLengths::GetSize() && - nDim == ThreadClusterArrangeOrder::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(), - "wrong! nDim not consistent"); - - static_assert(is_same{}, - "wrong! threads should be mapped to cover entire slicing window"); - - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( - ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); - - static_assert(BlockSize == thread_cluster_desc.GetElementSize(), - "wrong! 
BlockSize not consistent with ThreadClusterLengths"); - - const auto thread_cluster_id = - thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - - const auto data_cluster_id = - reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); - - const auto thread_data_id_begin = data_cluster_id * SubLengths{}; - - mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); - mThreadwiseLoad.SetDstSliceOrigin(make_zero_array()); - - mThreadwiseStore.SetSrcSliceOrigin(make_zero_array()); - mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); - } - - __device__ static constexpr index_t GetThreadBufferSize() - { - return ThreadBufferDesc::GetElementSpace(); - } - - template - __device__ void - RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer, - integral_constant, - integral_constant) const - { - constexpr auto block_src_address_space = - integral_constant{}; - constexpr auto thread_buffer_address_space = - integral_constant{}; - - mThreadwiseLoad.Run( - p_block_src, p_thread_buffer, block_src_address_space, thread_buffer_address_space); - } - - template - __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, - ThreadBufferData* p_thread_buffer) const - { - constexpr auto generic_address_space = - integral_constant{}; - - RunLoadThreadBuffer( - p_block_src, p_thread_buffer, generic_address_space, generic_address_space); - } - - template - __device__ void - RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst, - integral_constant, - integral_constant) const - { - constexpr auto thread_buffer_address_space = - integral_constant{}; - constexpr auto block_dst_address_space = - integral_constant{}; - - mThreadwiseStore.Run( - p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space); - } - - template - __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, - BlockDstData* p_block_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst, generic_address_space, generic_address_space); - } - - template - __device__ void - Run(const BlockSrcData* p_block_src, - BlockDstData* p_block_dst, - integral_constant block_src_address_space, - integral_constant block_dst_address_space) const - { - BlockSrcData p_thread_buffer[GetThreadBufferSize()]; - - constexpr auto generic_address_space = - integral_constant{}; - - RunLoadThreadBuffer( - p_block_src, p_thread_buffer, block_src_address_space, generic_address_space); - - // if there is type conversion, it's done during store - RunStoreThreadBuffer( - p_thread_buffer, p_block_dst, generic_address_space, block_dst_address_space); - } - - template - __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - Run(p_block_src, p_block_dst, generic_address_space, generic_address_space); - } - - template - __device__ void - MoveSrcSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction); - } - - template - __device__ void - MoveDstSliceWindow(T step_sizes, integral_constant positive_direction) - { - mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction); - } - -private: - using ThreadBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{})); - - using ThreadwiseLoad 
= ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; - - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2r1_deprecated; - - ThreadwiseLoad mThreadwiseLoad; - ThreadwiseStore mThreadwiseStore; -}; - -} // namespace ck - -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_gridwise_gemm_xdlops.hpp b/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_gridwise_gemm_xdlops.hpp deleted file mode 100644 index 44652321f7..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_gridwise_gemm_xdlops.hpp +++ /dev/null @@ -1,650 +0,0 @@ -#ifndef CK_GRIDWISE_GEMM_XDLOPS_HPP -#define CK_GRIDWISE_GEMM_XDLOPS_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_tensor_descriptor.hpp" -#include "static_kernel_tensor_descriptor_helper.hpp" -#include "static_kernel_ConstantMatrixDescriptor.hpp" -#include "static_kernel_blockwise_generic_tensor_slice_copy.hpp" -#include "static_kernel_threadwise_generic_tensor_slice_copy.hpp" -#include "static_kernel_blockwise_gemm_xdlops.hpp" - -namespace ck { - -template -struct GridwiseGemmTransposedANormalBNormalCXdlops_v1 -{ - __device__ void Run(const Float* const __restrict__ p_a_global, - const Float* const __restrict__ p_b_global, - Float* const __restrict__ p_c_global) const - { - - constexpr auto True = integral_constant{}; - - constexpr auto a_k_m_global_desc = AGlobalDesc{}; - constexpr auto b_k_n_global_desc = BGlobalDesc{}; - constexpr auto c_m_n_global_desc = CGlobalDesc{}; - - constexpr auto K = b_k_n_global_desc.GetLengths()[0]; - constexpr auto N = b_k_n_global_desc.GetLengths()[1]; - constexpr auto M = a_k_m_global_desc.GetLengths()[1]; - - // divide block work by [M, N] - static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0, - "wrong! cannot divide work evenly among block"); - - constexpr index_t MBlockWork = M / MPerBlock; - constexpr index_t NBlockWork = N / NPerBlock; - - static_assert(MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0, - "wrong! 
M/NPerBlock % M/NPerWave != 0"); - - constexpr index_t MWaves = MPerBlock / MPerWave; - constexpr index_t NWaves = NPerBlock / NPerWave; - - constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); - - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - - const index_t m_block_data_on_global = block_work_id[0] * MPerBlock; - const index_t n_block_data_on_global = block_work_id[1] * NPerBlock; - - // LDS mem - constexpr index_t max_align = math::lcm(BBlockCopyDstDataPerWrite_N, - ABlockCopyDstDataPerWrite_M, - GemmDataPerReadM, - GemmDataPerReadN); - - // LDS - // be careful of LDS alignment - constexpr auto a_k_m_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, Number{}); - - auto a_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4({0, m_block_data_on_global}, - {0, 0}); - - constexpr auto b_k_n_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, Number{}); - - auto b_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4({0, n_block_data_on_global}, - {0, 0}); - - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[KPerBlock, MPerBlock] is in LDS - // b_mtx[EPerBlocl, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in - // register - constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(a_k_m_block_desc); - constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(b_k_n_block_desc); - - const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops< - BlockSize, - decltype(a_k_m_block_mtx_desc), - decltype(b_k_n_block_mtx_desc), - Float, - MPerWave, - NPerWave, - MWaves, - NWaves, - GemmDataPerReadM, - GemmDataPerReadN>{}; - - constexpr index_t a_block_space = - math::integer_least_multiple(a_k_m_block_desc.GetElementSpace(), max_align); - - constexpr index_t b_block_space = - math::integer_least_multiple(b_k_n_block_desc.GetElementSpace(), max_align); - - __shared__ Float p_a_block_double[2 * a_block_space]; - __shared__ Float p_b_block_double[2 * b_block_space]; - - // get zero-initialized output register of vector type - auto c_thread_vec = blockwise_gemm.CreateOutputVecZero(); - - // LDS double buffer: preload data into LDS - { - a_blockwise_copy.Run(p_a_global, p_a_block_double); - b_blockwise_copy.Run(p_b_global, p_b_block_double); - } - - using b_blockwise_copy_src_step = Sequence; - using a_blockwise_copy_src_step = Sequence; - - // LDS double buffer: main body - for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K; - k_block_data_begin += 2 * KPerBlock) - { -#pragma unroll - for(index_t iloop = 0; iloop < 2; ++iloop) - { - const bool even_loop = (iloop % 2 == 0); - - Float* p_a_block_now = - even_loop ? p_a_block_double : p_a_block_double + a_block_space; - Float* p_b_block_now = - even_loop ? p_b_block_double : p_b_block_double + b_block_space; - - Float* p_a_block_next = - even_loop ? p_a_block_double + a_block_space : p_a_block_double; - Float* p_b_block_next = - even_loop ? 
p_b_block_double + b_block_space : p_b_block_double; - - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_now, p_b_block_now, c_thread_vec); - - // LDS double buffer: store next data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next); - } - } - - // LDS double buffer: tail - { - constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0); - - if(has_two_iteration_left) // if has 2 iteration left - { - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS double buffer: load last data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on 2nd-last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - - // LDS double buffer: store last data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, - p_a_block_double + a_block_space); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, - p_b_block_double + b_block_space); - - __syncthreads(); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_double + a_block_space, - p_b_block_double + b_block_space, - c_thread_vec); - } - else // if has 1 iteration left - { - __syncthreads(); - - // LDS double buffer: GEMM on last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - } - } - - // copy output: register to global memory - { - ///\todo inconsistent layout of xdlops and tensor - // xdlops layout - // M1 = num_groups; - // M0 = group_size; - // N1 = num_blks_per_wave; - // N0 = num_threads_per_blks; - constexpr auto CLayout = blockwise_gemm.GetOutputLayout(); - constexpr index_t M0 = CLayout.M1(); - constexpr index_t M1 = CLayout.N1(); - constexpr index_t M2 = CLayout.M0(); - - constexpr auto c_m0_m1_m2_n_global_desc = transform_tensor_descriptor( - c_m_n_global_desc, - make_tuple(UnMerge>{}, PassThrough{}), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{})); - - // src descriptor - constexpr auto c_m0_m1_m2_n_thread_desc = - make_native_tensor_descriptor_packed(Sequence{}); - - using CThreadCopySliceLengths = Sequence; - - constexpr index_t BlkSize = blockwise_gemm.GetBlkSize(); - constexpr index_t NumBlks = blockwise_gemm.GetNumBlks(); - - for(index_t i = 0; i < NumBlks; ++i) - { - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = blockwise_gemm.GetBeginOfThreadMatrixC(i); - - const index_t m_thread_data_on_global = - m_block_data_on_global 
+ c_thread_mtx_on_block.row; - - const index_t n_thread_data_on_global = - n_block_data_on_global + c_thread_mtx_on_block.col; - - ThreadwiseGenericTensorSliceCopy_v4r2::type, - 3, - 1, - 1, - AddressSpace::Vgpr, - AddressSpace::Global, - CGlobalMemoryDataOperation>( - {0, 0, 0, 0}, - {m_thread_data_on_global / (M2 * M1), - m_thread_data_on_global % (M2 * M1) / M2, - m_thread_data_on_global % M2, - n_thread_data_on_global}) - .Run(c_thread_vec.n + i * BlkSize, p_c_global); - } - } - } -}; - -template -struct GridwiseBatchedGemmTransposedANormalBNormalCXdlops_v1 -{ - __device__ void Run(const Float* const __restrict__ p_a_global, - const Float* const __restrict__ p_b_global, - Float* const __restrict__ p_c_global) const - { - - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - constexpr auto True = integral_constant{}; - - constexpr auto a_g_k_m_global_desc = AGlobalDesc{}; - constexpr auto b_g_k_n_global_desc = BGlobalDesc{}; - constexpr auto c_g_m_n_global_desc = CGlobalDesc{}; - - constexpr auto G = b_g_k_n_global_desc.GetLengths()[0]; - - constexpr auto K = b_g_k_n_global_desc.GetLengths()[1]; - constexpr auto N = b_g_k_n_global_desc.GetLengths()[2]; - constexpr auto M = a_g_k_m_global_desc.GetLengths()[2]; - - // divide block work by [M, N] - static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0, - "wrong! cannot divide work evenly among block"); - - constexpr index_t MBlockWork = M / MPerBlock; - constexpr index_t NBlockWork = N / NPerBlock; - - static_assert(MPerBlock % MPerWave == 0 && NPerBlock % NPerWave == 0, - "wrong! M/NPerBlock % M/NPerWave != 0"); - - constexpr index_t MWaves = MPerBlock / MPerWave; - constexpr index_t NWaves = NPerBlock / NPerWave; - - constexpr auto block_work_desc = - make_cluster_descriptor(Sequence{}); - - const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id()); - - const index_t group_id = block_work_id[0]; - const index_t m_block_data_on_global = block_work_id[1] * MPerBlock; - const index_t n_block_data_on_global = block_work_id[2] * NPerBlock; - - // LDS mem - constexpr index_t max_align = math::lcm(BBlockCopyDstDataPerWrite_N, - ABlockCopyDstDataPerWrite_M, - GemmDataPerReadM, - GemmDataPerReadN); - - // LDS - // be careful of LDS alignment - constexpr auto a_g_k_m_block_desc = make_native_tensor_descriptor_aligned( - Sequence<1, KPerBlock, MPerBlock>{}, Number{}); - - auto a_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4( - {group_id, 0, m_block_data_on_global}, {0, 0, 0}); - - constexpr auto b_g_k_n_block_desc = make_native_tensor_descriptor_aligned( - Sequence<1, KPerBlock, NPerBlock>{}, Number{}); - - auto b_blockwise_copy = - BlockwiseGenericTensorSliceCopy_v4( - {group_id, 0, n_block_data_on_global}, {0, 0, 0}); - - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[KPerBlock, MPerBlock] is in LDS - // b_mtx[EPerBlocl, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in - // register - constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor_packed( - a_g_k_m_block_desc.GetLength(I1), a_g_k_m_block_desc.GetLength(I2)); - constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor_packed( - b_g_k_n_block_desc.GetLength(I1), b_g_k_n_block_desc.GetLength(I2)); - - const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_xdlops< - BlockSize, - decltype(a_k_m_block_mtx_desc), - decltype(b_k_n_block_mtx_desc), - Float, - MPerWave, - NPerWave, - MWaves, - NWaves, - 
GemmDataPerReadM, - GemmDataPerReadN>{}; - - constexpr index_t a_block_space = - math::integer_least_multiple(a_g_k_m_block_desc.GetElementSpace(), max_align); - - constexpr index_t b_block_space = - math::integer_least_multiple(b_g_k_n_block_desc.GetElementSpace(), max_align); - - __shared__ Float p_a_block_double[2 * a_block_space]; - __shared__ Float p_b_block_double[2 * b_block_space]; - - // get zero-initialized output register of vector type - auto c_thread_vec = blockwise_gemm.CreateOutputVecZero(); - - // LDS double buffer: preload data into LDS - { - a_blockwise_copy.Run(p_a_global, p_a_block_double); - b_blockwise_copy.Run(p_b_global, p_b_block_double); - } - - using b_blockwise_copy_src_step = Sequence<0, KPerBlock, 0>; - using a_blockwise_copy_src_step = Sequence<0, KPerBlock, 0>; - - // LDS double buffer: main body - for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K; - k_block_data_begin += 2 * KPerBlock) - { -#pragma unroll - for(index_t iloop = 0; iloop < 2; ++iloop) - { - const bool even_loop = (iloop % 2 == 0); - - Float* p_a_block_now = - even_loop ? p_a_block_double : p_a_block_double + a_block_space; - Float* p_b_block_now = - even_loop ? p_b_block_double : p_b_block_double + b_block_space; - - Float* p_a_block_next = - even_loop ? p_a_block_double + a_block_space : p_a_block_double; - Float* p_b_block_next = - even_loop ? p_b_block_double + b_block_space : p_b_block_double; - - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS doubel buffer: load next data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_now, p_b_block_now, c_thread_vec); - - // LDS double buffer: store next data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next); - } - } - - // LDS double buffer: tail - { - constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0); - - if(has_two_iteration_left) // if has 2 iteration left - { - Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()]; - Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()]; - - a_blockwise_copy.MoveSrcSliceWindow(a_blockwise_copy_src_step{}, True); - b_blockwise_copy.MoveSrcSliceWindow(b_blockwise_copy_src_step{}, True); - - __syncthreads(); - - // LDS double buffer: load last data from device mem - a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); - b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer); - - // LDS double buffer: GEMM on 2nd-last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - - // LDS double buffer: store last data to LDS - a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, - p_a_block_double + a_block_space); - b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, - p_b_block_double + b_block_space); - - __syncthreads(); - - // LDS double buffer: GEMM on current data - c_thread_vec = blockwise_gemm.Run(p_a_block_double + a_block_space, - p_b_block_double + b_block_space, - c_thread_vec); - } - else // if has 1 
iteration left - { - __syncthreads(); - - // LDS double buffer: GEMM on last data - c_thread_vec = blockwise_gemm.Run(p_a_block_double, p_b_block_double, c_thread_vec); - } - } - - // copy output: register to global memory - { - ///\todo inconsistent layout of xdlops and tensor - // xdlops layout - // M1 = num_groups; - // M0 = group_size; - // N1 = num_blks_per_wave; - // N0 = num_threads_per_blks; - constexpr auto CLayout = blockwise_gemm.GetOutputLayout(); - constexpr index_t M0 = CLayout.M1(); - constexpr index_t M1 = CLayout.N1(); - constexpr index_t M2 = CLayout.M0(); - - constexpr auto c_g_m0_m1_m2_n_global_desc = transform_tensor_descriptor( - c_g_m_n_global_desc, - make_tuple(PassThrough{}, UnMerge>{}, PassThrough{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - - // src descriptor - constexpr auto c_g_m0_m1_m2_n_thread_desc = - make_native_tensor_descriptor_packed(Sequence<1, M0, 1, M2, 1>{}); - - using CThreadCopySliceLengths = Sequence<1, M0, 1, M2, 1>; - - constexpr index_t BlkSize = CLayout.GetBlkSize(); - constexpr index_t NumBlks = CLayout.GetNumBlks(); - - for(index_t i = 0; i < NumBlks; ++i) - { - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = blockwise_gemm.GetBeginOfThreadMatrixC(i); - - const index_t m_thread_data_on_global = - m_block_data_on_global + c_thread_mtx_on_block.row; - - const index_t n_thread_data_on_global = - n_block_data_on_global + c_thread_mtx_on_block.col; - - ThreadwiseGenericTensorSliceCopy_v4r2::type, - 4, - 1, - 1, - AddressSpace::Vgpr, - AddressSpace::Global, - CGlobalMemoryDataOperation>( - {0, 0, 0, 0, 0}, - {group_id, - m_thread_data_on_global / (M2 * M1), - m_thread_data_on_global % (M2 * M1) / M2, - m_thread_data_on_global % M2, - n_thread_data_on_global}) - .Run(c_thread_vec.n + i * BlkSize, p_c_global); - } - } - } -}; - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp b/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp deleted file mode 100644 index cf9a8bcbc7..0000000000 --- a/src/kernels/static_composable_kernel/include/tensor_operation/static_kernel_threadwise_generic_tensor_slice_copy_deprecated.hpp +++ /dev/null @@ -1,443 +0,0 @@ -#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP -#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP - -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "static_kernel_ConstantMergedTensorDescriptor_deprecated.hpp" -#include "static_kernel_tensor_coordinate_deprecated.hpp" - -namespace ck { - -// This threadwise copy allow vector access of src and dst. -// It allows the vector size to be different on src and dst. -// The dimensions of vector access should be the same on src and dst. -// The dimension access order should be the same on src and dst. 
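// A sketch of how the differing access widths are reconciled below, with made-up
// widths: src and dst are tiled by a "long vector" of lcm(SrcDataPerAccess,
// DstDataPerAccess) elements, loaded in src-sized pieces, converted, and stored
// in dst-sized pieces.
#include <numeric>

constexpr int src_data_per_access = 4; // e.g. 4-wide vector loads
constexpr int dst_data_per_access = 2; // e.g. 2-wide vector stores
constexpr int long_vector_size =
    std::lcm(src_data_per_access, dst_data_per_access); // = 4

constexpr int src_accesses_per_long_vector = long_vector_size / src_data_per_access; // 1
constexpr int dst_accesses_per_long_vector = long_vector_size / dst_data_per_access; // 2

static_assert(long_vector_size % src_data_per_access == 0 &&
                  long_vector_size % dst_data_per_access == 0,
              "the long vector must tile both access widths exactly");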
-// It is designed for cases, where one of src and dst is register, and -// the other is device memory or LDS -template -struct ThreadwiseGenericTensorSliceCopy_v1r2_deprecated -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated( - Array src_slice_origin, Array dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == DimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value, "wrong! map is not valid"); - - static_assert( - SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto vector_access_dim = Number{}; - - static_if{}([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }).Else([&](auto fwd) { - static_assert((fwd(SrcDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }).Else([&](auto fwd) { - static_assert((fwd(DstDesc{}).GetLastOriginalDimensionStride(vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r2_deprecated() - : ThreadwiseGenericTensorSliceCopy_v1r2_deprecated(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(Array src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(Array dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const - { - using src_vector_t = typename vector_type::MemoryType; - using dst_vector_t = typename vector_type::MemoryType; - - constexpr auto vector_access_dim = Number{}; - - constexpr auto src_data_per_access = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto long_vector_size = Number{}; - - constexpr auto long_vector_access_lengths = SliceLengths::Modify( - vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - - ford{}( - [&](auto long_vector_access_id) { - // data id w.r.t slicing-window - auto long_vector_data_begin_id = long_vector_access_id; - long_vector_data_begin_id(vector_access_dim) = - long_vector_size * long_vector_access_id[vector_access_dim]; - - // buffer to hold a long-vector - SrcData p_src_long_vector[long_vector_size]; - DstData p_dst_long_vector[long_vector_size]; - - // load data from src to the long-vector buffer - for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * src_data_per_access; - - const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex( - mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - const index_t buffer_offset = i * src_data_per_access; - - *reinterpret_cast(&p_src_long_vector[buffer_offset]) = - 
*reinterpret_cast(&p_src[src_offset]); - } - - // type conversion - for(index_t i = 0; i < long_vector_size; ++i) - { - p_dst_long_vector[i] = type_convert{}(p_src_long_vector[i]); - } - - // store data from the long-vector buffer to dst - for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * dst_data_per_access; - - const index_t buffer_offset = i * dst_data_per_access; - - const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex( - mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)); - - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_dst_long_vector[buffer_offset]); - } - }); - } - -private: - Array mSrcSliceOrigin; - Array mDstSliceOrigin; -}; - -// This version use TensorCoordinate_deprecated -// This threadwise copy allow vector access of src and dst. -// It allows the dimensions of vector access to be different on src and dst. -// It also allows the vector size to be different on src and dst. -// It also allows order of access to be different on src and dst. -// It use register as buffer to hold all data moving from src to dst. -// It is designed for copying small amount of data, and src and dst are -// device memory or LDS. -// When copying large amout of data, let's hope compiler will reduce register -// used for the buffer. -template -struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated -{ - static constexpr index_t nDim = SliceLengths::GetSize(); - - using Index = MultiIndex; - - using SrcCoordinate = typename TensorCoordinate_deprecated::type; - using DstCoordinate = typename TensorCoordinate_deprecated::type; - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated( - const Index& src_slice_origin, const Index& dst_slice_origin) - : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) - { - static_assert(nDim == SrcDesc::GetNumOfDimension() && - nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() && - nDim == SrcDimAccessOrder::GetSize() && - nDim == DstDimAccessOrder::GetSize(), - "wrong! # of dimensions not the same"); - - static_assert(is_valid_sequence_map::value && - is_valid_sequence_map::value, - "wrong! map is not valid"); - - static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 && - SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0, - "wrong! cannot evenly divide"); - - // check vectorized memory access - constexpr auto src_vector_access_dim = Number{}; - constexpr auto dst_vector_access_dim = Number{}; - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 || - SrcDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }); - - static_if{}( - [&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1), - "wrong! vectorized access is allowed only if stride == 1"); - }) - .Else([&](auto fwd) { - static_assert( - (fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 || - DstDataPerAccess == 1), - "wrong! 
vectorized access is allowed only if stride == 1"); - }); - } - - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1_deprecated() - : ThreadwiseGenericTensorSliceCopy_v2r1_deprecated(make_zero_array(), - make_zero_array()) - { - } - - __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin) - { - mSrcSliceOrigin = src_slice_origin; - } - - __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin) - { - mDstSliceOrigin = dst_slice_origin; - } - - template - struct IsolateMergedDimLengths - { - template - __device__ constexpr index_t operator()(IDim idim) const - { - return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1; - } - }; - - template - __device__ void Run(const SrcData* p_src, - DstData* p_dst, - integral_constant, - integral_constant) const - { - constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); - - SrcData p_src_buffer_[buffer_desc.GetElementSpace()]; - SrcData* p_src_buffer = p_src_buffer_; - - // copy data from src into buffer - { - constexpr auto src_vector_access_dim = Number{}; - constexpr auto src_data_per_access = Number{}; - - constexpr auto src_access_lengths = SliceLengths::Modify( - src_vector_access_dim, - SliceLengths::Get(src_vector_access_dim) / src_data_per_access); - - // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t - // normal dimensions is known at compile time. - // Below is a hack to isolate merged dimension id from normal dimension id, so the - // corresponding offset can be calculated seperately at run-time and compile-time. - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // merged dimensions, and has value = 1 on normal dimensions; - // src_merged_dim_access_lengths has the same value as src_access_lengths on src's - // normal dimensions, and has value = 1 on merged dimensions; - constexpr auto src_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto src_normal_dim_access_lengths = - src_access_lengths + Number<1>{} - src_merged_dim_access_lengths; - - ford{}( - [&](auto src_merged_dim_access_id) { - auto src_merged_dim_data_id = src_merged_dim_access_id; - src_merged_dim_data_id(src_vector_access_dim) = - src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access; - - // offset w.r.t. merged dimension need be computed at run-time, - const index_t src_merged_offset = - (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset(); - - ford{}( - [&](auto src_normal_dim_access_id) { - auto src_normal_dim_data_id = src_normal_dim_access_id; - src_normal_dim_data_id(src_vector_access_dim) = - src_normal_dim_access_id[src_vector_access_dim] * - src_data_per_access; - - // offset w.r.t. normal dimension is known at compile-time - const index_t src_normal_offset = - SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id); - - SrcData p_src_vector_data[SrcDataPerAccess]; - - transfer_data( - p_src, src_normal_offset + src_merged_offset, p_src_vector_data, 0); - - // unpack vector into buffer - for(index_t i = 0; i < SrcDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(src_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - src_merged_dim_data_id + src_normal_dim_data_id + scalar_id); - - p_src_buffer[buffer_offset] = p_src_vector_data[i]; - } - }); - }); - } - - // type conversion - // TODO: would compiler do a good job reusing register for buffer? 
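// [Editor's note] Illustrative sketch, not part of the deleted file: it models
// the "isolate merged dimensions" trick described above. The access lengths
// are split into a merged part and a normal part (value 1 on the other
// dimensions), the two parts are enumerated in nested loops, and the total
// offset is the sum of the offsets contributed by each part. This is a
// hypothetical simplification: both parts use the same packed row-major
// descriptor here, whereas in the deleted code the merged part goes through
// the merged-tensor coordinate at run time and the normal part is a
// compile-time native offset.
#include <array>
#include <cstdio>

constexpr int nDim = 3;

// packed row-major offset of a multi-index for the given lengths
int offset(const std::array<int, nDim>& lengths, const std::array<int, nDim>& id)
{
    int off = 0;
    for(int d = 0; d < nDim; ++d)
        off = off * lengths[d] + id[d];
    return off;
}

int main()
{
    std::array<int, nDim> lengths = {2, 3, 4};
    std::array<bool, nDim> merged = {true, false, false}; // dim 0 plays the "merged" role

    std::array<int, nDim> merged_lengths{}, normal_lengths{};
    for(int d = 0; d < nDim; ++d)
    {
        merged_lengths[d] = merged[d] ? lengths[d] : 1;
        normal_lengths[d] = merged[d] ? 1 : lengths[d];
    }

    // nested iteration over the two isolated length sets covers every
    // multi-index exactly once, and the two partial offsets add up to the
    // offset of the combined multi-index
    for(int m0 = 0; m0 < merged_lengths[0]; ++m0)
        for(int n1 = 0; n1 < normal_lengths[1]; ++n1)
            for(int n2 = 0; n2 < normal_lengths[2]; ++n2)
            {
                std::array<int, nDim> merged_id = {m0, 0, 0};
                std::array<int, nDim> normal_id = {0, n1, n2};
                std::array<int, nDim> full_id   = {m0, n1, n2};

                int sum = offset(lengths, merged_id) + offset(lengths, normal_id);
                std::printf("%d == %d\n", sum, offset(lengths, full_id));
            }
    return 0;
}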
- DstData p_dst_buffer_[buffer_desc.GetElementSpace()]; - DstData* p_dst_buffer = p_dst_buffer_; - - ford{}([&](auto idx) { - p_dst_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)] = - type_convert{}(p_src_buffer[buffer_desc.GetOffsetFromMultiIndex(idx)]); - }); - - // copy data from buffer into dst - { - constexpr auto dst_vector_access_dim = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto dst_access_lengths = SliceLengths::Modify( - dst_vector_access_dim, - SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access); - - constexpr auto dst_merged_dim_access_lengths = typename sequence_gen< - nDim, - IsolateMergedDimLengths>::type{}; - - constexpr auto dst_normal_dim_access_lengths = - dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths; - - ford{}( - [&](auto dst_merged_dim_access_id) { - auto dst_merged_dim_data_id = dst_merged_dim_access_id; - dst_merged_dim_data_id(dst_vector_access_dim) = - dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access; - - // offset w.r.t. merged dimension need be computed at run-time, - const index_t dst_merged_offset = - (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset(); - - ford{}( - [&](auto dst_normal_dim_access_id) { - auto dst_normal_dim_data_id = dst_normal_dim_access_id; - dst_normal_dim_data_id(dst_vector_access_dim) = - dst_normal_dim_access_id[dst_vector_access_dim] * - dst_data_per_access; - - DstData p_dst_vector_data[DstDataPerAccess]; - - // pack vector from buffer - for(index_t i = 0; i < DstDataPerAccess; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(dst_vector_access_dim) = i; - - const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex( - dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id); - - p_dst_vector_data[i] = p_dst_buffer[buffer_offset]; - } - - // offset w.r.t. 
normal dimension is known at compile-time - const index_t dst_normal_offset = - DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id); - - transfer_data( - p_dst_vector_data, 0, p_dst, dst_normal_offset + dst_merged_offset); - }); - }); - } - } - - template - __device__ void Run(const SrcData* p_src, DstData* p_dst) const - { - constexpr auto generic_address_space = - integral_constant{}; - - Run(p_src, p_dst, generic_address_space, generic_address_space); - } - - // T can be Sequence or Array - template - __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mSrcSliceOrigin += step_sizes; - }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); - } - - template - __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) - { - static_if{}([&](auto) { - mDstSliceOrigin += step_sizes; - }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); - } - -private: - SrcCoordinate mSrcSliceOrigin; - DstCoordinate mDstSliceOrigin; -}; - -} // namespace ck -#endif diff --git a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp b/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp deleted file mode 100644 index 1c7fd21e3c..0000000000 --- a/src/kernels/static_composable_kernel/src/kernel_wrapper/static_kernel_gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.cpp +++ /dev/null @@ -1,329 +0,0 @@ -#include "static_kernel_common_header.hpp" -#include "static_kernel_ConstantTensorDescriptor_deprecated.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_fp16_bfp16_fwd_nchw_kcyx_nkhw_lds_double_buffer.hpp" -#include "gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_fp16_bfp16_wrw_nchw_kcyx_nkhw_lds_double_buffer.hpp" -#include "float_types.h" - -extern "C" __global__ - __launch_bounds__(CK_PARAM_TUNABLE_BLOCK_SIZE, 2) void gridwise_convolution_implicit_gemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer( - const FLOAT* const __restrict__ p_in_global, - const FLOAT* const __restrict__ p_wei_global, - FLOAT* const __restrict__ p_out_global) -{ - using namespace ck; - - // read params: problem decription - constexpr index_t N = CK_PARAM_PROBLEM_N; - constexpr index_t K = CK_PARAM_PROBLEM_K; - constexpr index_t C = CK_PARAM_PROBLEM_C; - constexpr index_t Hi = CK_PARAM_PROBLEM_HI; - constexpr index_t Wi = CK_PARAM_PROBLEM_WI; - constexpr index_t Ho = CK_PARAM_PROBLEM_HO; - constexpr index_t Wo = CK_PARAM_PROBLEM_WO; - constexpr index_t Y = CK_PARAM_PROBLEM_Y; - constexpr index_t X = CK_PARAM_PROBLEM_X; - - constexpr index_t ConvStrideH = CK_PARAM_PROBLEM_CONV_STRIDE_H; - constexpr index_t ConvStrideW = CK_PARAM_PROBLEM_CONV_STRIDE_W; - - constexpr index_t ConvDilationH = CK_PARAM_PROBLEM_CONV_DILATION_H; - constexpr index_t ConvDilationW = CK_PARAM_PROBLEM_CONV_DILATION_W; - - // read params: tunable params - constexpr index_t BlockSize = CK_PARAM_TUNABLE_BLOCK_SIZE; - - constexpr index_t GemmMPerBlock = CK_PARAM_TUNABLE_GEMM_M_PER_BLOCK; - constexpr index_t GemmNPerBlock = CK_PARAM_TUNABLE_GEMM_N_PER_BLOCK; - constexpr index_t GemmKPerBlock = CK_PARAM_TUNABLE_GEMM_K_PER_BLOCK; - constexpr index_t GemmKBlocks = CK_PARAM_TUNABLE_GEMM_K_BLOCKS; - - // read params: dependent params - constexpr index_t 
GridSize = CK_PARAM_DEPENDENT_GRID_SIZE; - - constexpr index_t LeftPadH = CK_PARAM_PROBLEM_LEFT_PAD_H; - constexpr index_t LeftPadW = CK_PARAM_PROBLEM_LEFT_PAD_W; - - constexpr index_t RightPadH = CK_PARAM_PROBLEM_RIGHT_PAD_H; - constexpr index_t RightPadW = CK_PARAM_PROBLEM_RIGHT_PAD_W; - - using LeftPads = Sequence; - using RightPads = Sequence; - -// calculate dependent params amd heuristic params -#if CK_PARAM_PROBLEM_DIRECTION == 2 - // In the WrW direction the filter is the output, while the output image is the input being - // convolved with the (original) input image. This requires that the tensordescriptors be - // swapped - // To reuse the fwd kernel for this operation we need to swap the n and c dimension of the - // input descriptor, the n and k dimension of the output descriptor - // This change is necessary so that reduction dimensions are consistent with the requirement - // of the wrw convolution when used in a fwd context - constexpr auto tmp_in_nchw_desc = - make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto tmp_wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto tmp_out_nkhw_desc = - make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto in_nchw_desc = - reorder_tensor_descriptor_given_upper2lower(tmp_in_nchw_desc, Sequence<1, 0, 2, 3>{}); - // wei and out are swapped in the solver - constexpr auto wei_kcyx_desc = - reorder_tensor_descriptor_given_upper2lower(tmp_out_nkhw_desc, Sequence<1, 0, 2, 3>{}); - constexpr auto out_nkhw_desc = - reorder_tensor_descriptor_given_upper2lower(tmp_wei_kcyx_desc, Sequence<1, 0, 2, 3>{}); - constexpr auto dir = ImplicitGemmDirection::BackwardWeight; - - // swap stride and dilation - using ConvDilations = Sequence; - using ConvStrides = Sequence; -#else - static_assert(GemmKBlocks == 1, "do not support GemmKBlocks > 1 for forward!"); - // calculate dependent params amd heuristic params - constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence{}); - constexpr auto out_nkhw_desc = make_native_tensor_descriptor_packed(Sequence{}); - - constexpr auto dir = ImplicitGemmDirection::ForwardData; - using ConvStrides = Sequence; - using ConvDilations = Sequence; -#endif // CK_PARAM_PROBLEM_DIRECTION == 2 - - constexpr index_t GemmBBlockCopyClusterLengths_GemmK = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K; - constexpr index_t GemmBBlockCopyClusterLengths_GemmN = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_N; - - constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmK = - GemmKPerBlock / GemmBBlockCopyClusterLengths_GemmK; - constexpr index_t GemmBBlockCopyThreadSliceLengths_GemmN = - GemmNPerBlock / GemmBBlockCopyClusterLengths_GemmN; - - constexpr index_t GemmABlockCopyClusterLengths_GemmK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_K; - constexpr index_t GemmABlockCopyClusterLengths_GemmM = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_CLUSTER_LENGTHS_GEMM_M; - - constexpr index_t GemmABlockCopyThreadSliceLengths_GemmK = - GemmKPerBlock / GemmABlockCopyClusterLengths_GemmK; - constexpr index_t GemmABlockCopyThreadSliceLengths_GemmM = - GemmMPerBlock / GemmABlockCopyClusterLengths_GemmM; - -#if MIOPEN_USE_FP32 - using GemmABlockCopyThreadSliceLengths_GemmG_GemmK_GemmM = - Sequence<1, GemmABlockCopyThreadSliceLengths_GemmK, GemmABlockCopyThreadSliceLengths_GemmM>; - using GemmABlockCopyThreadClusterLengths_GemmG_GemmK_GemmM = - Sequence<1, 
GemmABlockCopyClusterLengths_GemmK, GemmABlockCopyClusterLengths_GemmM>; - - using GemmABlockCopyThreadClusterArrangeOrder = Sequence<0, 2, 1>; // [E0, K, E1] - using GemmABlockCopySrcAccessOrder = Sequence<0, 2, 1>; // [E0, K, E1] - using GemmABlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E0, E1, K] - - constexpr index_t GemmABlockCopySrcDataPerRead_GemmM = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_M; - - using GemmBBlockCopyThreadSliceLengths_GemmG_GemmK_GemmN = - Sequence<1, GemmBBlockCopyThreadSliceLengths_GemmK, GemmBBlockCopyThreadSliceLengths_GemmN>; - using GemmBBlockCopyThreadClusterLengths_GemmG_GemmK_GemmN = - Sequence<1, GemmBBlockCopyClusterLengths_GemmK, GemmBBlockCopyClusterLengths_GemmN>; - - using GemmBBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 2>; // [E0, E1, B] - using GemmBBlockCopySrcAccessOrder = Sequence<0, 1, 2>; // [E0, E1, B] - using GemmBBlockCopyDstAccessOrder = Sequence<0, 1, 2>; // [E0, E1, B] - - constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_N; - - constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_K; - -#elif MIOPEN_USE_FP16 || MIOPEN_USE_BFP16 - constexpr index_t GemmKPACK = CK_PARAM_GEMM_KPACK_LENGTH; - - using GemmABlockCopySubLengths_GemmG_GemmK_GemmM_GemmKPACK = - Sequence<1, - GemmABlockCopyThreadSliceLengths_GemmK, - GemmABlockCopyThreadSliceLengths_GemmM, - GemmKPACK>; - using GemmABlockCopyClusterLengths_GemmG_GemmK_GemmM_GemmKPACK = - Sequence<1, GemmABlockCopyClusterLengths_GemmK, GemmABlockCopyClusterLengths_GemmM, 1>; - - using GemmABlockCopyThreadClusterArrangeOrder = Sequence<0, 2, 1, 3>; // [G, M, K, GemmKPACK] - using GemmABlockCopySrcAccessOrder = Sequence<0, 2, 1, 3>; // [G, M, K, GemmKPACK] - using GemmABlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [G, K, M, GemmKPACK] - - constexpr index_t GemmABlockCopyDstDataPerWrite_GemmKPACK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_KPACK; - - using GemmBBlockCopySubLengths_GemmG_GemmK_GemmN_GemmKPACK = - Sequence<1, - GemmBBlockCopyThreadSliceLengths_GemmK, - GemmBBlockCopyThreadSliceLengths_GemmN, - GemmKPACK>; - using GemmBBlockCopyClusterLengths_GemmG_GemmK_GemmN_GemmKPACK = - Sequence<1, GemmBBlockCopyClusterLengths_GemmK, GemmBBlockCopyClusterLengths_GemmN, 1>; - - using GemmBBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [G, K, GemmKPACK, B] - using GemmBBlockCopySrcAccessOrder = Sequence<0, 1, 3, 2>; // [G, K, GemmKPACK, B] - using GemmBBlockCopyDstAccessOrder = Sequence<0, 1, 2, 3>; // [G, K, B, GemmKPACK] - - constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmKPACK = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_DST_DATA_PER_WRITE_GEMM_KPACK; - -#if CK_PARAM_PROBLEM_DIRECTION == 2 - constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_K; -#else - constexpr index_t GemmABlockCopySrcDataPerRead_GemmKPACK = - CK_PARAM_TUNABLE_GEMM_A_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_KPACK; -#endif // CK_PARAM_PROBLEM_DIRECTION - -#endif // MIOPEN_USE_FP16 || MIOPEN_USE_BFP16 - - constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = - CK_PARAM_TUNABLE_GEMM_B_BLOCK_COPY_SRC_DATA_PER_READ_GEMM_N; - - constexpr auto GemmMPerWave = CK_PARAM_GEMM_M_PER_WAVE; - constexpr auto GemmNPerWave = CK_PARAM_GEMM_N_PER_WAVE; - constexpr index_t GemmThreadGemmDataPerReadM = 1; - constexpr index_t GemmThreadGemmDataPerReadN = 1; - -#if MIOPEN_USE_FP32 - constexpr auto 
gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - FLOAT, - FLOAT_ACCUM, - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - GemmMPerBlock, - GemmNPerBlock, - GemmKPerBlock, - GemmKBlocks, - GemmMPerWave, - GemmNPerWave, - GemmThreadGemmDataPerReadM, - GemmThreadGemmDataPerReadN, - GemmABlockCopyThreadSliceLengths_GemmG_GemmK_GemmM, - GemmABlockCopyThreadClusterLengths_GemmG_GemmK_GemmM, - GemmABlockCopyThreadClusterArrangeOrder, - GemmABlockCopySrcAccessOrder, - GemmABlockCopyDstAccessOrder, - GemmABlockCopySrcDataPerRead_GemmK, - GemmABlockCopySrcDataPerRead_GemmM, - GemmBBlockCopyThreadSliceLengths_GemmG_GemmK_GemmN, - GemmBBlockCopyThreadClusterLengths_GemmG_GemmK_GemmN, - GemmBBlockCopyThreadClusterArrangeOrder, - GemmBBlockCopySrcAccessOrder, - GemmBBlockCopyDstAccessOrder, - GemmBBlockCopySrcDataPerRead_GemmN, - GemmBBlockCopyDstDataPerWrite_GemmN, - dir>{}; - gridwise_conv.Run(p_in_global, p_wei_global, p_out_global); - -#elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION == 2 - - // Backward weight in fp16/bfp16 uses atomic add to do reduction along K dimension - // It requires output blob to be of float as no atomic add exists for fp16/ushort - constexpr auto gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_wrw_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) - FLOAT_ACCUM, // Acc data type = float (see float_types.h) - float, // Output data type = float (not fp16/ushort) as no atomic add ISA exists for - // fp16/ushort. - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - ConvStrides, - ConvDilations, - LeftPads, - RightPads, - GemmMPerBlock, - GemmNPerBlock, - GemmKPerBlock, - GemmKBlocks, - GemmKPACK, - GemmMPerWave, - GemmNPerWave, - GemmThreadGemmDataPerReadM, - GemmThreadGemmDataPerReadN, - GemmABlockCopySubLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyClusterLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyThreadClusterArrangeOrder, - GemmABlockCopySrcAccessOrder, - GemmABlockCopyDstAccessOrder, - GemmABlockCopySrcDataPerRead_GemmK, - GemmABlockCopyDstDataPerWrite_GemmKPACK, - GemmBBlockCopySubLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyClusterLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyThreadClusterArrangeOrder, - GemmBBlockCopySrcAccessOrder, - GemmBBlockCopyDstAccessOrder, - GemmBBlockCopySrcDataPerRead_GemmN, - GemmBBlockCopyDstDataPerWrite_GemmKPACK, - dir>{}; - - // Output blob is cast to float as no atomic add exists for fp16/ushort - gridwise_conv.Run(p_in_global, p_wei_global, reinterpret_cast(p_out_global)); -#elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16) && CK_PARAM_PROBLEM_DIRECTION != 2 - // Forward data doesn't use any atomic add so output blob remains of the same type - // as input blob - - constexpr auto wkgrp_schd_order = -#if MIOPEN_USE_FP16 - NBlock1MBlock0; -#else - MBlock1NBlock0; -#endif // MIOPEN_USE_FP16 - - constexpr auto gridwise_conv = - GridwiseConvolutionImplicitGemm_v4r4_gen_xdlops_fp16_bfp16_fwd_nchw_kcyx_nkhw_lds_double_buffer< - GridSize, - BlockSize, - FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) - FLOAT_ACCUM, // Acc data type = float (see float_types.h) - FLOAT, // Input data type = fp16 (fp16) or ushort (bfp16) - decltype(in_nchw_desc), - decltype(wei_kcyx_desc), - decltype(out_nkhw_desc), - 
ConvStrides, - ConvDilations, - LeftPads, - RightPads, - GemmMPerBlock, - GemmNPerBlock, - GemmKPerBlock, - GemmKBlocks, - GemmKPACK, - GemmMPerWave, - GemmNPerWave, - GemmThreadGemmDataPerReadM, - GemmThreadGemmDataPerReadN, - GemmABlockCopySubLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyClusterLengths_GemmG_GemmK_GemmM_GemmKPACK, - GemmABlockCopyThreadClusterArrangeOrder, - GemmABlockCopySrcAccessOrder, - GemmABlockCopyDstAccessOrder, - GemmABlockCopySrcDataPerRead_GemmKPACK, - GemmABlockCopyDstDataPerWrite_GemmKPACK, - GemmBBlockCopySubLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyClusterLengths_GemmG_GemmK_GemmN_GemmKPACK, - GemmBBlockCopyThreadClusterArrangeOrder, - GemmBBlockCopySrcAccessOrder, - GemmBBlockCopyDstAccessOrder, - GemmBBlockCopySrcDataPerRead_GemmN, - GemmBBlockCopyDstDataPerWrite_GemmKPACK, - dir, - wkgrp_schd_order>{}; - gridwise_conv.Run(p_in_global, p_wei_global, p_out_global); -#else - static_assert(false, "wrong! Only fp32, fp16 and bfp16 are supported."); -#endif // MIOPEN_USE_FP32 -}
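[Editor's note] For reference, a minimal host-side C++ analogue of the LDS double-buffering schedule used by the gridwise GEMM main loop earlier in this diff: while the GEMM consumes the "current" half of a two-slot buffer, the next K-slice is prefetched and committed to the other half. This editor-added sketch assumes K % (2 * KPerBlock) == 0, collapses the global-to-register and register-to-LDS staging into a single prefetch step, and uses hypothetical names throughout; there is no LDS and no __syncthreads here.

#include <cstdio>
#include <vector>

int main()
{
    constexpr int KPerBlock = 4;
    constexpr int K         = 24; // assumed divisible by 2 * KPerBlock
    std::vector<int> global(K);
    for(int k = 0; k < K; ++k) global[k] = k;

    int  block[2][KPerBlock]; // stand-in for the two LDS halves
    long acc    = 0;          // stand-in for the per-thread accumulator
    int  k_next = 0;

    // preload: first K-slice into half 0
    for(int i = 0; i < KPerBlock; ++i) block[0][i] = global[k_next + i];
    k_next += KPerBlock;

    int cur = 0;
    while(k_next < K)
    {
        // prefetch the next slice into the *other* half. In the real kernel
        // this is split into a global->register load issued before the GEMM
        // and a register->LDS store after it, so compute and memory overlap.
        for(int i = 0; i < KPerBlock; ++i) block[cur ^ 1][i] = global[k_next + i];
        k_next += KPerBlock;

        // "GEMM" on the current half (modeled as a reduction)
        for(int i = 0; i < KPerBlock; ++i) acc += block[cur][i];

        cur ^= 1; // ping-pong between the two halves
    }

    // tail: consume the last resident slice
    for(int i = 0; i < KPerBlock; ++i) acc += block[cur][i];

    std::printf("acc = %ld (expected %d)\n", acc, K * (K - 1) / 2);
    return 0;
}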