diff --git a/library/include/rocwmma/internal/blend.hpp b/library/include/rocwmma/internal/blend.hpp
index 79f1ba33..776b0751 100644
--- a/library/include/rocwmma/internal/blend.hpp
+++ b/library/include/rocwmma/internal/blend.hpp
@@ -99,6 +99,7 @@ namespace rocwmma
         using Zip4    = Driver<BlendImpl::Ops::Zip4>;
         using Zip8    = Driver<BlendImpl::Ops::Zip8>;
         using Zip16   = Driver<BlendImpl::Ops::Zip16>;
+        using Zip32   = Driver<BlendImpl::Ops::Zip32>;
 
         // Unpack functions
         using UnpackByteLo   = Driver<BlendImpl::Ops::UnpackByteLo>;
@@ -107,6 +108,18 @@ namespace rocwmma
         using UnpackWordHi   = Driver<BlendImpl::Ops::UnpackWordHi>;
         using UnpackByteLoHi = Driver<BlendImpl::Ops::UnpackByteLoHi>;
 
+        // Extract functions (even / odd byte and word selections; see BlendImpl::Ops)
+        using ExtractByteEven = Driver<BlendImpl::Ops::ExtractByteEven>;
+        using ExtractByteOdd  = Driver<BlendImpl::Ops::ExtractByteOdd>;
+        using ExtractWordEven = Driver<BlendImpl::Ops::ExtractWordEven>;
+        using ExtractWordOdd  = Driver<BlendImpl::Ops::ExtractWordOdd>;
+
+        using ExtractByteEvenOdd = Driver<BlendImpl::Ops::ExtractByteEvenOdd>;
+        using ExtractWordEvenOdd = Driver<BlendImpl::Ops::ExtractWordEvenOdd>;
+
+        using ExtractByteOddEven = Driver<BlendImpl::Ops::ExtractByteOddEven>;
+        using ExtractWordOddEven = Driver<BlendImpl::Ops::ExtractWordOddEven>;
+
     } // namespace Blend
 
 } // namespace rocwmma
diff --git a/library/include/rocwmma/internal/blend_impl.hpp b/library/include/rocwmma/internal/blend_impl.hpp
index 59f2a563..c0bb5861 100644
--- a/library/include/rocwmma/internal/blend_impl.hpp
+++ b/library/include/rocwmma/internal/blend_impl.hpp
@@ -49,6 +49,7 @@ namespace rocwmma
         using Properties::OP_GROUP_SIZE_1;
         using Properties::OP_GROUP_SIZE_16;
         using Properties::OP_GROUP_SIZE_2;
+        using Properties::OP_GROUP_SIZE_32;
         using Properties::OP_GROUP_SIZE_4;
         using Properties::OP_GROUP_SIZE_8;
 
@@ -247,6 +248,7 @@ namespace rocwmma
             using Zip4    = Zip<OP_GROUP_SIZE_4>;
             using Zip8    = Zip<OP_GROUP_SIZE_8>;
             using Zip16   = Zip<OP_GROUP_SIZE_16>;
+            using Zip32   = Zip<OP_GROUP_SIZE_32>;
 
             // Blend sub-dword elements in regular ordered patterns
             using UnpackByteLo   = PermByte<0u, 4u, 1u, 5u>;
@@ -255,6 +257,16 @@ namespace rocwmma
             using UnpackWordHi   = PermWord<1u, 3u>;
             using UnpackByteLoHi = PermByte<0u, 6u, 1u, 7u>;
 
+            using ExtractByteEven = PermByte<0u, 2u, 4u, 6u>; // even byte indices
+            using ExtractByteOdd  = PermByte<1u, 3u, 5u, 7u>; // odd byte indices
+            using ExtractWordEven = UnpackWordLo; // reuses UnpackWordLo unchanged
+            using ExtractWordOdd  = UnpackWordHi; // reuses UnpackWordHi, i.e. PermWord<1u, 3u>
+
+            using ExtractByteEvenOdd = PermByte<0u, 2u, 5u, 7u>; // even indices in 0-3, odd in 4-7
+            using ExtractByteOddEven = PermByte<1u, 3u, 4u, 6u>; // odd indices in 0-3, even in 4-7
+            using ExtractWordEvenOdd = PermWord<0u, 3u>; // even index in 0-1, odd index in 2-3
+            using ExtractWordOddEven = PermWord<1u, 2u>; // odd index in 0-1, even index in 2-3
+
         } // namespace Ops
 
     } // namespace BlendImpl
diff --git a/library/include/rocwmma/internal/convert.hpp b/library/include/rocwmma/internal/convert.hpp
index fe9807fc..1236c33f 100644
--- a/library/include/rocwmma/internal/convert.hpp
+++ b/library/include/rocwmma/internal/convert.hpp
@@ -27,6 +27,7 @@
 #define ROCWMMA_CONVERT_HPP
 
 #include "types.hpp"
+#include "utility/forward.hpp"
 
 namespace rocwmma
 {
@@ -58,7 +59,7 @@ namespace rocwmma
             template <typename IncomingT>
             ROCWMMA_DEVICE static inline auto exec(IncomingT&& regsIn) -> IncomingT&&
             {
-                return std::forward<IncomingT>(regsIn);
+                return forward<IncomingT>(regsIn);
             }
         };
 
diff --git a/library/include/rocwmma/internal/coop_load.hpp b/library/include/rocwmma/internal/coop_load.hpp
index 8690a6ba..8a12a5a9 100644
--- a/library/include/rocwmma/internal/coop_load.hpp
+++ b/library/include/rocwmma/internal/coop_load.hpp
@@ -63,7 +63,7 @@ namespace rocwmma
 
         // Outer loop = index 0,
         // Inner loop = index N-1
-        template <std::size_t Depth = 0,
+        template <size_t Depth = 0,
                   typename Iterator,
                   typename StrideSpace,
                   typename Strides2d>
@@ -73,14 +73,14 @@ namespace rocwmma
                                                        StrideSpace&& strideSpace,
                                                        Strides2d&&   strides2d)
         {
-            static_assert(VecTraits<std::decay_t<StrideSpace>>::size()
-                              == VecTraits<std::decay_t<Strides2d>>::size(),
+            static_assert(VecTraits<decay_t<StrideSpace>>::size()
+                              == VecTraits<decay_t<Strides2d>>::size(),
                           "Mismatched size");
-            auto strideOffset = DataLayout::fromMatrixCoord(std::get<Depth>(strides2d), ldm);
-            auto strideCount  = std::get<Depth>(strideSpace);
+            auto strideOffset = DataLayout::fromMatrixCoord(get<Depth>(strides2d), ldm);
+            auto strideCount  = get<Depth>(strideSpace);
 
             // Last depth layer will invoke the load
-            if constexpr(Depth == (VecTraits<std::decay_t<StrideSpace>>::size() - 1u))
+            if constexpr(Depth == (VecTraits<decay_t<StrideSpace>>::size() - 1u))
             {
 #pragma unroll
                 for(int i = 0; i < strideCount; i++)
@@ -135,7 +135,7 @@ namespace rocwmma
             }
 
             // Split the reduced stride space.
-            auto workItemsPerWave = std::max(totalWorkItems / maxWaves, 1u);
+            auto workItemsPerWave = max(totalWorkItems / maxWaves, 1u);
             auto strideSpaceS     = inflate_coord_left(workItemsPerWave - 1u, strideSpaceR) + 1u;
 
             // Add back in the VW dimension, for the full stride
@@ -191,7 +191,7 @@ namespace rocwmma
             }
 
             // Split the reduced stride space.
-            constexpr auto workItemsPerWave = std::max(totalWorkItems / maxWaves, 1u);
+            constexpr auto workItemsPerWave = max(totalWorkItems / maxWaves, 1u);
             constexpr auto strideSpaceS
                 = inflate_coord_left(workItemsPerWave - 1u, strideSpaceR) + 1u;
 
diff --git a/library/include/rocwmma/internal/coop_store.hpp b/library/include/rocwmma/internal/coop_store.hpp
index b781bd40..0dd6e1d9 100644
--- a/library/include/rocwmma/internal/coop_store.hpp
+++ b/library/include/rocwmma/internal/coop_store.hpp
@@ -64,7 +64,7 @@ namespace rocwmma
 
         // Outer loop = index 0,
         // Inner loop = index N-1
-        template <std::size_t Depth = 0,
+        template <size_t Depth = 0,
                   typename Iterator,
                   typename StrideSpace,
                   typename Strides2d>
@@ -74,14 +74,14 @@ namespace rocwmma
                                                        StrideSpace&& strideCounts,
                                                        Strides2d&&   strides2d)
         {
-            static_assert(VecTraits<std::decay_t<StrideSpace>>::size()
-                              == VecTraits<std::decay_t<Strides2d>>::size(),
+            static_assert(VecTraits<decay_t<StrideSpace>>::size()
+                              == VecTraits<decay_t<Strides2d>>::size(),
                           "Mismatched size");
-            auto strideOffset = DataLayout::fromMatrixCoord(std::get<Depth>(strides2d), ldm);
-            auto strideCount  = std::get<Depth>(strideCounts);
+            auto strideOffset = DataLayout::fromMatrixCoord(get<Depth>(strides2d), ldm);
+            auto strideCount  = get<Depth>(strideCounts);
 
             // Last depth layer will invoke the load
-            if constexpr(Depth == (VecTraits<std::decay_t<StrideSpace>>::size() - 1u))
+            if constexpr(Depth == (VecTraits<decay_t<StrideSpace>>::size() - 1u))
             {
 #pragma unroll
                 for(int i = 0; i < strideCount; i++)
@@ -136,7 +136,7 @@ namespace rocwmma
             }
 
             // Split the reduced stride space.
-            auto workItemsPerWave = std::max(totalWorkItems / maxWaves, 1u);
+            auto workItemsPerWave = max(totalWorkItems / maxWaves, 1u);
             auto strideSpaceS     = inflate_coord_left(workItemsPerWave - 1u, strideSpaceR) + 1u;
 
             // Add back in the VW dimension, for the full stride
@@ -190,7 +190,7 @@ namespace rocwmma
             }
 
             // Split the reduced stride space.
-            constexpr auto workItemsPerWave = std::max(totalWorkItems / maxWaves, 1u);
+            constexpr auto workItemsPerWave = max(totalWorkItems / maxWaves, 1u);
             constexpr auto strideSpaceS
                 = inflate_coord_left(workItemsPerWave - 1u, strideSpaceR) + 1u;
 
diff --git a/library/include/rocwmma/internal/float8.h b/library/include/rocwmma/internal/float8.h
index 71c1d5fb..b32ac3c9 100644
--- a/library/include/rocwmma/internal/float8.h
+++ b/library/include/rocwmma/internal/float8.h
@@ -34,12 +34,6 @@
 using uint8_t  = __hip_internal::uint8_t;
 using uint16_t = __hip_internal::uint16_t;
 
-namespace std
-{
-    template <bool B, class T, class F>
-    struct conditional;
-}
-
 #endif
 
 // We are clipping in down conversion by default
@@ -771,7 +765,7 @@ inline ROCWMMA_HOST_DEVICE bool operator!=(rocwmma_bf8 a, rocwmma_bf8 b)
 template <typename T,
           typename Ta,
           bool stochastic_rounding,
-          typename std::enable_if<std::is_same<T, Ta>{}, int>::type = 0>
+          typename rocwmma::enable_if<rocwmma::is_same<T, Ta>{}, int>::type = 0>
 inline ROCWMMA_HOST_DEVICE T explicit_downcast(Ta a)
 {
     // same type, no conversion
@@ -779,20 +773,20 @@ inline ROCWMMA_HOST_DEVICE T explicit_downcast(Ta a)
 }
 
 // Use h/w intrinsic and optimized version when __gfx940__
-template <
-    typename T,
-    typename Ta,
-    bool stochastic_rounding,
-    typename std::enable_if<(!(std::is_same<T, Ta>{})
-                             && (std::is_same<T, rocwmma_f8>{} || std::is_same<T, rocwmma_bf8>{})),
-                            int>::type
-    = 0>
+template <typename T,
+          typename Ta,
+          bool stochastic_rounding,
+          typename rocwmma::enable_if<(!(rocwmma::is_same<T, Ta>{})
+                                       && (rocwmma::is_same<T, rocwmma_f8>{}
+                                           || rocwmma::is_same<T, rocwmma_bf8>{})),
+                                      int>::type
+          = 0>
 inline ROCWMMA_HOST_DEVICE T explicit_downcast(Ta a, uint32_t rng)
 {
 #if ROCWMMA_ARCH_GFX940 || ROCWMMA_ARCH_GFX941 || ROCWMMA_ARCH_GFX942
     // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize away one runtime branch
     T val;
-    if(std::is_same<T, rocwmma_f8>::value)
+    if(rocwmma::is_same<T, rocwmma_f8>::value)
     {
         val.data = rocwmma_f8::cast_to_f8_from_f32<stochastic_rounding>(float(a), rng);
     }
@@ -811,14 +805,14 @@ inline ROCWMMA_HOST_DEVICE T explicit_downcast(Ta a, uint32_t rng)
 
 // NOTE NOTE: The above code is good if we don't consider HIP-GEMM code and only consider the quantization
 // However, if we need HIP-GEMM for fall-back, we would need explicit_cast handles Tacc=f32 to To=f16/bf16 conversion
-template <
-    typename T,
-    typename Ta,
-    bool stochastic_rounding,
-    typename std::enable_if<(!(std::is_same<T, Ta>{})
-                             && !(std::is_same<T, rocwmma_f8>{} || std::is_same<T, rocwmma_bf8>{})),
-                            int>::type
-    = 0>
+template <typename T,
+          typename Ta,
+          bool stochastic_rounding,
+          typename rocwmma::enable_if<(!(rocwmma::is_same<T, Ta>{})
+                                       && !(rocwmma::is_same<T, rocwmma_f8>{}
+                                            || rocwmma::is_same<T, rocwmma_bf8>{})),
+                                      int>::type
+          = 0>
 inline ROCWMMA_HOST_DEVICE T explicit_downcast(Ta a, uint32_t rng)
 {
     // the return type is not a F8 types, no SR for those types
diff --git a/library/include/rocwmma/internal/io_layout.hpp b/library/include/rocwmma/internal/io_layout.hpp
index d07e5b41..42878c0f 100644
--- a/library/include/rocwmma/internal/io_layout.hpp
+++ b/library/include/rocwmma/internal/io_layout.hpp
@@ -132,7 +132,7 @@ namespace rocwmma
         {
             MaxVW = detail::MaxVWSelector<matrix_a, BlockDim, KDim, DataT, DataLayoutT, WaveCount>::
                 Result,
-            VW = std::is_same<DataLayoutT, row_major>::value ? MaxVW : 1u
+            VW = is_same<DataLayoutT, row_major>::value ? MaxVW : 1u
         };
 
         // Layout mapping for 1d / 2d
@@ -140,7 +140,7 @@ namespace rocwmma
         using MatrixLayout
             = MatrixLayout::template ColNT<BlockDim, KDim, DataT, DataLayoutT, VW, MaxVW>;
 
-        static_assert(!(std::is_same_v<DataLayoutT, col_major> && VW > 1),
+        static_assert(!(is_same_v<DataLayoutT, col_major> && VW > 1),
                       "matrix_a in col_major currently does not support VW > 1");
     };
 
@@ -156,7 +156,7 @@ namespace rocwmma
         {
             MaxVW = detail::MaxVWSelector<matrix_b, BlockDim, KDim, DataT, DataLayoutT, WaveCount>::
                 Result,
-            VW = std::is_same<DataLayoutT, col_major>::value ? MaxVW : 1u
+            VW = is_same<DataLayoutT, col_major>::value ? MaxVW : 1u
         };
 
         // Layout mapping for 1d / 2d
@@ -164,7 +164,7 @@ namespace rocwmma
         using MatrixLayout
             = MatrixLayout::template RowNT<BlockDim, KDim, DataT, DataLayoutT, VW, MaxVW>;
 
-        static_assert(!(std::is_same_v<DataLayoutT, row_major> && VW > 1),
+        static_assert(!(is_same_v<DataLayoutT, row_major> && VW > 1),
                       "matrix_b in row_major currently does not support VW > 1");
     };
 
@@ -178,8 +178,8 @@ namespace rocwmma
         // Vector size properties
         enum : uint32_t
         {
-            MaxVW = (std::is_same<DataT, float64_t>::value || ROCWMMA_ARCH_GFX11) ? 1u : 4u,
-            VW    = std::is_same<DataLayoutT, col_major>::value ? MaxVW : 1u
+            MaxVW = (is_same<DataT, float64_t>::value || ROCWMMA_ARCH_GFX11) ? 1u : 4u,
+            VW    = is_same<DataLayoutT, col_major>::value ? MaxVW : 1u
         };
 
         // Layout mapping for 1d / 2d
@@ -187,7 +187,7 @@ namespace rocwmma
         using MatrixLayout
             = MatrixLayout::template RowNT<BlockDim, KDim, DataT, DataLayoutT, VW, MaxVW>;
 
-        static_assert(!(std::is_same<DataLayoutT, row_major>::value && VW > 1),
+        static_assert(!(is_same<DataLayoutT, row_major>::value && VW > 1),
                       "accumulator in row_major currently does not support VW > 1");
     };
 
diff --git a/library/include/rocwmma/internal/layout.hpp b/library/include/rocwmma/internal/layout.hpp
index 48462bcb..b02c231f 100644
--- a/library/include/rocwmma/internal/layout.hpp
+++ b/library/include/rocwmma/internal/layout.hpp
@@ -26,6 +26,7 @@
 #ifndef ROCWMMA_LAYOUT_HPP
 #define ROCWMMA_LAYOUT_HPP
 
+#include "utility/type_traits.hpp"
 #include "layout_impl.hpp"
 
 namespace rocwmma
@@ -188,8 +189,8 @@ namespace rocwmma
                   typename DataLayout,
                   uint32_t VectorWidth,
                   uint32_t MaxVectorWidth>
-        struct ColNT : public std::conditional_t<
-                           std::is_same<DataLayout, col_major>::value,
+        struct ColNT : public conditional_t<
+                           is_same<DataLayout, col_major>::value,
                            detail::ColOrthoVW<BlockDim, BlockK, DataT, 1, MaxVectorWidth>,
                            detail::ColOrthoVW<BlockDim, BlockK, DataT, VectorWidth, MaxVectorWidth>>
         {
@@ -202,11 +203,11 @@ namespace rocwmma
                 // elements in both row_major or col_major data layouts.
                 // This layout cannot support for VW > 1 in col_major data layout otherwise the
                 // ordering is broken.
-                static_assert(!(std::is_same_v<DataLayout, col_major> && VectorWidth > 1),
+                static_assert(!(is_same_v<DataLayout, col_major> && VectorWidth > 1),
                               "ColNT in col_major does not support VectorWidth > 1");
 
                 // Must ensure that MaxVectorWidth fits inside the leading dimension
-                static_assert(std::is_same_v<DataLayout, row_major> && (MaxVectorWidth <= BlockK),
+                static_assert(!(is_same_v<DataLayout, row_major> && (MaxVectorWidth > BlockK)),
                     "MaxVectorWidth is larger than BlockK dimension. Try reducing MaxVectorWidth");
             };
         };
@@ -315,8 +316,8 @@ namespace rocwmma
                   typename DataLayout,
                   uint32_t VectorWidth,
                   uint32_t MaxVectorWidth>
-        struct RowNT : public std::conditional_t<
-                           std::is_same<DataLayout, col_major>::value,
+        struct RowNT : public conditional_t<
+                           is_same<DataLayout, col_major>::value,
                            detail::RowOrthoVW<BlockDim, BlockK, DataT, VectorWidth, MaxVectorWidth>,
                            detail::RowOrthoVW<BlockDim, BlockK, DataT, 1, MaxVectorWidth>>
         {
@@ -329,11 +330,11 @@ namespace rocwmma
                 // elements in both in row_major or col_major data layouts.
                 // This layout cannot support for VW > 1 in row_major data layout otherwise the
                 // ordering is broken.
-                static_assert(!(std::is_same_v<DataLayout, row_major> && VectorWidth > 1),
+                static_assert(!(is_same_v<DataLayout, row_major> && VectorWidth > 1),
                               "RowNT in row_major does not support VectorWidth > 1");
 
                 // Must ensure that MaxVectorWidth fits inside the leading dimension
-                static_assert(std::is_same_v<DataLayout, col_major> && (MaxVectorWidth <= BlockK),
+                static_assert(!(is_same_v<DataLayout, col_major> && (MaxVectorWidth > BlockK)),
                     "MaxVectorWidth is larger than BlockK dimension. Try reducing MaxVectorWidth");
             };
         };
@@ -498,8 +499,8 @@ namespace rocwmma
                   typename DataLayout,
                   uint32_t VectorWidth,
                   uint32_t MaxVectorWidth = VectorWidth>
-        struct Col : public std::conditional_t<
-                         std::is_same<DataLayout, col_major>::value,
+        struct Col : public conditional_t<
+                         is_same<DataLayout, col_major>::value,
                          detail::ColInlineVW<BlockDim, BlockK, DataT, VectorWidth, MaxVectorWidth>,
                          detail::ColOrthoVW<BlockDim, BlockK, DataT, VectorWidth, MaxVectorWidth>>
         {
@@ -664,8 +665,8 @@ namespace rocwmma
                   typename DataLayout,
                   uint32_t VectorWidth,
                   uint32_t MaxVectorWidth = VectorWidth>
-        struct Row : public std::conditional_t<
-                         std::is_same<DataLayout, row_major>::value,
+        struct Row : public conditional_t<
+                         is_same<DataLayout, row_major>::value,
                          detail::RowInlineVW<BlockDim, BlockK, DataT, VectorWidth, MaxVectorWidth>,
                          detail::RowOrthoVW<BlockDim, BlockK, DataT, VectorWidth, MaxVectorWidth>>
         {
diff --git a/library/include/rocwmma/internal/layout_impl.hpp b/library/include/rocwmma/internal/layout_impl.hpp
index 7dd08439..0047d394 100644
--- a/library/include/rocwmma/internal/layout_impl.hpp
+++ b/library/include/rocwmma/internal/layout_impl.hpp
@@ -59,12 +59,12 @@ namespace rocwmma
             /// Helper to ensure layout types are consistent (same)
             ///
             template <typename LhsDataLayout, typename RhsDataLayout>
-            struct ConsistencyCheck : public std::false_type
+            struct ConsistencyCheck : public false_type
             {
             };
 
             template <typename DataLayout>
-            struct ConsistencyCheck<DataLayout, DataLayout> : public std::true_type
+            struct ConsistencyCheck<DataLayout, DataLayout> : public true_type
             {
             };
 
@@ -72,12 +72,12 @@ namespace rocwmma
             /// Helper to check if layout types are orthogonal
             ///
             template <typename LhsDataLayout, typename RhsDataLayout>
-            struct OrthogonalCheck : public std::true_type
+            struct OrthogonalCheck : public true_type
             {
             };
 
             template <typename DataLayout>
-            struct OrthogonalCheck<DataLayout, DataLayout> : public std::false_type
+            struct OrthogonalCheck<DataLayout, DataLayout> : public false_type
             {
             };
 
@@ -208,13 +208,13 @@ namespace rocwmma
             /// Check for consistency in element ordering between two layouts
             ///
             template <typename LhsMatrixLayout, typename RhsMatrixLayout>
-            struct ConsistencyCheck : public std::false_type
+            struct ConsistencyCheck : public false_type
             {
             };
 
             // Same type is compatible
             template <typename MatrixLayout>
-            struct ConsistencyCheck<MatrixLayout, MatrixLayout> : public std::true_type
+            struct ConsistencyCheck<MatrixLayout, MatrixLayout> : public true_type
             {
             };
 
@@ -229,7 +229,7 @@ namespace rocwmma
                 MatrixLayout::ColNT<BlockDim, BlockK, DataT, col_major, 1, MaxVectorWidth>,
                 MatrixLayout::
                     ColNT<BlockDim, BlockK, DataT, row_major, RhsVectorWidth, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -242,7 +242,7 @@ namespace rocwmma
                 MatrixLayout::
                     ColNT<BlockDim, BlockK, DataT, row_major, LhsVectorWidth, MaxVectorWidth>,
                 MatrixLayout::ColNT<BlockDim, BlockK, DataT, col_major, 1, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -255,7 +255,7 @@ namespace rocwmma
                 MatrixLayout::
                     RowNT<BlockDim, BlockK, DataT, col_major, LhsVectorWidth, MaxVectorWidth>,
                 MatrixLayout::RowNT<BlockDim, BlockK, DataT, row_major, 1, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -268,7 +268,7 @@ namespace rocwmma
                 MatrixLayout::RowNT<BlockDim, BlockK, DataT, row_major, 1, MaxVectorWidth>,
                 MatrixLayout::
                     RowNT<BlockDim, BlockK, DataT, col_major, RhsVectorWidth, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -286,7 +286,7 @@ namespace rocwmma
                     Col<BlockDim, BlockK, DataT, DataLayout, LhsVectorWidth, MaxVectorWidth>,
                 MatrixLayout::
                     Col<BlockDim, BlockK, DataT, DataLayout, RhsVectorWidth, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -302,7 +302,7 @@ namespace rocwmma
                     Row<BlockDim, BlockK, DataT, DataLayout, LhsVectorWidth, MaxVectorWidth>,
                 MatrixLayout::
                     Row<BlockDim, BlockK, DataT, DataLayout, RhsVectorWidth, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -311,13 +311,13 @@ namespace rocwmma
             ///
 
             template <typename LhsMatrixLayout, typename RhsMatrixLayout>
-            struct OrthogonalCheck : public std::false_type
+            struct OrthogonalCheck : public false_type
             {
             };
 
             // Same type is not orthogonal
             template <typename MatrixLayout>
-            struct OrthogonalCheck<MatrixLayout, MatrixLayout> : public std::false_type
+            struct OrthogonalCheck<MatrixLayout, MatrixLayout> : public false_type
             {
             };
 
@@ -325,7 +325,7 @@ namespace rocwmma
             struct OrthogonalCheck<
                 MatrixLayout::ColNT<BlockDim, BlockK, DataT, col_major, 1, MaxVectorWidth>,
                 MatrixLayout::RowNT<BlockDim, BlockK, DataT, row_major, 1, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -340,7 +340,7 @@ namespace rocwmma
                     ColNT<BlockDim, BlockK, DataT, row_major, LhsVectorWidth, MaxVectorWidth>,
                 MatrixLayout::
                     RowNT<BlockDim, BlockK, DataT, col_major, RhsVectorWidth, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -348,7 +348,7 @@ namespace rocwmma
             struct OrthogonalCheck<
                 MatrixLayout::RowNT<BlockDim, BlockK, DataT, row_major, 1, MaxVectorWidth>,
                 MatrixLayout::ColNT<BlockDim, BlockK, DataT, col_major, 1, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -363,7 +363,7 @@ namespace rocwmma
                     RowNT<BlockDim, BlockK, DataT, col_major, LhsVectorWidth, MaxVectorWidth>,
                 MatrixLayout::
                     ColNT<BlockDim, BlockK, DataT, row_major, RhsVectorWidth, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -382,7 +382,7 @@ namespace rocwmma
                                   DataT,
                                   typename DataLayout::template OrthogonalLayout_t<LhsDataLayout>,
                                   RhsVectorWidth,
-                                  MaxVectorWidth>> : public std::true_type
+                                  MaxVectorWidth>> : public true_type
             {
             };
 
@@ -401,7 +401,7 @@ namespace rocwmma
                                   DataT,
                                   typename DataLayout::template OrthogonalLayout_t<LhsDataLayout>,
                                   RhsVectorWidth,
-                                  MaxVectorWidth>> : public std::true_type
+                                  MaxVectorWidth>> : public true_type
             {
             };
 
@@ -416,7 +416,7 @@ namespace rocwmma
                     Col<BlockDim, BlockK, DataT, col_major, LhsVectorWidth, MaxVectorWidth>,
                 MatrixLayout::
                     Row<BlockDim, BlockK, DataT, row_major, RhsVectorWidth, MaxVectorWidth>>
-                : public std::true_type
+                : public true_type
             {
             };
 
@@ -505,7 +505,7 @@ namespace rocwmma
                         WaveSize = IOTraits::ThreadsPerIO,
 
                         // Number of BlockDim columns gathered per cycle of MaxVW
-                        MaxKPerIO = WaveSize * MaxVectorWidth / std::min(BlockDim, WaveSize),
+                        MaxKPerIO = WaveSize * MaxVectorWidth / min(BlockDim, WaveSize),
 
                         BlockDimStride_X = WaveSize,
                         BlockDimStride_Y = 0u,
@@ -520,7 +520,7 @@ namespace rocwmma
                         LargeDim = BlockDim >= WaveSize,
 
                         // Number of segments in BlockDim direction
-                        BlockDimSegs = std::max(BlockDim / BlockDimStride_X, 1u),
+                        BlockDimSegs = max(BlockDim / BlockDimStride_X, 1u),
 
                         // Number of segments in the BlockK direction
                         BlockKSegs = BlockK / BlockKStride_Y,
@@ -742,7 +742,7 @@ namespace rocwmma
                         MaxElementsPerIO = WaveSize * MaxVectorWidth,
 
                         // Number of BlockDim columns gathered per cycle of MaxVW
-                        MaxKPerIO = std::max(1u, MaxElementsPerIO / BlockDim),
+                        MaxKPerIO = max(1u, MaxElementsPerIO / BlockDim),
 
                         VWStride_X = VectorWidth,
                         VWStride_Y = 0u,
@@ -757,13 +757,13 @@ namespace rocwmma
                         LargeDim = BlockDim >= MaxElementsPerIO,
 
                         // Number of segments in BlockDim direction
-                        BlockDimSegs = std::max(1u, BlockDim / BlockDimStride_X),
+                        BlockDimSegs = max(1u, BlockDim / BlockDimStride_X),
 
                         // Number of segments in the BlockK direction
-                        BlockKSegs = std::max(1u, BlockK / BlockKStride_Y),
+                        BlockKSegs = max(1u, BlockK / BlockKStride_Y),
 
                         // Number of segments in the MaxVW direction
-                        VWSegs = std::max(1u, MaxVectorWidth / VWStride_X),
+                        VWSegs = max(1u, MaxVectorWidth / VWStride_X),
 
                         // Log2 Values
                         Log2BlockDim         = Log2<BlockDim>::value,
@@ -933,8 +933,7 @@ namespace rocwmma
                 ROCWMMA_DEVICE constexpr static inline auto strides()
                 {
                     auto t = Traits::OrthoLayout::strides();
-                    return make_vector(
-                        swap(std::get<0>(t)), swap(std::get<1>(t)), swap(std::get<2>(t)));
+                    return make_vector(swap(get<0>(t)), swap(get<1>(t)), swap(get<2>(t)));
                 }
 
                 ROCWMMA_DEVICE static inline typename Traits::MatrixCoordT
@@ -979,8 +978,7 @@ namespace rocwmma
                 ROCWMMA_DEVICE constexpr static inline auto strides()
                 {
                     auto t = Traits::OrthoLayout::strides();
-                    return make_vector(
-                        swap(std::get<0>(t)), swap(std::get<1>(t)), swap(std::get<2>(t)));
+                    return make_vector(swap(get<0>(t)), swap(get<1>(t)), swap(get<2>(t)));
                 }
 
                 ROCWMMA_DEVICE static inline typename Traits::MatrixCoordT
diff --git a/library/include/rocwmma/internal/mapping_util.hpp b/library/include/rocwmma/internal/mapping_util.hpp
index 8719d31f..b98a201e 100644
--- a/library/include/rocwmma/internal/mapping_util.hpp
+++ b/library/include/rocwmma/internal/mapping_util.hpp
@@ -27,7 +27,7 @@
 #define ROCWMMA_MAPPING_UTIL_HPP
 
 #include "types.hpp"
-
+#include "utility/type_traits.hpp"
 namespace rocwmma
 {
     // Fwd declaration
@@ -59,11 +59,11 @@ namespace rocwmma
 
             // Size of workgroup, normalized to wave count.
             template <bool IsConst                        = (TBlockX > 0u && TBlockY > 0u),
-                      typename std::enable_if_t<IsConst>* = nullptr>
+                      enable_if_t<IsConst>* = nullptr>
             ROCWMMA_DEVICE constexpr static inline WorkgroupDimT workgroupDim();
 
             template <bool IsConst                         = (TBlockX > 0u && TBlockY > 0u),
-                      typename std::enable_if_t<!IsConst>* = nullptr>
+                      enable_if_t<!IsConst>* = nullptr>
             ROCWMMA_DEVICE static inline WorkgroupDimT workgroupDim();
         };
 
@@ -93,8 +93,8 @@ namespace rocwmma
 
             enum : uint32_t
             {
-                MajorIndex = std::is_same<DataOrientation, row_major>::value ? 0 : 1,
-                MinorIndex = std::is_same<DataOrientation, row_major>::value ? 1 : 0
+                MajorIndex = is_same<DataOrientation, row_major>::value ? 0 : 1,
+                MinorIndex = is_same<DataOrientation, row_major>::value ? 1 : 0
             };
 
             // Determine the leading dimension of a matrix.
diff --git a/library/include/rocwmma/internal/mapping_util_impl.hpp b/library/include/rocwmma/internal/mapping_util_impl.hpp
index e13e690a..106f755f 100644
--- a/library/include/rocwmma/internal/mapping_util_impl.hpp
+++ b/library/include/rocwmma/internal/mapping_util_impl.hpp
@@ -105,7 +105,7 @@ namespace rocwmma
 
         template <uint32_t TBlockX, uint32_t TBlockY>
         template <bool IsConst /* = (TBlockX > 0u && TBlockY > 0u) */,
-                  typename std::enable_if_t<IsConst>* /* = nullptr */>
+                  enable_if_t<IsConst>* /* = nullptr */>
         ROCWMMA_DEVICE constexpr inline auto WaveSpace<TBlockX, TBlockY>::workgroupDim()
             -> WorkgroupDimT
         {
@@ -114,7 +114,7 @@ namespace rocwmma
 
         template <uint32_t TBlockX, uint32_t TBlockY>
         template <bool IsConst /* = (TBlockX > 0u && TBlockY > 0u) */,
-                  typename std::enable_if_t<!IsConst>* /* = nullptr */>
+                  enable_if_t<!IsConst>* /* = nullptr */>
         ROCWMMA_DEVICE inline auto WaveSpace<TBlockX, TBlockY>::workgroupDim() -> WorkgroupDimT
         {
             return waveCount(make_coord2d(blockDim.x, blockDim.y));
@@ -251,7 +251,7 @@ namespace rocwmma
     ROCWMMA_DEVICE inline auto MappingUtil<BlockHeight, BlockWidth, DataT, DataLayout>::matrixCoord(
         BlockCoordT const& blockCoord) -> MatrixCoordT
     {
-        return MatrixSpace::fromBlockCoord(std::forward<BlockCoordT const>(blockCoord));
+        return MatrixSpace::fromBlockCoord(forward<BlockCoordT const>(blockCoord));
     }
 
     template <uint32_t BlockHeight, uint32_t BlockWidth, typename DataT, typename DataLayout>
@@ -259,7 +259,7 @@ namespace rocwmma
         MappingUtil<BlockHeight, BlockWidth, DataT, DataLayout>::dataOffset(
             MatrixCoordT const& matrixCoord, uint32_t ldm)
     {
-        return DataSpace::fromMatrixCoord(std::forward<MatrixCoordT const>(matrixCoord), ldm);
+        return DataSpace::fromMatrixCoord(forward<MatrixCoordT const>(matrixCoord), ldm);
     }
 
     template <uint32_t BlockHeight, uint32_t BlockWidth, typename DataT, typename DataLayout>
@@ -268,7 +268,7 @@ namespace rocwmma
             DataT const* baseAddr, MatrixCoordT const& matrixCoord, uint32_t ldm)
     {
         return baseAddr
-               + DataSpace::fromMatrixCoord(std::forward<MatrixCoordT const>(matrixCoord), ldm);
+               + DataSpace::fromMatrixCoord(forward<MatrixCoordT const>(matrixCoord), ldm);
     }
 
     template <uint32_t BlockHeight, uint32_t BlockWidth, typename DataT, typename DataLayout>
@@ -276,7 +276,7 @@ namespace rocwmma
         DataT* baseAddr, MatrixCoordT const& matrixCoord, uint32_t ldm)
     {
         return baseAddr
-               + DataSpace::fromMatrixCoord(std::forward<MatrixCoordT const>(matrixCoord), ldm);
+               + DataSpace::fromMatrixCoord(forward<MatrixCoordT const>(matrixCoord), ldm);
     }
 
 } // namespace rocwmma
diff --git a/library/include/rocwmma/internal/mfma.hpp b/library/include/rocwmma/internal/mfma.hpp
index 6747eef2..794bea81 100644
--- a/library/include/rocwmma/internal/mfma.hpp
+++ b/library/include/rocwmma/internal/mfma.hpp
@@ -52,7 +52,7 @@ namespace rocwmma
                 BlockM,
                 BlockN,
                 BlockK,
-                typename std::enable_if_t<ROCWMMA_ARCH_GFX9 && (BlockM == BlockN)>>
+                enable_if_t<ROCWMMA_ARCH_GFX9 && (BlockM == BlockN)>>
     {
         // Full-fragment IO traits
         using IOTraitsA   = IOTraits<BlockM, BlockK, InputT>;
@@ -90,10 +90,10 @@ namespace rocwmma
 
             // A / B  and C / D types must match
             static_assert(
-                std::is_same<typename VecTraitsA::DataT, typename VecTraitsB::DataT>::value,
+                is_same<typename VecTraitsA::DataT, typename VecTraitsB::DataT>::value,
                 "A and B registers must be of same type");
             static_assert(
-                std::is_same<typename VecTraitsC::DataT, typename VecTraitsD::DataT>::value,
+                is_same<typename VecTraitsC::DataT, typename VecTraitsD::DataT>::value,
                 "C and D registers must be of same type");
 
             // Full fragment counts must match packed IO counts
diff --git a/library/include/rocwmma/internal/opaque_load.hpp b/library/include/rocwmma/internal/opaque_load.hpp
index 1bdd47cd..c14e1978 100644
--- a/library/include/rocwmma/internal/opaque_load.hpp
+++ b/library/include/rocwmma/internal/opaque_load.hpp
@@ -78,7 +78,7 @@ namespace rocwmma
 
         // Outer loop = index 0,
         // Inner loop = index N-1
-        template <std::size_t Depth = 0,
+        template <size_t Depth = 0,
                   typename Iterator,
                   typename StrideCounts,
                   typename Strides2d>
@@ -92,7 +92,7 @@ namespace rocwmma
             auto strideCount  = get<Depth>(strideCounts);
 
             // Last depth layer will invoke the load
-            if constexpr(Depth == (VecTraits<std::decay_t<StrideCounts>>::size() - 1u))
+            if constexpr(Depth == (VecTraits<decay_t<StrideCounts>>::size() - 1u))
             {
 #pragma unroll
                 for(int i = 0; i < strideCount; i++)
diff --git a/library/include/rocwmma/internal/opaque_store.hpp b/library/include/rocwmma/internal/opaque_store.hpp
index 7c89b12d..1f1f9990 100644
--- a/library/include/rocwmma/internal/opaque_store.hpp
+++ b/library/include/rocwmma/internal/opaque_store.hpp
@@ -73,7 +73,7 @@ namespace rocwmma
 
         using StoreVecTraits = VecTraits<typename Traits::StoreT>;
 
-        template <std::size_t Depth = 0,
+        template <size_t Depth = 0,
                   typename Iterator,
                   typename StrideCounts,
                   typename Strides2d>
@@ -87,7 +87,7 @@ namespace rocwmma
             auto strideCount  = get<Depth>(strideCounts);
 
             // Last depth layer will invoke the load
-            if constexpr(Depth == (VecTraits<std::decay_t<StrideCounts>>::size() - 1u))
+            if constexpr(Depth == (VecTraits<decay_t<StrideCounts>>::size() - 1u))
             {
 #pragma unroll
                 for(int i = 0; i < strideCount; i++)
diff --git a/library/include/rocwmma/internal/pack_util_impl.hpp b/library/include/rocwmma/internal/pack_util_impl.hpp
index dac20823..22866437 100644
--- a/library/include/rocwmma/internal/pack_util_impl.hpp
+++ b/library/include/rocwmma/internal/pack_util_impl.hpp
@@ -29,6 +29,7 @@
 #include "pack_util.hpp"
 #include "types.hpp"
 #include "utils.hpp"
+#include "vector_util.hpp"
 
 namespace rocwmma
 {
@@ -104,6 +105,18 @@ namespace rocwmma
         using PackedT   = int32_t;
     };
 
+    template <>
+    struct PackTraits<int64_t>
+    {
+        enum : uint32_t
+        {
+            PackRatio = 1 // No pack
+        };
+
+        using UnpackedT = int64_t;
+        using PackedT   = int64_t;
+    };
+
     template <>
     struct PackTraits<float8_t>
     {
@@ -251,12 +264,13 @@ namespace rocwmma
     ROCWMMA_DEVICE /*static*/ inline auto&
         PackUtil<DataT>::packHelper(VecT<UnpackedT, VecSize> const& v)
     {
-        static_assert(VecSize % Traits::PackRatio == 0, "Use paddedPack32 instead.");
+        static_assert(VecSize % Traits::PackRatio == 0,
+                      "Cannot pack partial b32 vector. Use paddedPack instead.");
 
         // NOTE: Assumes that there is NO padding...
         using PackedVecT   = VecT<PackedT, VecSize / Traits::PackRatio>;
-        using UnpackedVecT = std::decay_t<decltype(v)>;
-        return *reinterpret_cast<PackedVecT*>(&(const_cast<UnpackedVecT&>(v)));
+        using UnpackedVecT = decay_t<decltype(v)>;
+        return reinterpret_cast<PackedVecT const&>(v);
     }
 
     template <typename DataT>
@@ -264,10 +278,16 @@ namespace rocwmma
     ROCWMMA_DEVICE /*static*/ inline auto&
         PackUtil<DataT>::unpackHelper(VecT<PackedT, VecSize> const& v)
     {
+        if constexpr(is_same_v<PackedT, UnpackedT>)
+        {
+            static_assert(Traits::PackRatio == 1, "Input vector must be packed");
+        }
+
         // NOTE: Assumes that there is NO padding...
-        using PackedVecT   = std::decay_t<decltype(v)>;
+        using PackedVecT   = decay_t<decltype(v)>;
         using UnpackedVecT = VecT<UnpackedT, VecSize * Traits::PackRatio>;
-        return *reinterpret_cast<UnpackedVecT*>(&(const_cast<PackedVecT&>(v)));
+
+        return reinterpret_cast<UnpackedVecT const&>(v);
     }
 
     template <typename DataT>
@@ -353,7 +373,8 @@ namespace rocwmma
         // Duplicate the inputs for padding
         else if constexpr((VecSize * 2u) == Traits::PackRatio)
         {
-            return packHelper(concat(v, v));
+            // Return by value: concat yields a temporary, so packHelper's reference would dangle
+            return VecT<PackedT, 1u>(packHelper(concat(v, v)));
         }
         // Pad single element data to b32
         else if constexpr(VecSize == 1u)
@@ -375,7 +396,7 @@ namespace rocwmma
         // Take lower half of vector
         else if constexpr((UnpaddedSize * 2u) == Traits::PackRatio)
         {
-            return extractLo(v);
+            return extractLo(unpackHelper(v));
         }
         // Pad single element data to b32
         else if constexpr(UnpaddedSize == 1u)
diff --git a/library/include/rocwmma/internal/permute.hpp b/library/include/rocwmma/internal/permute.hpp
index 765d7670..3e258259 100644
--- a/library/include/rocwmma/internal/permute.hpp
+++ b/library/include/rocwmma/internal/permute.hpp
@@ -56,9 +56,9 @@ namespace rocwmma
             static_assert((PermuteOp::opId() == CrossLaneOps::Properties::OP_ID_BLOCK_BCAST)
                               || (PermuteOp::opId() == CrossLaneOps::Properties::OP_ID_SHUFFLE)
                               || (PermuteOp::opId() == CrossLaneOps::Properties::OP_ID_GATHER)
-                              || (PermuteOp::opId() == CrossLaneOps::Properties::OP_ID_SCATTER),
+                              || (PermuteOp::opId() == CrossLaneOps::Properties::OP_ID_SCATTER)
+                              || (PermuteOp::opId() == CrossLaneOps::Properties::OP_ID_ROTATE),
                           "PermuteOp is unsupported");
-
             template <typename DataT>
             ROCWMMA_DEVICE static inline auto exec(DataT const& src)
             {
diff --git a/library/include/rocwmma/internal/rocwmma_hip_f8_impl.h b/library/include/rocwmma/internal/rocwmma_hip_f8_impl.h
index bea6f035..4ef850a7 100644
--- a/library/include/rocwmma/internal/rocwmma_hip_f8_impl.h
+++ b/library/include/rocwmma/internal/rocwmma_hip_f8_impl.h
@@ -27,8 +27,12 @@
 #ifndef ROCWMMA_HIP_FP8_IMPL_H
 #define ROCWMMA_HIP_FP8_IMPL_H
 
+#include "utility/type_traits.hpp"
+
 namespace rocwmma_hip_f8_impl
 {
+    using rocwmma::is_same;
+    using rocwmma::conditional;
 
     ROCWMMA_HOST inline int clz(uint32_t x)
     {
@@ -42,8 +46,8 @@ namespace rocwmma_hip_f8_impl
     template <int wm, int we, typename T, bool negative_zero_nan, bool clip>
     ROCWMMA_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch, uint32_t rng)
     {
-        constexpr bool is_half  = std::is_same<T, _Float16>::value;
-        constexpr bool is_float = std::is_same<T, float>::value;
+        constexpr bool is_half  = is_same<T, _Float16>::value;
+        constexpr bool is_float = is_same<T, float>::value;
         static_assert(wm + we == 7, "wm+we==7");
         static_assert(is_half || is_float, "Only half and float can be cast to f8");
 
@@ -239,8 +243,8 @@ namespace rocwmma_hip_f8_impl
     template <int wm, int we, typename T, bool negative_zero_nan>
     ROCWMMA_HOST_DEVICE T cast_from_f8(uint8_t x)
     {
-        constexpr bool is_half  = std::is_same<T, _Float16>::value;
-        constexpr bool is_float = std::is_same<T, float>::value;
+        constexpr bool is_half  = is_same<T, _Float16>::value;
+        constexpr bool is_float = is_same<T, float>::value;
         static_assert(is_half || is_float, "only half and float are supported");
 
         constexpr int weo = is_half ? 5 : 8;
@@ -296,7 +300,7 @@ namespace rocwmma_hip_f8_impl
                 return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
             }
         }
-        typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
+        typename conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
         if(we == 5 && is_half && !negative_zero_nan)
         {
             retval = x << 8;
diff --git a/library/include/rocwmma/internal/rocwmma_xfloat32.hpp b/library/include/rocwmma/internal/rocwmma_xfloat32.hpp
index b5ad57c8..f47ac775 100644
--- a/library/include/rocwmma/internal/rocwmma_xfloat32.hpp
+++ b/library/include/rocwmma/internal/rocwmma_xfloat32.hpp
@@ -179,6 +179,7 @@ typedef struct
     float data;
 } rocwmma_xfloat32_public;
 
+#if !defined(__HIPCC_RTC__)
 static_assert(std::is_standard_layout<rocwmma_xfloat32>{},
               "rocwmma_xfloat32 is not a standard layout type, and thus is "
               "incompatible with C.");
@@ -187,7 +188,6 @@ static_assert(std::is_trivial<rocwmma_xfloat32>{},
               "rocwmma_xfloat32 is not a trivial type, and thus is "
               "incompatible with C.");
 
-#if !defined(__HIPCC_RTC__)
 static_assert(sizeof(rocwmma_xfloat32) == sizeof(rocwmma_xfloat32_public)
                   && offsetof(rocwmma_xfloat32, data) == offsetof(rocwmma_xfloat32_public, data),
               "internal rocwmma_xfloat32 does not match public rocwmma_xfloat32");
diff --git a/library/include/rocwmma/internal/transforms_impl.hpp b/library/include/rocwmma/internal/transforms_impl.hpp
index e2762410..f234b045 100644
--- a/library/include/rocwmma/internal/transforms_impl.hpp
+++ b/library/include/rocwmma/internal/transforms_impl.hpp
@@ -33,6 +33,7 @@
 #include "pack_util.hpp"
 #include "permute.hpp"
 #include "utils.hpp"
+#include "vector_util.hpp"
 
 namespace rocwmma
 {
@@ -148,6 +149,41 @@ namespace rocwmma
         return PackUtil::template paddedUnpack<VecSize>(concat(lo, hi));
     }
 
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline auto unpackLoHi16(VecT<DataT, VecSize> const& v)
+    {
+        static_assert(VecSize % 2 == 0, "VecSize must be a multiple of 2");
+        using PackUtil = PackUtil<DataT>;
+
+        auto lo     = PackUtil::paddedPack(extractEven(v));
+        auto hi     = PackUtil::paddedPack(extractOdd(v));
+        auto rot_lo = Swizzle::RotateR32<16>::exec(lo);
+        auto rot_hi = Swizzle::RotateR32<16>::exec(hi);
+        lo          = Blend::Zip16::exec(lo, rot_hi);
+        hi          = Blend::Zip16::exec(rot_lo, hi);
+
+        return PackUtil::template paddedUnpack<VecSize>(concat(lo, hi));
+    }
+
+    // TODO: Wave64 only?
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline auto unpackLoHi32(VecT<DataT, VecSize> const& v)
+    {
+        static_assert(VecSize % 2 == 0, "VecSize must be a multiple of 2");
+        using PackUtil = PackUtil<DataT>;
+
+        auto lo = PackUtil::paddedPack(extractEven(v));
+        auto hi = PackUtil::paddedPack(extractOdd(v));
+
+        // TODO: label as rotateR64 for consistency?
+        auto rot_lo = Permute::RotateWaveR<32>::exec(lo);
+        auto rot_hi = Permute::RotateWaveR<32>::exec(hi);
+        lo          = Blend::Zip32::exec(lo, rot_hi);
+        hi          = Blend::Zip32::exec(rot_lo, hi);
+
+        return PackUtil::template paddedUnpack<VecSize>(concat(lo, hi));
+    }
+
     template <typename DataT>
     ROCWMMA_DEVICE static inline auto aos_soa_16xk_b32(VecT<DataT, 8> const& v)
     {
@@ -194,8 +230,8 @@ namespace rocwmma
         // In order to save some operations, we can
         // rotate the odds components only and make up the
         // offset later in gather.
-        auto evens = PackUtil::paddedPack(extractEven(v));
-        auto odds  = PackUtil::paddedPack(extractOdd(v));
+        auto evens = PackUtil::paddedPack(extractEven(result));
+        auto odds  = PackUtil::paddedPack(extractOdd(result));
 
         auto rot = Swizzle::RotateR32<16>::exec(odds);
         auto lo  = Blend::Zip16::exec(evens, rot);
@@ -224,7 +260,32 @@ namespace rocwmma
     template <typename DataT>
     ROCWMMA_DEVICE static inline auto aos_soa_64xk_b32(VecT<DataT, 8> const& v)
     {
-        return 0;
+        using PackUtil = PackUtil<DataT>;
+
+        // Step 1 : Unpack groups of 8
+        auto result = unpackLoHi8(v);
+
+        // Step 2 : Unpack groups of 16
+        result = unpackLoHi16(result);
+
+        // Step 3 : Unpack groups of 32
+        // In order to save some operations, we can
+        // rotate the odds components only and make up the
+        // offset later in gather.
+        auto lo = PackUtil::paddedPack(extractEven(result));
+        auto hi = PackUtil::paddedPack(extractOdd(result));
+
+        // TODO: label as rotateR64 for consistency?
+        auto rot_hi = Permute::RotateWaveR<32>::exec(hi);
+        hi          = Blend::Zip32::exec(rot_hi, lo);
+        lo          = Blend::Zip32::exec(lo, rot_hi);
+
+        // Step 4 : Gather
+        // Note the offset of 32 in hi
+        lo = Permute::GatherWave<8, 0>::exec(lo);
+        hi = Permute::GatherWave<8, 32>::exec(hi);
+
+        return PackUtil::template paddedUnpack<8>(concat(lo, hi));
     }
 
     template <typename DataT>
@@ -275,6 +336,197 @@ namespace rocwmma
         return 0;
     }
 
+    template <uint32_t BlockDim, uint32_t VectorWidth>
+    struct AosToSoa;
+
+    template <>
+    struct AosToSoa<16, 8>
+    {
+        constexpr static uint32_t VW      = 8;
+        constexpr static uint32_t VecSize = 8;
+
+        template <typename DataT>
+        ROCWMMA_DEVICE constexpr static inline auto exec(VecT<DataT, VecSize> const& v)
+        {
+            using PackUtil = PackUtil<DataT>;
+
+            // Step 1 : Unpack groups of 2
+            auto result = unpackLoHi2(v);
+
+            // Step 2 : Unpack groups of 4
+            result = unpackLoHi4(result);
+
+            // Step 3 : Unpack groups of 8
+            result = unpackLoHi8(result);
+
+            // Step 4 : Gather
+            return PackUtil::template paddedUnpack<VecSize>(
+                Permute::Gather16<VW, 0>::exec(PackUtil::paddedPack(result)));
+        }
+    };
+
+    template <>
+    struct AosToSoa<32, 8>
+    {
+        constexpr static uint32_t VW      = 8;
+        constexpr static uint32_t VecSize = 8;
+
+        template <typename DataT>
+        ROCWMMA_DEVICE constexpr static inline auto exec(VecT<DataT, VecSize> const& v)
+        {
+            using PackUtil = PackUtil<DataT>;
+
+            // Step 1 : Unpack groups of 4
+            auto result = unpackLoHi4(v);
+
+            // Step 2 : Unpack groups of 8
+            result = unpackLoHi8(result);
+
+            // Step 3 : Unpack groups of 16
+            // In order to save some operations, we can
+            // rotate the odds components only and make up the
+            // offset later in gather.
+            auto evens = PackUtil::paddedPack(extractEven(result));
+            auto odds  = PackUtil::paddedPack(extractOdd(result));
+
+            auto rot = Swizzle::RotateR32<16>::exec(odds);
+            auto lo  = Blend::Zip16::exec(evens, rot);
+            auto hi  = Blend::Zip16::exec(rot, evens);
+
+            // Step 4 : Gather
+            // Note the offset of 16 in hi
+            lo = Permute::Gather32<VW, 0>::exec(lo);
+            hi = Permute::Gather32<VW, 16>::exec(hi);
+
+            return PackUtil::template paddedUnpack<VecSize>(concat(lo, hi));
+        }
+    };
+
+    template <>
+    struct AosToSoa<64, 8>
+    {
+        constexpr static uint32_t VW      = 8;
+        constexpr static uint32_t VecSize = 8;
+
+        template <typename DataT>
+        ROCWMMA_DEVICE constexpr static inline auto exec(VecT<DataT, VecSize> const& v)
+        {
+            using PackUtil = PackUtil<DataT>;
+
+            // Step 1 : Unpack groups of 8
+            auto result = unpackLoHi8(v);
+
+            // Step 2 : Unpack groups of 16
+            result = unpackLoHi16(result);
+
+            // Step 3 : Unpack groups of 32
+            // In order to save some operations, we can
+            // rotate the odds components only and make up the
+            // offset later in gather.
+            auto lo = PackUtil::paddedPack(extractEven(result));
+            auto hi = PackUtil::paddedPack(extractOdd(result));
+
+            // TODO: label as rotateR64 for consistency?
+            auto rot_hi = Permute::RotateWaveR<32>::exec(hi);
+            hi          = Blend::Zip32::exec(rot_hi, lo);
+            lo          = Blend::Zip32::exec(lo, rot_hi);
+
+            // Step 4 : Gather
+            // Note the offset of 32 in hi
+            lo = Permute::GatherWave<VW, 0>::exec(lo);
+            hi = Permute::GatherWave<VW, 32>::exec(hi);
+
+            return PackUtil::template paddedUnpack<VecSize>(concat(lo, hi));
+        }
+    };
+
+    template <>
+    struct AosToSoa<128, 8>
+    {
+        constexpr static uint32_t VW      = 8;
+        constexpr static uint32_t VecSize = 16;
+
+        template <typename DataT>
+        ROCWMMA_DEVICE constexpr static inline auto exec(VecT<DataT, VecSize> const& v)
+        {
+            using PackUtil = PackUtil<DataT>;
+
+            // Data comes in as AOS format:
+            // There are TWO sets of VW = 8 registers (because in this case BlockDim / 64 = 2):
+            // 1. Vecs 0-7
+            // 2. Vecs 8-15
+            //
+            // Register/ |          VW = 8                 |
+            //     Tidx  |___0___|___1___|___...___|___7___|
+            //         0 |   0   |   1   |   ...   |   7   |
+            //         1 |   8   |   9   |   ...   |   15  |
+            //       ... |   ... |   ... |   ...   |  ...  |
+            //        63 |__504__|__505__|___...___|__511__|
+            //
+            // Register/ |          VW = 8                 |
+            //     Tidx  |___8___|___9___|___...___|___15__|
+            //         0 |  512  |  513  |   ...   |  519  |
+            //         1 |  520  |  521  |   ...   |  527  |
+            //       ... |   ... |   ... |   ...   |  ...  |
+            //        63 |__1016_|__1017_|___...___|__1023_|
+
+            // For each batch of VW registers
+            auto v0 = extractLo(v);
+            auto v1 = extractHi(v);
+
+            // Step 1 : Unpack groups of 8
+            auto r0 = unpackLoHi8(v0);
+            auto r1 = unpackLoHi8(v1);
+
+            // Step 2 : isolate data for upper 64 dim from lower 64 dim
+            v0 = concat(extractLo(r0), extractLo(r1));
+            v1 = concat(extractHi(r0), extractHi(r1));
+
+            // Continue from here as if v0 and v1 are independent 64 dim.
+
+            // Step 3 : Unpack groups of 16
+            v0 = unpackLoHi16(v0);
+            v1 = unpackLoHi16(v1);
+
+            // Step 4 : Unpack groups of 32
+            // In order to save some operations, we can
+            // rotate the odds components only and make up the
+            // offset later in gather.
+            auto lo0 = PackUtil::paddedPack(extractEven(v0));
+            auto hi0 = PackUtil::paddedPack(extractOdd(v0));
+
+            auto lo1 = PackUtil::paddedPack(extractEven(v1));
+            auto hi1 = PackUtil::paddedPack(extractOdd(v1));
+
+            // TODO: label as rotateR64 for consistency?
+            auto rot_hi0 = Permute::RotateWaveR<32>::exec(hi0);
+            hi0          = Blend::Zip32::exec(rot_hi0, lo0);
+            lo0          = Blend::Zip32::exec(lo0, rot_hi0);
+
+            auto rot_hi1 = Permute::RotateWaveR<32>::exec(hi1);
+            hi1          = Blend::Zip32::exec(rot_hi1, lo1);
+            lo1          = Blend::Zip32::exec(lo1, rot_hi1);
+
+            // Step 5 : Gather
+            // Note the offset of 32 in hi
+            lo0 = Permute::GatherWave<VW, 0>::exec(lo0);
+            hi0 = Permute::GatherWave<VW, 32>::exec(hi0);
+
+            lo1 = Permute::GatherWave<VW, 0>::exec(lo1);
+            hi1 = Permute::GatherWave<VW, 32>::exec(hi1);
+
+            // Step 6 : Unpack and re-order.
+            auto c0 = PackUtil::template paddedUnpack<VecSize>(concat(lo0, hi0));
+            //c0      = reorderEvenOdd(c0);
+            c0      = concat(extractEven(c0), extractOdd(c0));
+            auto c1 = PackUtil::template paddedUnpack<VecSize>(concat(lo1, hi1));
+            //c1      = reorderEvenOdd(c1);
+            c1 = concat(extractEven(c1), extractOdd(c1));
+
+            return concat(c0, c1);
+        }
+    };
+
     // SOA -> AOS
     // Transform from ortho VW to inline VW
     template <typename DataT>
diff --git a/library/include/rocwmma/internal/tuple.hpp b/library/include/rocwmma/internal/tuple.hpp
index ca9c1c48..54fafc3b 100644
--- a/library/include/rocwmma/internal/tuple.hpp
+++ b/library/include/rocwmma/internal/tuple.hpp
@@ -33,13 +33,12 @@
 
 #endif // !defined(__HIPCC_RTC__)
 
+#include "utility/forward.hpp"
+#include "utility/sequence.hpp"
 #include "utils.hpp"
 
 namespace rocwmma
 {
-    using detail::index_sequence;
-    using detail::make_index_sequence;
-
     template <typename VecT, unsigned int Rank, typename U>
     ROCWMMA_HOST_DEVICE inline constexpr non_native_vector_base<VecT, Rank>
         operator+(non_native_vector_base<VecT, Rank> const& x, U y) noexcept
@@ -98,64 +97,63 @@ namespace rocwmma
 
     namespace detail
     {
-        template <typename VecT, std::size_t... Indices>
+        template <typename VecT, size_t... Indices>
         constexpr static auto copy_impl(VecT&& t, index_sequence<Indices...>&&)
         {
-            return make_vector(std::get<Indices>(std::forward<VecT>(t))...);
+            return make_vector(get<Indices>(forward<VecT>(t))...);
         }
     }
 
     template <typename VecT>
     constexpr static auto pop_right(VecT&& t)
     {
-        return detail::copy_impl(std::forward<VecT>(t),
-                                 make_index_sequence<VecTraits<std::decay_t<VecT>>::size() - 1>{});
+        return detail::copy_impl(forward<VecT>(t),
+                                 make_index_sequence<VecTraits<decay_t<VecT>>::size() - 1>{});
     }
 
     template <typename VecT>
     constexpr static auto pop_left(VecT&& t)
     {
         auto pop_front = [](auto front, auto... rest) { return make_vector(rest...); };
-        return apply(pop_front, std::forward<VecT>(t));
+        return apply(pop_front, forward<VecT>(t));
     }
 
     template <typename VecT>
     constexpr static decltype(auto) get_first(VecT&& t)
     {
-        return std::get<0>(std::forward<VecT>(t));
+        return get<0>(forward<VecT>(t));
     }
 
     template <typename VecT>
     constexpr static decltype(auto) get_last(VecT&& t)
     {
-        return std::get<VecTraits<std::decay_t<VecT>>::size() - 1u>(std::forward<VecT>(t));
+        return get<VecTraits<decay_t<VecT>>::size() - 1u>(forward<VecT>(t));
     }
 
     namespace detail
     {
-        template <typename VecT, std::size_t... Indices>
+        template <typename VecT, size_t... Indices>
         constexpr static decltype(auto) reverse_impl(VecT&& t, index_sequence<Indices...>)
         {
-            return make_vector(
-                std::get<sizeof...(Indices) - 1 - Indices>(std::forward<VecT>(t))...);
+            return make_vector(get<sizeof...(Indices) - 1 - Indices>(forward<VecT>(t))...);
         }
     }
 
     template <typename VecT>
     constexpr static decltype(auto) reverse(VecT&& t)
     {
-        return detail::reverse_impl(std::forward<VecT>(t),
-                                    make_index_sequence<VecTraits<std::decay_t<VecT>>::size()>{});
+        return detail::reverse_impl(forward<VecT>(t),
+                                    make_index_sequence<VecTraits<decay_t<VecT>>::size()>{});
     }
 
     namespace detail
     {
-        template <typename Vec0, typename Vec1, std::size_t... Indices>
+        template <typename Vec0, typename Vec1, size_t... Indices>
         constexpr static decltype(auto)
             flatten_coord_right_impl(Vec0&& coord, Vec1&& dims, index_sequence<Indices...>)
         {
-            static_assert(VecTraits<std::decay_t<Vec0>>::size() == sizeof...(Indices)
-                              && VecTraits<std::decay_t<Vec1>>::size() == sizeof...(Indices),
+            static_assert(VecTraits<decay_t<Vec0>>::size() == sizeof...(Indices)
+                              && VecTraits<decay_t<Vec1>>::size() == sizeof...(Indices),
                           "coord and dims vectors must be the same size");
 
             auto flatten = [](auto&& c, auto&& d, auto& mul) {
@@ -164,10 +162,10 @@ namespace rocwmma
                 return result;
             };
 
-            auto mult = typename VecTraits<std::decay_t<Vec0>>::DataT{1};
-            return (flatten(std::get<Indices>(std::forward<Vec0>(coord)),
-                            std::get<Indices>(std::forward<Vec1>(dims)),
-                            std::forward<decltype(mult)&>(mult))
+            auto mult = typename VecTraits<decay_t<Vec0>>::DataT{1};
+            return (flatten(get<Indices>(forward<Vec0>(coord)),
+                            get<Indices>(forward<Vec1>(dims)),
+                            forward<decltype(mult)&>(mult))
                     + ...);
         }
     }
@@ -176,19 +174,19 @@ namespace rocwmma
     constexpr static decltype(auto) flatten_coord_right(Vec0&& coord, Vec1&& dims)
     {
         return detail::flatten_coord_right_impl(
-            std::forward<Vec0>(coord),
-            std::forward<Vec1>(dims),
-            make_index_sequence<VecTraits<std::decay_t<Vec0>>::size()>{});
+            forward<Vec0>(coord),
+            forward<Vec1>(dims),
+            make_index_sequence<VecTraits<decay_t<Vec0>>::size()>{});
     }
 
     namespace detail
     {
-        template <typename Vec0, typename Vec1, std::size_t... Indices>
+        template <typename Vec0, typename Vec1, size_t... Indices>
         constexpr static decltype(auto)
             flatten_coord_left_impl(Vec0&& coord, Vec1&& dims, index_sequence<Indices...>)
         {
-            static_assert(VecTraits<std::decay_t<Vec0>>::size() == sizeof...(Indices)
-                              && VecTraits<std::decay_t<Vec1>>::size() == sizeof...(Indices),
+            static_assert(VecTraits<decay_t<Vec0>>::size() == sizeof...(Indices)
+                              && VecTraits<decay_t<Vec1>>::size() == sizeof...(Indices),
                           "coord and dims vectors must be the same size");
 
             auto flatten = [](auto&& c, auto&& d, auto& mul) {
@@ -197,10 +195,10 @@ namespace rocwmma
                 return result;
             };
 
-            auto mult = typename VecTraits<std::decay_t<Vec0>>::DataT{1};
-            return (flatten(std::get<sizeof...(Indices) - 1 - Indices>(std::forward<Vec0>(coord)),
-                            std::get<sizeof...(Indices) - 1 - Indices>(std::forward<Vec1>(dims)),
-                            std::forward<decltype(mult)&>(mult))
+            auto mult = typename VecTraits<decay_t<Vec0>>::DataT{1};
+            return (flatten(get<sizeof...(Indices) - 1 - Indices>(forward<Vec0>(coord)),
+                            get<sizeof...(Indices) - 1 - Indices>(forward<Vec1>(dims)),
+                            forward<decltype(mult)&>(mult))
                     + ...);
         }
     }
@@ -209,14 +207,14 @@ namespace rocwmma
     constexpr static decltype(auto) flatten_coord_left(Vec0&& coord, Vec1&& dims)
     {
         return detail::flatten_coord_left_impl(
-            std::forward<Vec0>(coord),
-            std::forward<Vec1>(dims),
-            make_index_sequence<VecTraits<std::decay_t<Vec0>>::size()>{});
+            forward<Vec0>(coord),
+            forward<Vec1>(dims),
+            make_index_sequence<VecTraits<decay_t<Vec0>>::size()>{});
     }
 
     namespace detail
     {
-        template <typename Coord1d, typename VecT, std::size_t... Indices>
+        template <typename Coord1d, typename VecT, size_t... Indices>
         constexpr static inline decltype(auto)
             inflate_coord_right_impl(Coord1d&& flatCoord, VecT&& dims, index_sequence<Indices...>)
         {
@@ -226,10 +224,10 @@ namespace rocwmma
                 return result;
             };
 
-            auto div = std::decay_t<Coord1d>{1};
-            return make_vector(inflate(std::forward<Coord1d>(flatCoord),
-                                       std::get<Indices>(std::forward<VecT>(dims)),
-                                       std::forward<decltype(div)&>(div),
+            auto div = decay_t<Coord1d>{1};
+            return make_vector(inflate(forward<Coord1d>(flatCoord),
+                                       get<Indices>(forward<VecT>(dims)),
+                                       forward<decltype(div)&>(div),
                                        Indices == sizeof...(Indices) - 1)...);
         }
     }
@@ -238,14 +236,14 @@ namespace rocwmma
     constexpr static inline decltype(auto) inflate_coord_right(Coord1d&& flatCoord, VecT&& dims)
     {
         return detail::inflate_coord_right_impl(
-            std::forward<Coord1d>(flatCoord),
-            std::forward<VecT>(dims),
-            make_index_sequence<VecTraits<std::decay_t<VecT>>::size()>{});
+            forward<Coord1d>(flatCoord),
+            forward<VecT>(dims),
+            make_index_sequence<VecTraits<decay_t<VecT>>::size()>{});
     }
 
     namespace detail
     {
-        template <typename Coord1d, typename VecT, std::size_t... Indices>
+        template <typename Coord1d, typename VecT, size_t... Indices>
         constexpr static inline decltype(auto)
             inflate_coord_left_impl(Coord1d&& flatCoord, VecT&& dims, index_sequence<Indices...>)
         {
@@ -255,13 +253,12 @@ namespace rocwmma
                 return result;
             };
 
-            auto div = std::decay_t<Coord1d>{1};
-            return reverse(
-                make_vector(inflate(std::forward<Coord1d>(flatCoord),
-                                    std::get<VecTraits<std::decay_t<VecT>>::size() - 1 - Indices>(
-                                        std::forward<VecT>(dims)),
-                                    std::forward<decltype(div)&>(div),
-                                    Indices == sizeof...(Indices) - 1)...));
+            auto div = decay_t<Coord1d>{1};
+            return reverse(make_vector(
+                inflate(forward<Coord1d>(flatCoord),
+                        get<VecTraits<decay_t<VecT>>::size() - 1 - Indices>(forward<VecT>(dims)),
+                        forward<decltype(div)&>(div),
+                        Indices == sizeof...(Indices) - 1)...));
         }
     }
 
@@ -269,25 +266,25 @@ namespace rocwmma
     constexpr static inline decltype(auto) inflate_coord_left(Coord1d&& flatCoord, VecT&& dims)
     {
         return detail::inflate_coord_left_impl(
-            std::forward<Coord1d>(flatCoord),
-            std::forward<VecT>(dims),
-            make_index_sequence<VecTraits<std::decay_t<VecT>>::size()>{});
+            forward<Coord1d>(flatCoord),
+            forward<VecT>(dims),
+            make_index_sequence<VecTraits<decay_t<VecT>>::size()>{});
     }
 
     namespace detail
     {
-        template <typename Vec0, typename Vec1, std::size_t... Indices>
+        template <typename Vec0, typename Vec1, size_t... Indices>
         constexpr static inline decltype(auto)
             to_matrix_space_impl(Vec0&& strides, Vec1&& strideSpace, index_sequence<Indices...>)
         {
-            static_assert(VecTraits<std::decay_t<Vec0>>::size() == sizeof...(Indices)
-                              && VecTraits<std::decay_t<Vec1>>::size() == sizeof...(Indices),
+            static_assert(VecTraits<decay_t<Vec0>>::size() == sizeof...(Indices)
+                              && VecTraits<decay_t<Vec1>>::size() == sizeof...(Indices),
                           "strides and strideSpace vectors must be the same size");
 
             auto inflate = [](auto&& stride, auto&& dim) { return stride * dim; };
 
-            return (inflate(std::get<Indices>(std::forward<Vec0>(strides)),
-                            std::get<Indices>(std::forward<Vec1>(strideSpace)))
+            return (inflate(get<Indices>(forward<Vec0>(strides)),
+                            get<Indices>(forward<Vec1>(strideSpace)))
                     + ...);
         }
     }
@@ -296,25 +293,25 @@ namespace rocwmma
     constexpr static inline decltype(auto) to_matrix_space(Vec0&& strides, Vec1&& strideSpace)
     {
         return detail::to_matrix_space_impl(
-            std::forward<Vec0>(strides),
-            std::forward<Vec1>(strideSpace),
-            make_index_sequence<VecTraits<std::decay_t<Vec0>>::size()>{});
+            forward<Vec0>(strides),
+            forward<Vec1>(strideSpace),
+            make_index_sequence<VecTraits<decay_t<Vec0>>::size()>{});
     }
 
 #if !defined(__HIPCC_RTC__)
 
     template <class T, size_t... I>
-    auto& print(std::ostream& os, T&& t, std::index_sequence<I...>&&)
+    auto& print(std::ostream& os, T&& t, index_sequence<I...>&&)
     {
         os << "(";
-        (..., (os << (I == 0 ? "" : ", ") << std::get<I>(std::forward<T>(t))));
+        (..., (os << (I == 0 ? "" : ", ") << get<I>(forward<T>(t))));
         return os << ")\n";
     }
 
     template <class... ArgsT>
     auto& print(std::ostream& os, std::tuple<ArgsT...> const& t)
     {
-        return print(os, t, std::make_index_sequence<sizeof...(ArgsT)>());
+        return print(os, t, make_index_sequence<sizeof...(ArgsT)>());
     }
 
 #endif // !defined(__HIPCC_RTC__)
diff --git a/library/include/rocwmma/internal/type_traits.hpp b/library/include/rocwmma/internal/type_traits.hpp
index 8f1e7f08..404cea52 100644
--- a/library/include/rocwmma/internal/type_traits.hpp
+++ b/library/include/rocwmma/internal/type_traits.hpp
@@ -124,356 +124,22 @@ namespace rocwmma
     } // namespace detail
 } // namespace rocwmma
 
-///////////////////////////////////////////////////////////
-/////////////  std replacements for hipRTC  ///////////////
-///////////////////////////////////////////////////////////
-#if defined(__HIPCC_RTC__)
-namespace std
-{
-    template <typename T>
-    class numeric_limits
-    {
-    public:
-        ROCWMMA_HOST_DEVICE static constexpr T min() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T lowest() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T max() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T epsilon() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T round_error() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T infinity() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T quiet_NaN() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T signaling_NaN() noexcept;
-        ROCWMMA_HOST_DEVICE static constexpr T denorm_min() noexcept;
-    };
-
-    template <bool B, class T = void>
-    using enable_if_t = typename enable_if<B, T>::type;
-
-    template <bool B, class T, class F>
-    struct conditional
-    {
-    };
-
-    template <class T, class F>
-    struct conditional<true, T, F>
-    {
-        using type = T;
-    };
-
-    template <class T, class F>
-    struct conditional<false, T, F>
-    {
-        using type = F;
-    };
-
-    template <bool B, class T, class F>
-    using conditional_t = typename conditional<B, T, F>::type;
-
-    template <typename T>
-    ROCWMMA_HOST_DEVICE constexpr const T& max(const T& a, const T& b)
-    {
-        return (a < b) ? b : a;
-    }
-
-    template <typename T>
-    ROCWMMA_HOST_DEVICE constexpr const T& min(const T& a, const T& b)
-    {
-        return (a < b) ? a : b;
-    }
-
-    // Meta programming helper types.
-
-    template <bool, typename, typename>
-    struct conditional;
-
-    template <typename...>
-    struct __or_;
-
-    template <>
-    struct __or_<> : public false_type
-    {
-    };
-
-    template <typename _B1>
-    struct __or_<_B1> : public _B1
-    {
-    };
-
-    template <typename _B1, typename _B2>
-    struct __or_<_B1, _B2> : public conditional<_B1::value, _B1, _B2>::type
-    {
-    };
-
-    template <typename _B1, typename _B2, typename _B3, typename... _Bn>
-    struct __or_<_B1, _B2, _B3, _Bn...>
-        : public conditional<_B1::value, _B1, __or_<_B2, _B3, _Bn...>>::type
-    {
-    };
-
-    template <typename...>
-    struct __and_;
-
-    template <>
-    struct __and_<> : public true_type
-    {
-    };
-
-    template <typename _B1>
-    struct __and_<_B1> : public _B1
-    {
-    };
-
-    template <typename _B1, typename _B2>
-    struct __and_<_B1, _B2> : public conditional<_B1::value, _B2, _B1>::type
-    {
-    };
-
-    template <typename _B1, typename _B2, typename _B3, typename... _Bn>
-    struct __and_<_B1, _B2, _B3, _Bn...>
-        : public conditional<_B1::value, __and_<_B2, _B3, _Bn...>, _B1>::type
-    {
-    };
-
-    template <bool __v>
-    using __bool_constant = integral_constant<bool, __v>;
-
-    template <typename _Pp>
-    struct __not_ : public __bool_constant<!bool(_Pp::value)>
-    {
-    };
-
-    // remove_reference
-    template <typename T>
-    struct remove_reference
-    {
-        typedef T type;
-    };
-
-    template <typename T>
-    struct remove_reference<T&>
-    {
-        typedef T type;
-    };
-
-    template <typename T>
-    struct remove_reference<T&&>
-    {
-        typedef T type;
-    };
-
-    // is_lvalue_reference
-    template <typename>
-    struct is_lvalue_reference : public false_type
-    {
-    };
-
-    template <typename T>
-    struct is_lvalue_reference<T&> : public true_type
-    {
-    };
-
-    // is_rvalue_reference
-    template <typename>
-    struct is_rvalue_reference : public false_type
-    {
-    };
-
-    template <typename T>
-    struct is_rvalue_reference<T&&> : public true_type
-    {
-    };
-
-    // lvalue forwarding
-    template <typename T>
-    constexpr T&& forward(typename remove_reference<T>::type& __t) noexcept
-    {
-        return static_cast<T&&>(__t);
-    }
-
-    // rvalue forwarding
-    template <typename T>
-    constexpr T&& forward(typename remove_reference<T>::type&& __t) noexcept
-    {
-        static_assert(!is_lvalue_reference<T>::value,
-                      "template argument"
-                      " substituting T is an lvalue reference type");
-        return static_cast<T&&>(__t);
-    }
-
-    // remove_const
-    template <typename T>
-    struct remove_const
-    {
-        typedef T type;
-    };
-
-    template <typename T>
-    struct remove_const<T const>
-    {
-        typedef T type;
-    };
-
-    // remove_volatile
-    template <typename T>
-    struct remove_volatile
-    {
-        typedef T type;
-    };
-
-    template <typename T>
-    struct remove_volatile<T volatile>
-    {
-        typedef T type;
-    };
-
-    // remove_cv
-    template <typename T>
-    struct remove_cv
-    {
-        typedef typename remove_const<typename remove_volatile<T>::type>::type type;
-    };
-
-    // remove_extent
-    template <typename T>
-    struct remove_extent
-    {
-        typedef T type;
-    };
-
-    template <typename T, std::size_t _Size>
-    struct remove_extent<T[_Size]>
-    {
-        typedef T type;
-    };
-
-    template <typename T>
-    struct remove_extent<T[]>
-    {
-        typedef T type;
-    };
-
-    // is_void
-    template <typename>
-    struct __is_void_helper : public false_type
-    {
-    };
-
-    template <>
-    struct __is_void_helper<void> : public true_type
-    {
-    };
-
-    template <typename T>
-    struct is_void : public __is_void_helper<typename remove_cv<T>::type>::type
-    {
-    };
-
-    // is_reference
-    template <typename T>
-    struct is_reference : public __or_<is_lvalue_reference<T>, is_rvalue_reference<T>>::type
-    {
-    };
-
-    // is_function
-    template <typename>
-    struct is_function : public false_type
-    {
-    };
-
-    // is_object
-    template <typename T>
-    struct is_object : public __not_<__or_<is_function<T>, is_reference<T>, is_void<T>>>::type
-    {
-    };
-
-    // __is_referenceable
-    template <typename T>
-    struct __is_referenceable : public __or_<is_object<T>, is_reference<T>>::type{};
-
-    // add_pointer
-    template <typename T, bool = __or_<__is_referenceable<T>, is_void<T>>::value>
-    struct __add_pointer_helper
-    {
-        typedef T type;
-    };
-
-    template <typename T>
-    struct __add_pointer_helper<T, true>
-    {
-        typedef typename remove_reference<T>::type* type;
-    };
-
-    template <typename T>
-    struct add_pointer : public __add_pointer_helper<T>
-    {
-    };
-
-    // is_array
-    template <typename>
-    struct is_array : public false_type
-    {
-    };
-
-    template <typename T, std::size_t _Size>
-    struct is_array<T[_Size]> : public true_type
-    {
-    };
-
-    template <typename T>
-    struct is_array<T[]> : public true_type
-    {
-    };
-
-    // decay selectors
-    template <typename _Up,
-              bool _IsArray    = is_array<_Up>::value,
-              bool _IsFunction = is_function<_Up>::value>
-    struct __decay_selector;
+#include "utility/numeric_limits.hpp"
 
-    template <typename _Up>
-    struct __decay_selector<_Up, false, false>
-    {
-        typedef typename remove_cv<_Up>::type __type;
-    };
-
-    template <typename _Up>
-    struct __decay_selector<_Up, true, false>
-    {
-        typedef typename remove_extent<_Up>::type* __type;
-    };
-
-    template <typename _Up>
-    struct __decay_selector<_Up, false, true>
-    {
-        typedef typename add_pointer<_Up>::type __type;
-    };
-
-    // decay
-    template <typename T>
-    class decay
-    {
-        typedef typename remove_reference<T>::type __remove_type;
-
-    public:
-        typedef typename __decay_selector<__remove_type>::__type type;
-    };
-
-    template <typename T>
-    using decay_t = typename decay<T>::type;
-
-    template <class T, class U>
-    inline constexpr bool is_same_v = is_same<T, U>::value;
-
-} // namespace std
+#if defined(__HIPCC_RTC__)
+#define NUMERIC_LIMITS_NAMESPACE rocwmma::detail
+#else
+#define NUMERIC_LIMITS_NAMESPACE std
 #endif
 
-namespace std
+namespace NUMERIC_LIMITS_NAMESPACE
 {
 #if defined(__HIPCC_RTC__)
     using uint16_t = rocwmma::uint16_t;
 #endif
 
     ///////////////////////////////////////////////////////////
-    ///////////  std::numeric_limits<float8_t>  //////////////
+    ///////////  numeric_limits<rocwmma::float8_t>  //////////////
     ///////////////////////////////////////////////////////////
     // @cond
     template <>
@@ -533,7 +199,7 @@ namespace std
     }
 
     ///////////////////////////////////////////////////////////
-    ///////////  std::numeric_limits<bfloat8_t>  //////////////
+    ///////////  numeric_limits<bfloat8_t>  //////////////
     ///////////////////////////////////////////////////////////
 
     template <>
@@ -593,7 +259,7 @@ namespace std
     }
 
     ///////////////////////////////////////////////////////////
-    ///////////  std::numeric_limits<float16_t>  //////////////
+    ///////////  numeric_limits<float16_t>  //////////////
     ///////////////////////////////////////////////////////////
 
     template <>
@@ -653,7 +319,7 @@ namespace std
     }
 
     ///////////////////////////////////////////////////////////
-    ///////////  std::numeric_limits<hfloat16_t>  /////////////
+    ///////////  numeric_limits<hfloat16_t>  /////////////
     ///////////////////////////////////////////////////////////
 #if !ROCWMMA_NO_HALF
     template <>
@@ -715,7 +381,7 @@ namespace std
 #endif // !ROCWMMA_NO_HALF
 
     ///////////////////////////////////////////////////////////
-    ///////////  std::numeric_limits<bfloat16_t>  /////////////
+    ///////////  numeric_limits<bfloat16_t>  /////////////
     ///////////////////////////////////////////////////////////
 
     template <>
@@ -775,7 +441,7 @@ namespace std
     }
 
     ///////////////////////////////////////////////////////////
-    ///////////  std::numeric_limits<xfloat32_t>  //////////////
+    ///////////  numeric_limits<xfloat32_t>  //////////////
     ///////////////////////////////////////////////////////////
 
     template <>
@@ -835,36 +501,32 @@ namespace std
     }
     // @endcond
 
-} // namespace std
+} // namespace NUMERIC_LIMITS_NAMESPACE
 
 namespace rocwmma
 {
 #if !defined(__HIPCC_RTC__)
-    template <typename T, typename std::enable_if_t<std::is_integral<T>::value, int> = 0>
-    constexpr auto maxExactInteger() -> decltype(std::numeric_limits<T>::max())
+    template <typename T, enable_if_t<is_integral<T>::value, int> = 0>
+    constexpr auto maxExactInteger() -> decltype(numeric_limits<T>::max())
     {
-        return std::numeric_limits<T>::max();
+        return numeric_limits<T>::max();
     }
 
     template <typename T,
-              typename std::enable_if_t<std::is_floating_point<T>::value
-                                            && std::numeric_limits<T>::digits,
-                                        int>
-              = 0>
-    constexpr auto maxExactInteger() ->
-        typename std::conditional_t<std::is_same<T, float64_t>::value, int64_t, int32_t>
+              enable_if_t<is_floating_point<T>::value && numeric_limits<T>::digits, int> = 0>
+    constexpr auto maxExactInteger()
+        -> conditional_t<is_same<T, float64_t>::value, int64_t, int32_t>
     {
-        using RetT =
-            typename std::conditional_t<std::is_same<T, float64_t>::value, int64_t, int32_t>;
-        return ((RetT)1 << std::numeric_limits<T>::digits);
+        using RetT = conditional_t<is_same<T, float64_t>::value, int64_t, int32_t>;
+        return ((RetT)1 << numeric_limits<T>::digits);
     }
 
     template <typename T,
-              typename std::enable_if_t<
+              enable_if_t<
 #if !ROCWMMA_NO_HALF
-                  std::is_same<T, hfloat16_t>::value ||
+                  is_same<T, hfloat16_t>::value ||
 #endif // !ROCWMMA_NO_HALF
-                      std::is_same<T, float16_t>::value,
+                      is_same<T, float16_t>::value,
                   int>
               = 0>
     constexpr auto maxExactInteger() -> int32_t
@@ -873,28 +535,28 @@ namespace rocwmma
         return ((int32_t)1 << 11);
     }
 
-    template <typename T, typename std::enable_if_t<std::is_same<T, bfloat16_t>::value, int> = 0>
+    template <typename T, enable_if_t<is_same<T, bfloat16_t>::value, int> = 0>
     constexpr auto maxExactInteger() -> int32_t
     {
         // b16 mantissa is 7 bits
         return ((int32_t)1 << 8);
     }
 
-    template <typename T, typename std::enable_if_t<std::is_same<T, float8_t>::value, int> = 0>
+    template <typename T, enable_if_t<is_same<T, rocwmma::float8_t>::value, int> = 0>
     constexpr auto maxExactInteger() -> int32_t
     {
         // f8 mantissa is 3 bits
         return ((int32_t)1 << 4);
     }
 
-    template <typename T, typename std::enable_if_t<std::is_same<T, bfloat8_t>::value, int> = 0>
+    template <typename T, enable_if_t<is_same<T, bfloat8_t>::value, int> = 0>
     constexpr auto maxExactInteger() -> int32_t
     {
         // bf8 mantissa is 2 bits
         return ((int32_t)1 << 3);
     }
 
-    template <typename T, typename std::enable_if_t<std::is_same<T, xfloat32_t>::value, int> = 0>
+    template <typename T, enable_if_t<is_same<T, xfloat32_t>::value, int> = 0>
     constexpr auto maxExactInteger() -> int32_t
     {
         // xf32 mantissa is 7 bits
diff --git a/library/include/rocwmma/internal/types_ext.hpp b/library/include/rocwmma/internal/types_ext.hpp
index 692d3ba8..e5d4771e 100644
--- a/library/include/rocwmma/internal/types_ext.hpp
+++ b/library/include/rocwmma/internal/types_ext.hpp
@@ -48,11 +48,11 @@ namespace rocwmma
     ////////////////////////////////////////////////////////////////////////
     template <typename Outgoing,
               typename Incoming,
-              typename std::enable_if_t<!std::is_same_v<Incoming, Outgoing>, int> = 0>
+              enable_if_t<!is_same_v<Incoming, Outgoing>, int> = 0>
     __host__ __device__ inline Outgoing convert(const Incoming& value)
     {
 #if !ROCWMMA_NO_HALF
-        if constexpr(std::is_same_v<Outgoing, hfloat16_t>)
+        if constexpr(is_same_v<Outgoing, hfloat16_t>)
         {
 
 #if defined(__HIP_NO_HALF_CONVERSIONS__)
@@ -62,7 +62,7 @@ namespace rocwmma
             return static_cast<hfloat16_t>(value);
 #endif // defined(__HIP_NO_HALF_CONVERSIONS__)
         }
-        else if constexpr(std::is_same_v<Incoming, hfloat16_t>)
+        else if constexpr(is_same_v<Incoming, hfloat16_t>)
         {
 
 #if defined(__HIP_NO_HALF_CONVERSIONS__)
@@ -81,7 +81,7 @@ namespace rocwmma
 
     template <typename Outgoing,
               typename Incoming,
-              typename std::enable_if_t<std::is_same_v<Incoming, Outgoing>, int> = 0>
+              enable_if_t<is_same_v<Incoming, Outgoing>, int> = 0>
     __host__ __device__ inline Outgoing const& convert(const Incoming& value)
     {
         return value;
@@ -105,8 +105,8 @@ namespace rocwmma
     {
         auto absDiff = std::fabs(__half2float(x) - __half2float(y));
         auto absAdd  = std::fabs(__half2float(x) + __half2float(y));
-        return absDiff <= __half2float(std::numeric_limits<hfloat16_t>::epsilon()) * absAdd * 2.0f
-               || absDiff < __half2float(std::numeric_limits<hfloat16_t>::min());
+        return absDiff <= __half2float(numeric_limits<hfloat16_t>::epsilon()) * absAdd * 2.0f
+               || absDiff < __half2float(numeric_limits<hfloat16_t>::min());
     }
 
     ROCWMMA_HALF_OP_ATTR inline bool operator!=(const hfloat16_t& x, const hfloat16_t& y)
diff --git a/library/include/rocwmma/internal/utility/forward.hpp b/library/include/rocwmma/internal/utility/forward.hpp
new file mode 100644
index 00000000..7dce49ef
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/forward.hpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_FORWARD_HPP
+#define ROCWMMA_UTILITY_FORWARD_HPP
+
+#if defined(__HIPCC_RTC__) || defined(__clang__)
+
+#include "forward_impl.hpp"
+namespace rocwmma
+{
+    // Use drop-in replacement
+    using detail::forward;
+
+} // namespace rocwmma
+
+#else
+
+#include <utility>
+namespace rocwmma
+{
+    // Use STL
+    using std::forward;
+
+} // namespace rocwmma
+
+#endif // defined(__HIPCC_RTC__) || defined(__clang__)
+
+#endif // ROCWMMA_UTILITY_FORWARD_HPP
diff --git a/library/include/rocwmma/internal/utility/forward_impl.hpp b/library/include/rocwmma/internal/utility/forward_impl.hpp
new file mode 100644
index 00000000..59d69b02
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/forward_impl.hpp
@@ -0,0 +1,55 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_FORWARD_IMPL_HPP
+#define ROCWMMA_UTILITY_FORWARD_IMPL_HPP
+
+#include "type_traits.hpp"
+
+namespace rocwmma
+{
+    namespace detail
+    {
+
+        template <typename T>
+        ROCWMMA_HOST_DEVICE constexpr T&& forward(typename remove_reference<T>::type& t) noexcept
+        {
+            return static_cast<T&&>(t);
+        }
+
+        template <typename T>
+        ROCWMMA_HOST_DEVICE constexpr T&& forward(typename remove_reference<T>::type&& t) noexcept
+        {
+            static_assert(!is_lvalue_reference<T>::value,
+                          "template argument substituting T is an lvalue reference type");
+            return static_cast<T&&>(t);
+        }
+
+    } // namespace detail
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_FORWARD_IMPL_HPP
diff --git a/library/include/rocwmma/internal/utility/get.hpp b/library/include/rocwmma/internal/utility/get.hpp
new file mode 100644
index 00000000..773794a0
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/get.hpp
@@ -0,0 +1,50 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_GET_HPP
+#define ROCWMMA_UTILITY_GET_HPP
+
+#include "get_impl.hpp"
+
+namespace rocwmma
+{
+    // get overloads
+    using detail::get;
+}
+
+#if !defined(__HIPCC_RTC__)
+
+#include <tuple>
+namespace rocwmma
+{
+    // Also expose std::get (e.g. for std::tuple) when not building with hipRTC
+    using std::get;
+
+} // namespace rocwmma
+
+#endif // !defined(__HIPCC_RTC__)
+
+#endif // ROCWMMA_UTILITY_GET_HPP
diff --git a/library/include/rocwmma/internal/utility/get_impl.hpp b/library/include/rocwmma/internal/utility/get_impl.hpp
new file mode 100644
index 00000000..88e833cb
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/get_impl.hpp
@@ -0,0 +1,81 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_GET_IMPL_HPP
+#define ROCWMMA_UTILITY_GET_IMPL_HPP
+
+#include "../vector.hpp"
+
+namespace rocwmma
+{
+    namespace detail
+    {
+        // HIP_vector_type overloads
+        template <uint32_t Idx, typename DataT, uint32_t VecSize>
+        ROCWMMA_HOST_DEVICE constexpr inline DataT get(HIP_vector_type<DataT, VecSize>&& v)
+        {
+            return v.data[Idx];
+        }
+
+        template <uint32_t Idx, typename DataT, uint32_t VecSize>
+        ROCWMMA_HOST_DEVICE constexpr inline DataT& get(HIP_vector_type<DataT, VecSize>& v)
+        {
+            return reinterpret_cast<DataT*>(&v.data)[Idx];
+        }
+
+        template <uint32_t Idx, typename DataT, uint32_t VecSize>
+        ROCWMMA_HOST_DEVICE constexpr inline DataT get(HIP_vector_type<DataT, VecSize> const& v)
+        {
+            return v.data[Idx];
+        }
+
+        // non_native_vector_base overloads
+        template <uint32_t Idx, typename DataT, uint32_t VecSize>
+        ROCWMMA_HOST_DEVICE constexpr static inline DataT
+            get(non_native_vector_base<DataT, VecSize>&& v)
+        {
+            return v[Idx];
+        }
+
+        template <uint32_t Idx, typename DataT, uint32_t VecSize>
+        ROCWMMA_HOST_DEVICE constexpr static inline DataT&
+            get(non_native_vector_base<DataT, VecSize>& v)
+        {
+            return v[Idx];
+        }
+
+        template <uint32_t Idx, typename DataT, uint32_t VecSize>
+        ROCWMMA_HOST_DEVICE constexpr static inline DataT
+            get(non_native_vector_base<DataT, VecSize> const& v)
+        {
+            return v[Idx];
+        }
+
+    } // namespace detail
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_GET_IMPL_HPP
diff --git a/library/include/rocwmma/internal/utility/numeric_limits.hpp b/library/include/rocwmma/internal/utility/numeric_limits.hpp
new file mode 100644
index 00000000..76a96c08
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/numeric_limits.hpp
@@ -0,0 +1,52 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_NUMERIC_LIMITS_HPP
+#define ROCWMMA_UTILITY_NUMERIC_LIMITS_HPP
+
+#if defined(__HIPCC_RTC__)
+
+#include "numeric_limits_impl.hpp"
+namespace rocwmma
+{
+    // __HIPCC_RTC__ is defined: expose the drop-in replacement from numeric_limits_impl.hpp.
+    using detail::numeric_limits;
+
+} // namespace rocwmma
+
+#else
+
+#include <limits>
+namespace rocwmma
+{
+    // Otherwise: re-export std::numeric_limits from <limits> as rocwmma::numeric_limits.
+    using std::numeric_limits;
+
+} // namespace rocwmma
+
+#endif // defined(__HIPCC_RTC__)
+
+#endif // ROCWMMA_UTILITY_NUMERIC_LIMITS_HPP
diff --git a/library/include/rocwmma/internal/utility/numeric_limits_impl.hpp b/library/include/rocwmma/internal/utility/numeric_limits_impl.hpp
new file mode 100644
index 00000000..0349bf14
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/numeric_limits_impl.hpp
@@ -0,0 +1,57 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_NUMERIC_LIMITS_IMPL_HPP
+#define ROCWMMA_UTILITY_NUMERIC_LIMITS_IMPL_HPP
+
+namespace rocwmma
+{
+    namespace detail
+    {
+        // Interface-only primary template: the members below are declared but not
+        // defined, because the library currently has no need for them on regular
+        // arithmetic types. Specializations for the f8, bf8 and xf32 types are
+        // provided where those types are defined (not in this header).
+        template <typename T>
+        class numeric_limits
+        {
+        public:
+            ROCWMMA_HOST_DEVICE static constexpr T min() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T lowest() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T max() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T epsilon() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T round_error() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T infinity() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T quiet_NaN() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T signaling_NaN() noexcept;
+            ROCWMMA_HOST_DEVICE static constexpr T denorm_min() noexcept;
+        };
+
+    } // namespace detail
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_NUMERIC_LIMITS_IMPL_HPP
diff --git a/library/include/rocwmma/internal/utility/sequence.hpp b/library/include/rocwmma/internal/utility/sequence.hpp
new file mode 100644
index 00000000..de5b26e2
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/sequence.hpp
@@ -0,0 +1,37 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_SEQUENCE_HPP
+#define ROCWMMA_UTILITY_SEQUENCE_HPP
+
+#include "sequence_impl.hpp"
+namespace rocwmma
+{ // Re-export the sequence utilities declared in sequence_impl.hpp.
+    using detail::index_sequence; // integer_sequence<size_t, ...>
+    using detail::make_index_sequence; // index_sequence<0, 1, ..., N-1>
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_SEQUENCE_HPP
diff --git a/library/include/rocwmma/internal/utility/sequence_impl.hpp b/library/include/rocwmma/internal/utility/sequence_impl.hpp
new file mode 100644
index 00000000..10b16460
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/sequence_impl.hpp
@@ -0,0 +1,98 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_SEQUENCE_IMPL_HPP
+#define ROCWMMA_UTILITY_SEQUENCE_IMPL_HPP
+
+#include "type_traits.hpp"
+
+namespace rocwmma
+{
+    namespace detail
+    {
+        template <typename Int, Int... Ints> // Compile-time pack of Int values.
+        struct integer_sequence
+        {
+            using value_type = Int;
+            constexpr integer_sequence() {}
+            static constexpr size_t size() noexcept // Number of values in the pack.
+            {
+                return sizeof...(Ints);
+            }
+        };
+
+        template <size_t... Indices> // integer_sequence over size_t indices.
+        using index_sequence = integer_sequence<size_t, Indices...>;
+
+        namespace sequence_impl // Named (not anonymous) so helpers are one entity in every TU.
+        {
+            // Concatenate two sequences; Offset is added to every right-hand element.
+            template <typename Offset, typename Lhs, typename Rhs>
+            struct merge;
+
+            template <typename Int, Int Offset, Int... Lhs, Int... Rhs>
+            struct merge<integral_constant<Int, Offset>,
+                         integer_sequence<Int, Lhs...>,
+                         integer_sequence<Int, Rhs...>>
+            {
+                using type = integer_sequence<Int, Lhs..., (Offset + Rhs)...>;
+            };
+
+            template <typename Int, typename N> // Builds 0..N-1 by halving N and merging.
+            struct log_make_sequence
+            {
+                using L    = integral_constant<Int, N::value / 2>;
+                using R    = integral_constant<Int, N::value - L::value>;
+                using type = typename merge<L,
+                                            typename log_make_sequence<Int, L>::type,
+                                            typename log_make_sequence<Int, R>::type>::type;
+            };
+
+            // Base case: N == 0 yields the empty sequence.
+            template <typename Int>
+            struct log_make_sequence<Int, integral_constant<Int, 0>>
+            {
+                using type = integer_sequence<Int>;
+            };
+
+            // Base case: N == 1 yields the single element 0.
+            template <typename Int>
+            struct log_make_sequence<Int, integral_constant<Int, 1>>
+            {
+                using type = integer_sequence<Int, 0>;
+            };
+        } // namespace sequence_impl
+
+        template <typename Int, Int N> // integer_sequence<Int, 0, 1, ..., N-1>
+        using make_integer_sequence =
+            typename sequence_impl::log_make_sequence<Int, integral_constant<Int, N>>::type;
+
+        template <size_t N> // index_sequence<0, 1, ..., N-1>
+        using make_index_sequence = make_integer_sequence<size_t, N>;
+    } // namespace detail
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_SEQUENCE_IMPL_HPP
diff --git a/library/include/rocwmma/internal/utility/type_traits.hpp b/library/include/rocwmma/internal/utility/type_traits.hpp
new file mode 100644
index 00000000..fdeb5f9d
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/type_traits.hpp
@@ -0,0 +1,148 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_TYPE_TRAITS_HPP
+#define ROCWMMA_UTILITY_TYPE_TRAITS_HPP
+
+#if defined(__HIPCC_RTC__)
+
+#include "type_traits_impl.hpp"
+namespace rocwmma
+{
+    // __HIPCC_RTC__ is defined: re-export the drop-in replacements from type_traits_impl.hpp.
+    using detail::add_pointer;
+    using detail::add_pointer_t;
+    using detail::bool_constant;
+    using detail::conditional;
+    using detail::conditional_t;
+    using detail::decay;
+    using detail::decay_t;
+    using detail::enable_if;
+    using detail::enable_if_t;
+    using detail::false_type;
+    using detail::integral_constant;
+    using detail::is_arithmetic;
+    using detail::is_arithmetic_v;
+    using detail::is_array;
+    using detail::is_array_v;
+    using detail::is_convertible;
+    using detail::is_convertible_v;
+    using detail::is_floating_point;
+    using detail::is_floating_point_v;
+    using detail::is_function;
+    using detail::is_function_v;
+    using detail::is_integral;
+    using detail::is_integral_v;
+    using detail::is_lvalue_reference;
+    using detail::is_lvalue_reference_v;
+    using detail::is_reference;
+    using detail::is_reference_v;
+    using detail::is_rvalue_reference;
+    using detail::is_rvalue_reference_v;
+    using detail::is_same;
+    using detail::is_same_v;
+    using detail::is_signed;
+    using detail::is_signed_v;
+    using detail::is_void;
+    using detail::is_void_v;
+    using detail::remove_const;
+    using detail::remove_const_t;
+    using detail::remove_cv;
+    using detail::remove_cv_t;
+    using detail::remove_extent;
+    using detail::remove_extent_t;
+    using detail::remove_reference;
+    using detail::remove_reference_t;
+    using detail::remove_volatile;
+    using detail::remove_volatile_t;
+    using detail::true_type;
+
+    using detail::max; // function templates, not type traits
+    using detail::min;
+
+} // namespace rocwmma
+
+#else
+#include <algorithm> // std::max / std::min are declared here, not in <type_traits>
+#include <type_traits>
+namespace rocwmma
+{
+    // Non-RTC build: re-export the standard library implementations under rocwmma::.
+    using std::add_pointer;
+    using std::add_pointer_t;
+    using std::bool_constant;
+    using std::conditional;
+    using std::conditional_t;
+    using std::decay;
+    using std::decay_t;
+    using std::enable_if;
+    using std::enable_if_t;
+    using std::false_type;
+    using std::integral_constant;
+    using std::is_arithmetic;
+    using std::is_arithmetic_v;
+    using std::is_array;
+    using std::is_array_v;
+    using std::is_convertible;
+    using std::is_convertible_v;
+    using std::is_floating_point;
+    using std::is_floating_point_v;
+    using std::is_function;
+    using std::is_function_v;
+    using std::is_integral;
+    using std::is_integral_v;
+    using std::is_lvalue_reference;
+    using std::is_lvalue_reference_v;
+    using std::is_reference;
+    using std::is_reference_v;
+    using std::is_rvalue_reference;
+    using std::is_rvalue_reference_v;
+    using std::is_same;
+    using std::is_same_v;
+    using std::is_signed;
+    using std::is_signed_v;
+    using std::is_void;
+    using std::is_void_v;
+    using std::remove_const;
+    using std::remove_const_t;
+    using std::remove_cv;
+    using std::remove_cv_t;
+    using std::remove_extent;
+    using std::remove_extent_t;
+    using std::remove_reference;
+    using std::remove_reference_t;
+    using std::remove_volatile;
+    using std::remove_volatile_t;
+    using std::true_type;
+
+    using std::max; // from <algorithm>
+    using std::min; // from <algorithm>
+
+} // namespace rocwmma
+
+#endif // defined(__HIPCC_RTC__)
+
+#endif // ROCWMMA_UTILITY_TYPE_TRAITS_HPP
diff --git a/library/include/rocwmma/internal/utility/type_traits_impl.hpp b/library/include/rocwmma/internal/utility/type_traits_impl.hpp
new file mode 100644
index 00000000..02b7a426
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/type_traits_impl.hpp
@@ -0,0 +1,631 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_TYPE_TRAITS_IMPL_HPP
+#define ROCWMMA_UTILITY_TYPE_TRAITS_IMPL_HPP
+
+namespace rocwmma
+{
+    namespace detail
+    {
+        // Stand-ins for std::max / std::min (returning a reference to an argument). TODO: Separate file?
+        template <typename T>
+        ROCWMMA_HOST_DEVICE constexpr const T& max(const T& a, const T& b)
+        { // Returns a when the arguments compare equivalent, as std::max does.
+            return (a < b) ? b : a;
+        }
+
+        template <typename T>
+        ROCWMMA_HOST_DEVICE constexpr const T& min(const T& a, const T& b)
+        { // Returns a when the arguments compare equivalent, as std::min (the non-RTC path) does.
+            return (b < a) ? b : a;
+        }
+
+        using ::size_t; // Make the global size_t nameable as detail::size_t.
+
+        template <class T, T Val> // Wraps the compile-time constant Val of type T.
+        struct integral_constant
+        {
+            static constexpr const T value = Val;
+            using value_type               = T;
+            using type                     = integral_constant;
+            constexpr operator value_type() const // Implicit conversion yields the wrapped value.
+            {
+                return value;
+            }
+            constexpr value_type operator()() const // Call syntax yields the wrapped value.
+            {
+                return value;
+            }
+        };
+
+        template <class T, T Val> // Out-of-class definition of the static data member.
+        constexpr const T integral_constant<T, Val>::value;
+
+        using true_type  = integral_constant<bool, true>;
+        using false_type = integral_constant<bool, false>;
+
+        template <bool B> // integral_constant fixed to bool.
+        using bool_constant = integral_constant<bool, B>;
+
+        using true_type  = bool_constant<true>; // Same type as the alias declared above.
+        using false_type = bool_constant<false>; // Same type as the alias declared above.
+
+        template <bool B> // Maps a bool to false_type (primary) or true_type (specialization).
+        struct true_or_false_type : public false_type
+        {
+        };
+        template <>
+        struct true_or_false_type<true> : public true_type
+        {
+        };
+        // conditional<B, T, F>::type is T when B is true and F when B is false.
+        template <bool B, class T, class F>
+        struct conditional
+        { // Primary template has no ::type; both values of B are covered by the specializations.
+        };
+
+        template <class T, class F>
+        struct conditional<true, T, F>
+        {
+            using type = T;
+        };
+
+        template <class T, class F>
+        struct conditional<false, T, F>
+        {
+            using type = F;
+        };
+
+        template <bool B, class T, class F>
+        using conditional_t = typename conditional<B, T, F>::type;
+
+        // Logical ops over types exposing a static ::value (e.g. integral_constant).
+        template <typename... Bs>
+        struct logical_or;
+
+        template <>
+        struct logical_or<> : public false_type
+        { // No operands: false.
+        };
+
+        template <typename T>
+        struct logical_or<T> : public T
+        { // One operand: that operand.
+        };
+
+        template <typename B1, typename B2>
+        struct logical_or<B1, B2> : public conditional_t<B1::value, B1, B2>
+        { // B1 if B1::value is true, otherwise B2.
+        };
+
+        template <typename B1, typename B2, typename B3, typename... Bs>
+        struct logical_or<B1, B2, B3, Bs...>
+            : public conditional_t<B1::value, B1, logical_or<B2, B3, Bs...>>
+        { // B1 if B1::value is true, otherwise the OR of the remaining operands.
+        };
+
+        template <typename... Bs>
+        using logical_or_t = typename logical_or<Bs...>::type;
+
+        template <typename...>
+        struct logical_and;
+
+        template <>
+        struct logical_and<> : public true_type
+        { // No operands: true.
+        };
+
+        template <typename B1>
+        struct logical_and<B1> : public B1
+        { // One operand: that operand.
+        };
+
+        template <typename B1, typename B2>
+        struct logical_and<B1, B2> : public conditional_t<B1::value, B2, B1>
+        { // B2 if B1::value is true, otherwise B1.
+        };
+
+        template <typename B1, typename B2, typename B3, typename... Bs>
+        struct logical_and<B1, B2, B3, Bs...>
+            : public conditional_t<B1::value, logical_and<B2, B3, Bs...>, B1>
+        { // The AND of the remaining operands if B1::value is true, otherwise B1.
+        };
+
+        template <typename... Bs>
+        using logical_and_t = typename logical_and<Bs...>::type;
+
+        template <typename B>
+        struct logical_not : public bool_constant<!bool(B::value)>
+        { // Negation of B::value after conversion to bool.
+        };
+
+        template <typename B>
+        using logical_not_t = typename logical_not<B>::type;
+
+        // remove_reference: T&, T&& -> T; any other T is left unchanged.
+        template <typename T>
+        struct remove_reference
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        struct remove_reference<T&>
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        struct remove_reference<T&&>
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        using remove_reference_t = typename remove_reference<T>::type;
+
+        // remove_const: T const -> T (top-level const only).
+        template <typename T>
+        struct remove_const
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        struct remove_const<T const>
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        using remove_const_t = typename remove_const<T>::type;
+
+        // remove_volatile: T volatile -> T (top-level volatile only).
+        template <typename T>
+        struct remove_volatile
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        struct remove_volatile<T volatile>
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        using remove_volatile_t = typename remove_volatile<T>::type;
+
+        // remove_cv: applies remove_volatile, then remove_const.
+        template <typename T>
+        struct remove_cv
+        {
+            using type = remove_const_t<remove_volatile_t<T>>;
+        };
+
+        template <typename T>
+        using remove_cv_t = typename remove_cv<T>::type;
+
+        // remove_extent: T[N], T[] -> T (one array dimension); any other T is left unchanged.
+        template <typename T>
+        struct remove_extent
+        {
+            using type = T;
+        };
+
+        template <typename T, size_t _Size> // detail::size_t; this header pulls in no std:: names.
+        struct remove_extent<T[_Size]>
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        struct remove_extent<T[]>
+        {
+            using type = T;
+        };
+
+        template <typename T>
+        using remove_extent_t = typename remove_extent<T>::type;
+
+        // add_pointer: remove_reference_t<T>* when T is referenceable or void, otherwise T.
+        template <typename T>
+        struct is_referenceable;
+
+        template <typename T>
+        struct is_void;
+
+        template <typename T, bool = logical_or<is_referenceable<T>, is_void<T>>::value>
+        struct add_pointer_helper
+        { // Selected when the defaulted bool is false: T is passed through.
+            using type = T;
+        };
+
+        template <typename T>
+        struct add_pointer_helper<T, true>
+        { // Referenceable or void T: strip any reference, then add the pointer.
+            using type = remove_reference_t<T>*;
+        };
+
+        template <typename T>
+        struct add_pointer : public add_pointer_helper<T>
+        {
+        };
+
+        template <typename T>
+        using add_pointer_t = typename add_pointer<T>::type;
+
+        // is_lvalue_reference: true only for T&.
+        template <typename>
+        struct is_lvalue_reference : public false_type
+        {
+        };
+
+        template <typename T>
+        struct is_lvalue_reference<T&> : public true_type
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_lvalue_reference_v = is_lvalue_reference<T>::value;
+
+        // is_rvalue_reference: true only for T&&.
+        template <typename>
+        struct is_rvalue_reference : public false_type
+        {
+        };
+
+        template <typename T>
+        struct is_rvalue_reference<T&&> : public true_type
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_rvalue_reference_v = is_rvalue_reference<T>::value;
+
+        // is_void: true for void, including its cv-qualified forms (cv is stripped first).
+        template <typename>
+        struct is_void_helper : public false_type
+        {
+        };
+
+        template <>
+        struct is_void_helper<void> : public true_type
+        {
+        };
+
+        template <typename T>
+        struct is_void : public is_void_helper<remove_cv_t<T>>::type
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_void_v = is_void<T>::value;
+
+        // is_reference: true for lvalue or rvalue references.
+        template <typename T>
+        struct is_reference : public logical_or_t<is_lvalue_reference<T>, is_rvalue_reference<T>>
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_reference_v = is_reference<T>::value;
+
+        // is_function: placeholder that derives from false_type for every T.
+        template <typename>
+        struct is_function : public false_type
+        { // NOTE(review): no specialization is visible here, so function types also yield false.
+        };
+
+        template <typename T>
+        inline constexpr bool is_function_v = is_function<T>::value;
+
+        // is_object: true when T is not a function (per is_function), a reference, or void.
+        template <typename T>
+        struct is_object
+            : public logical_not_t<logical_or<is_function<T>, is_reference<T>, is_void<T>>>
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_object_v = is_object<T>::value;
+
+        // is_referenceable: true for object types (per is_object) and for references.
+        template <typename T>
+        struct is_referenceable : public logical_or_t<is_object<T>, is_reference<T>>
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_referenceable_v = is_referenceable<T>::value;
+
+        // is_array: true for T[N] and T[]; false for everything else.
+        template <typename>
+        struct is_array : public false_type
+        {
+        };
+
+        template <typename T, size_t _Size>
+        struct is_array<T[_Size]> : public true_type
+        { // Array of known bound.
+        };
+
+        template <typename T>
+        struct is_array<T[]> : public true_type
+        { // Array of unknown bound.
+        };
+
+        template <typename T>
+        inline constexpr bool is_array_v = is_array<T>::value;
+
+        // is_integral: true only for the exact types specialized below.
+        template <class T>
+        struct is_integral : public false_type
+        { // NOTE(review): cv-qualified T and char16_t/char32_t have no specialization -> false.
+        };
+        template <>
+        struct is_integral<bool> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<char> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<signed char> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<unsigned char> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<wchar_t> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<short> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<unsigned short> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<int> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<unsigned int> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<long> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<unsigned long> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<long long> : public true_type
+        {
+        };
+        template <>
+        struct is_integral<unsigned long long> : public true_type
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_integral_v = is_integral<T>::value;
+
+        // is_arithmetic: true only for the exact integer and floating-point types specialized below.
+        template <class T>
+        struct is_arithmetic : public false_type
+        { // NOTE(review): long double is absent here, yet is_floating_point<long double> is true.
+        };
+        template <>
+        struct is_arithmetic<bool> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<char> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<signed char> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<unsigned char> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<wchar_t> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<short> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<unsigned short> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<int> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<unsigned int> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<long> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<unsigned long> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<long long> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<unsigned long long> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<float> : public true_type
+        {
+        };
+        template <>
+        struct is_arithmetic<double> : public true_type
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_arithmetic_v = is_arithmetic<T>::value;
+
+        // is_floating_point: true only for float, double and long double (cv-unqualified).
+        template <typename T>
+        struct is_floating_point : public false_type
+        {
+        };
+        template <>
+        struct is_floating_point<float> : public true_type
+        {
+        };
+        template <>
+        struct is_floating_point<double> : public true_type
+        {
+        };
+        template <>
+        struct is_floating_point<long double> : public true_type
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
+
+        // is_signed: false when is_arithmetic<T> is false; otherwise whether T(-1) < T(0).
+        template <typename T, bool = is_arithmetic<T>::value>
+        struct is_signed : public false_type
+        {
+        };
+
+        template <typename T>
+        struct is_signed<T, true> : public true_or_false_type<T(-1) < T(0)>
+        {
+        };
+
+        template <typename T>
+        inline constexpr bool is_signed_v = is_signed<T>::value;
+
+        // is_same: true only when both arguments are exactly the same type (cv included).
+        template <typename T, typename U>
+        struct is_same : public false_type
+        {
+        };
+        template <typename T>
+        struct is_same<T, T> : public true_type
+        {
+        };
+
+        template <class T, class U>
+        inline constexpr bool is_same_v = is_same<T, U>::value;
+
+        // is_convertible: delegates to the compiler's __is_convertible_to(T1, T2) intrinsic.
+        template <class T1, class T2>
+        struct is_convertible : public true_or_false_type<__is_convertible_to(T1, T2)>
+        {
+        };
+
+        template <class T, class U>
+        inline constexpr bool is_convertible_v = is_convertible<T, U>::value;
+
+        // decay selectors: choose the decayed type from the is_array / is_function flags of Up.
+        template <typename Up,
+                  bool IsArray    = is_array<Up>::value,
+                  bool IsFunction = is_function<Up>::value>
+        struct decay_selector;
+
+        template <typename Up>
+        struct decay_selector<Up, false, false>
+        { // Neither array nor function: drop top-level cv-qualifiers.
+            using type = remove_cv_t<Up>;
+        };
+
+        template <typename Up>
+        struct decay_selector<Up, true, false>
+        { // Array: pointer to the element type.
+            using type = remove_extent_t<Up>*;
+        };
+
+        template <typename Up>
+        struct decay_selector<Up, false, true>
+        { // NOTE(review): is_function is always false in this file, so this looks unreachable.
+            using type = add_pointer_t<Up>;
+        };
+
+        template <typename T>
+        using decay_selector_t = typename decay_selector<T>::type;
+
+        // decay: strips the reference from T, then applies the matching decay_selector.
+        template <typename T>
+        class decay
+        {
+            using remove_type = remove_reference_t<T>;
+
+        public:
+            using type = decay_selector_t<remove_type>;
+        };
+
+        template <typename T>
+        using decay_t = typename decay<T>::type;
+
+        // SFINAE enable_if: ::type (defaulting to void) exists only when B is true.
+        template <bool B, class T = void>
+        struct enable_if
+        { // B == false: no ::type member.
+        };
+        template <class T>
+        struct enable_if<true, T>
+        {
+            using type = T;
+        };
+
+        template <bool B, class T = void>
+        using enable_if_t = typename enable_if<B, T>::type;
+
+    } // namespace detail
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_TYPE_TRAITS_IMPL_HPP
diff --git a/library/include/rocwmma/internal/utility/vector.hpp b/library/include/rocwmma/internal/utility/vector.hpp
new file mode 100644
index 00000000..060c8d59
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/vector.hpp
@@ -0,0 +1,51 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_VECTOR_HPP
+#define ROCWMMA_UTILITY_VECTOR_HPP
+
+#include "vector_impl.hpp"
+
+namespace rocwmma
+{
+    template <typename VecT> // element count reported by VecTraits<VecT>::size()
+    ROCWMMA_HOST_DEVICE constexpr inline auto vector_size(VecT const& v);
+
+    template <typename... Ts> // packs same-typed arguments into a non_native_vector_base
+    ROCWMMA_HOST_DEVICE constexpr decltype(auto) make_vector(Ts&&... ts);
+
+    template <typename Lhs, typename Rhs> // elements of lhs followed by elements of rhs
+    ROCWMMA_HOST_DEVICE constexpr decltype(auto) vector_cat(Lhs&& lhs, Rhs&& rhs);
+
+    template <typename VecT> // folds the elements of lhs with detail::BitwiseOp::And
+    ROCWMMA_HOST_DEVICE constexpr static inline decltype(auto)
+        vector_reduce_and(VecT&& lhs) noexcept;
+
+    template <typename DataT> // returns a new vector {get<1>(v), get<0>(v)}
+    ROCWMMA_HOST_DEVICE constexpr inline auto swap(HIP_vector_type<DataT, 2> const& v);
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_VECTOR_HPP
diff --git a/library/include/rocwmma/internal/utility/vector_impl.hpp b/library/include/rocwmma/internal/utility/vector_impl.hpp
new file mode 100644
index 00000000..6e73289d
--- /dev/null
+++ b/library/include/rocwmma/internal/utility/vector_impl.hpp
@@ -0,0 +1,223 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_UTILITY_VECTOR_IMPL_HPP
+#define ROCWMMA_UTILITY_VECTOR_IMPL_HPP
+
+#include "type_traits.hpp"
+#include "get.hpp"
+
+namespace rocwmma
+{
+    template <typename VecT> // v only drives deduction; the size comes from VecTraits<VecT>
+    ROCWMMA_HOST_DEVICE constexpr inline auto vector_size(VecT const& v)
+    {
+        return VecTraits<VecT>::size();
+    }
+
+    namespace detail
+    {
+        template <typename... Ts> // first_type: yields the first type of a non-empty pack
+        struct first_type;
+
+        template <typename T, typename... Ts>
+        struct first_type<T, Ts...>
+        {
+            using type = T;
+        };
+
+        template <typename... Ts>
+        using first_type_t = typename first_type<Ts...>::type;
+
+        template <typename... Ts> // is_same_type: true when all types in a non-empty pack match
+        struct is_same_type;
+
+        template <typename T>
+        struct is_same_type<T> : true_type // a single type trivially matches itself
+        {
+        };
+
+        template <typename T, typename U, typename... Ts>
+        struct is_same_type<T, U, Ts...> // compare neighbours, then recurse on the tail
+            : conditional_t<is_same<T, U>{}, is_same_type<U, Ts...>, false_type>
+        {
+        };
+
+        template <typename... Ts> // inline like is_convertible_v: one entity across all TUs
+        inline constexpr bool is_same_type_v = is_same_type<Ts...>::value;
+    } // namespace detail
+
+    template <typename DataT> // result is {get<1>(v), get<0>(v)}; v itself is not modified
+    ROCWMMA_HOST_DEVICE constexpr inline auto swap(HIP_vector_type<DataT, 2> const& v)
+    {
+        return HIP_vector_type<DataT, 2>{get<1>(v), get<0>(v)};
+    }
+
+    // temporary apply impl: calls fn with each element of v as a separate argument
+    namespace detail
+    {
+        template <typename F, typename DataT, uint32_t Rank, size_t... I>
+        constexpr decltype(auto)
+            apply_impl(F fn, HIP_vector_type<DataT, Rank> const& v, index_sequence<I...>)
+        {
+            return fn(get<I>(v)...);
+        }
+
+    } // namespace detail
+
+    template <typename F, typename DataT, uint32_t Rank> // const& so const vectors bind too
+    constexpr decltype(auto) apply(F fn, HIP_vector_type<DataT, Rank> const& v)
+    {
+        constexpr size_t size = VecTraits<decay_t<decltype(v)>>::size();
+        return detail::apply_impl(fn, v, detail::make_index_sequence<size>());
+    }
+
+    namespace detail
+    {
+        template <typename F, typename DataT, uint32_t Rank, size_t... I>
+        constexpr decltype(auto)
+            apply_impl(F fn, non_native_vector_base<DataT, Rank> const& v, index_sequence<I...>)
+        {
+            return fn(get<I>(v)...);
+        }
+
+    } // namespace detail
+
+    template <typename F, typename DataT, uint32_t Rank> // expands v's elements into fn's arguments
+    constexpr decltype(auto) apply(F fn, non_native_vector_base<DataT, Rank> const& v)
+    {
+        constexpr size_t size = VecTraits<decay_t<decltype(v)>>::size();
+        return detail::apply_impl(fn, v, detail::make_index_sequence<size>());
+    }
+
+    template <typename... Ts> // result has sizeof...(Ts) elements of the arguments' decayed type
+    ROCWMMA_HOST_DEVICE constexpr decltype(auto) make_vector(Ts&&... ts)
+    {
+        // TODO: When HIP_vector_type becomes constexpr replace with non_native_vector type.
+
+        // Ensure that all the arguments are the same type
+        static_assert(detail::is_same_type_v<decay_t<Ts>...>,
+                      "Vector arguments must all be the same type");
+
+        using DataT = typename detail::first_type_t<decay_t<Ts>...>;
+        return non_native_vector_base<DataT, sizeof...(Ts)>{forward<Ts>(ts)...};
+    }
+
+    namespace detail // vector_cat_impl is written for non_native_vector_base operands
+    {
+        template <typename DataT0,
+                  uint32_t Rank0,
+                  size_t... Is0,
+                  typename DataT1,
+                  uint32_t Rank1,
+                  size_t... Is1>
+        constexpr static inline decltype(auto)
+            vector_cat_impl(non_native_vector_base<DataT0, Rank0> const& lhs,
+                            index_sequence<Is0...>,
+                            non_native_vector_base<DataT1, Rank1> const& rhs,
+                            index_sequence<Is1...>)
+        {
+            return make_vector(get<Is0>(lhs)..., get<Is1>(rhs)...);
+        }
+
+    } // namespace detail
+
+    template <typename Lhs, typename Rhs> // lhs elements first, then rhs elements, via make_vector
+    ROCWMMA_HOST_DEVICE constexpr decltype(auto) vector_cat(Lhs&& lhs, Rhs&& rhs)
+    {
+        constexpr size_t Size0 = VecTraits<decay_t<decltype(lhs)>>::size();
+        constexpr size_t Size1 = VecTraits<decay_t<decltype(rhs)>>::size();
+
+        return detail::vector_cat_impl(forward<Lhs>(lhs),
+                                       detail::make_index_sequence<Size0>(),
+                                       forward<Rhs>(rhs),
+                                       detail::make_index_sequence<Size1>());
+    }
+
+    namespace detail
+    {
+        template <typename DataT0, typename DataT1, uint32_t Rank, size_t... Is>
+        constexpr static inline decltype(auto)
+            mult_poly_vec_impl(non_native_vector_base<DataT0, Rank> const& lhs,
+                               non_native_vector_base<DataT1, Rank> const& rhs,
+                               index_sequence<Is...>)
+        {
+            return make_vector((get<Is>(lhs) * get<Is>(rhs))...);
+        }
+
+    } // namespace detail
+
+    template <typename DataT0, typename DataT1, uint32_t Rank> // element-wise product
+    constexpr decltype(auto) operator*(non_native_vector_base<DataT0, Rank> const& lhs,
+                                       non_native_vector_base<DataT1, Rank> const& rhs)
+    {
+        return detail::mult_poly_vec_impl(lhs, rhs, detail::make_index_sequence<Rank>());
+    }
+
+    namespace detail
+    {
+        template <class BinOp, typename T, typename... Ts> // right fold: BinOp(t, fold(ts...))
+        ROCWMMA_HOST_DEVICE constexpr static inline decay_t<T> reduceOp_impl(T&& t,
+                                                                             Ts&&... ts) noexcept
+        {
+            using CastT = decay_t<T>;
+            if constexpr(sizeof...(Ts) >= 1)
+            {
+                return BinOp::exec(static_cast<CastT>(t), reduceOp_impl<BinOp>(forward<Ts>(ts)...));
+            }
+            else
+            {
+                return static_cast<CastT>(t);
+            }
+        }
+
+        template <class BinOp, typename VecT, size_t... Is>
+        ROCWMMA_HOST_DEVICE constexpr static inline decltype(auto)
+            vector_reduce_impl(VecT&& v, index_sequence<Is...>) noexcept
+        {
+            return reduceOp_impl<BinOp>(get<Is>(forward<VecT>(v))...);
+        }
+
+        // Use with a single vector operand: BinOp is folded across its elements
+        template <class BinOp, typename VecT>
+        ROCWMMA_HOST_DEVICE constexpr static inline decltype(auto)
+            vector_reduce(VecT&& lhs) noexcept
+        {
+            return vector_reduce_impl<BinOp>(
+                forward<VecT>(lhs),
+                detail::make_index_sequence<VecTraits<decay_t<VecT>>::size()>{});
+        }
+    } // namespace detail
+
+    template <typename VecT> // folds all elements of lhs with detail::BitwiseOp::And
+    ROCWMMA_HOST_DEVICE constexpr static inline decltype(auto)
+        vector_reduce_and(VecT&& lhs) noexcept
+    {
+        return detail::vector_reduce<detail::BitwiseOp::And>(forward<VecT>(lhs));
+    }
+} // namespace rocwmma
+
+#endif // ROCWMMA_UTILITY_VECTOR_IMPL_HPP
diff --git a/library/include/rocwmma/internal/utils.hpp b/library/include/rocwmma/internal/utils.hpp
index 51edddba..c06e874a 100644
--- a/library/include/rocwmma/internal/utils.hpp
+++ b/library/include/rocwmma/internal/utils.hpp
@@ -27,6 +27,8 @@
 #define ROCWMMA_UTILS_HPP
 
 #include "types.hpp"
+
+#include "utility/get.hpp"
 #include "vector.hpp"
 
 namespace rocwmma
@@ -37,137 +39,6 @@ namespace rocwmma
     /// Note: performs static unroll                                ///
     ///////////////////////////////////////////////////////////////////
 
-    namespace detail
-    {
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto extractEven(VecT<DataT, VecSize> const& v,
-                                                                detail::SeqT<Idx...>)
-        {
-            static_assert(sizeof...(Idx) == VecSize / 2u,
-                          "Index count must be half the vector size");
-            return VecT<DataT, VecSize / 2u>{get<Idx * 2>(v)...};
-        }
-
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto extractOdd(VecT<DataT, VecSize> const& v,
-                                                               detail::SeqT<Idx...>)
-        {
-            static_assert(sizeof...(Idx) == VecSize / 2u,
-                          "Index count must be half the vector size");
-            return VecT<DataT, VecSize / 2u>{get<Idx * 2 + 1>(v)...};
-        }
-
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto extractLo(VecT<DataT, VecSize> const& v,
-                                                              detail::SeqT<Idx...>)
-        {
-            static_assert(sizeof...(Idx) == VecSize / 2u,
-                          "Index count must be half the vector size");
-            return VecT<DataT, VecSize / 2u>{get<Idx>(v)...};
-        }
-
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto extractHi(VecT<DataT, VecSize> const& v,
-                                                              detail::SeqT<Idx...>)
-        {
-            static_assert(sizeof...(Idx) == VecSize / 2u,
-                          "Index count must be half the vector size");
-            return VecT<DataT, VecSize / 2u>{get<VecSize / 2 + Idx>(v)...};
-        }
-
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto concat(VecT<DataT, VecSize> const& v0,
-                                                           VecT<DataT, VecSize> const& v1,
-                                                           detail::SeqT<Idx...>)
-        {
-            static_assert(sizeof...(Idx) == VecSize, "Index count must equal the vector size");
-            return VecT<DataT, VecSize * 2u>{get<Idx>(v0)..., get<Idx>(v1)...};
-        }
-
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto zip(VecT<DataT, VecSize> const& v0,
-                                                        VecT<DataT, VecSize> const& v1,
-                                                        detail::SeqT<Idx...>)
-        {
-            static_assert(sizeof...(Idx) == VecSize, "Index count must equal the vector size");
-            return VecT<DataT, VecSize>{((Idx % 2 == 0) ? get<Idx>(v0) : get<Idx>(v1))...};
-        }
-
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto unpackLo(VecT<DataT, VecSize> const& v0,
-                                                             VecT<DataT, VecSize> const& v1,
-                                                             detail::SeqT<Idx...>)
-        {
-            static_assert(sizeof...(Idx) == VecSize, "Index count must equal the vector size");
-            return VecT<DataT, VecSize>{
-                ((Idx % 2 == 0) ? get<Idx / 2u>(v0) : get<Idx / 2u>(v1))...};
-        }
-
-        template <typename DataT, uint32_t VecSize, uint32_t... Idx>
-        ROCWMMA_DEVICE constexpr static inline auto unpackHi(VecT<DataT, VecSize> const& v0,
-                                                             VecT<DataT, VecSize> const& v1,
-                                                             detail::SeqT<Idx...>)
-        {
-            constexpr auto startIdx = VecSize / 2u;
-            static_assert(sizeof...(Idx) == VecSize, "Index count must equal the vector size");
-            return VecT<DataT, VecSize>{
-                ((Idx % 2 == 0) ? get<startIdx + Idx / 2u>(v0) : get<startIdx + Idx / 2u>(v1))...};
-        }
-
-    } // namespace detail
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto extractEven(VecT<DataT, VecSize> const& v)
-    {
-        return detail::extractEven(v, detail::Seq<VecSize / 2u>{});
-    }
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto extractLo(VecT<DataT, VecSize> const& v)
-    {
-        return detail::extractLo(v, detail::Seq<VecSize / 2u>{});
-    }
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto extractHi(VecT<DataT, VecSize> const& v)
-    {
-        return detail::extractHi(v, detail::Seq<VecSize / 2u>{});
-    }
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto extractOdd(VecT<DataT, VecSize> const& v)
-    {
-        return detail::extractOdd(v, detail::Seq<VecSize / 2u>{});
-    }
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto concat(VecT<DataT, VecSize> const& v0,
-                                                       VecT<DataT, VecSize> const& v1)
-    {
-        return detail::concat(v0, v1, detail::Seq<VecSize>{});
-    }
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto zip(VecT<DataT, VecSize> const& v0,
-                                                    VecT<DataT, VecSize> const& v1)
-    {
-        return detail::zip(v0, v1, detail::Seq<VecSize>{});
-    }
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto unpackLo(VecT<DataT, VecSize> const& v0,
-                                                         VecT<DataT, VecSize> const& v1)
-    {
-        return detail::unpackLo(v0, v1, detail::Seq<VecSize>{});
-    }
-
-    template <typename DataT, uint32_t VecSize>
-    ROCWMMA_DEVICE constexpr static inline auto unpackHi(VecT<DataT, VecSize> const& v0,
-                                                         VecT<DataT, VecSize> const& v1)
-    {
-        return detail::unpackHi(v0, v1, detail::Seq<VecSize>{});
-    }
-
     // Unary swap only considered in 2d vectors.
     template <typename DataT>
     ROCWMMA_HOST_DEVICE constexpr static inline auto swap(non_native_vector_base<DataT, 2> const& v)
@@ -222,7 +93,7 @@ namespace std
     template <typename F, typename Tuple, size_t... I>
     auto apply_impl(F fn, Tuple t, std::index_sequence<I...>)
     {
-        return fn(std::get<I>(t)...);
+        return fn(get<I>(t)...);
     }
     template <typename F, typename Tuple>
     auto apply(F fn, Tuple t)
@@ -296,9 +167,9 @@ namespace rocwmma
 {
     // Computes ceil(numerator/divisor) for integer types.
     template <typename intT1,
-              class = typename std::enable_if<std::is_integral<intT1>::value>::type,
+              class = typename enable_if<is_integral<intT1>::value>::type,
               typename intT2,
-              class = typename std::enable_if<std::is_integral<intT2>::value>::type>
+              class = typename enable_if<is_integral<intT2>::value>::type>
     static constexpr intT1 ceilDiv(const intT1 numerator, const intT2 divisor)
     {
         return (numerator + divisor - 1) / divisor;
diff --git a/library/include/rocwmma/internal/vector.hpp b/library/include/rocwmma/internal/vector.hpp
index 2b2b93ce..bf6ef234 100644
--- a/library/include/rocwmma/internal/vector.hpp
+++ b/library/include/rocwmma/internal/vector.hpp
@@ -30,10 +30,15 @@
 // #include "types.hpp"
 // #include "types_ext.hpp"
 #if !defined(__HIPCC_RTC__)
+
 #include <hip/hip_fp16.h>
 #include <hip/hip_vector_types.h>
+
 #endif
 
+#include "utility/forward.hpp"
+#include "utility/type_traits.hpp"
+
 /**
  * rocWMMA vectors are implemented as HIP_vector_type<T, N> objects, which will ultimately
  * serve as the backend storage for fragment objects. The intention is to be compatible
@@ -147,13 +152,13 @@ namespace rocwmma
         ROCWMMA_HOST_DEVICE
         inline VecT& operator=(VecT&&) = default;
 
-        template <typename U                                                           = T,
-                  typename std::enable_if<(std::is_same<U, T>{}) && (Rank > 1)>::type* = nullptr>
+        template <typename U                                                 = T,
+                  typename enable_if<(is_same<U, T>{}) && (Rank > 1)>::type* = nullptr>
         ROCWMMA_HOST_DEVICE explicit constexpr non_native_vector_base(T x_) noexcept;
 
         template <typename... Ts,
-                  typename U                                              = T,
-                  typename std::enable_if<(sizeof...(Ts) == Rank)>::type* = nullptr>
+                  typename U                                         = T,
+                  typename enable_if<(sizeof...(Ts) == Rank)>::type* = nullptr>
         ROCWMMA_HOST_DEVICE constexpr non_native_vector_base(Ts... args) noexcept;
 
         ROCWMMA_HOST_DEVICE
@@ -186,28 +191,28 @@ namespace rocwmma
         ROCWMMA_HOST_DEVICE
         constexpr inline VecT operator/(const VecT& x_) noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_integral<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT& operator%=(const VecT& x_) noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_signed<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT operator-() const noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_integral<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT& operator&=(const VecT& x_) noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_integral<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT& operator|=(const VecT& x_) noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_integral<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT operator~() const noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_integral<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT& operator^=(const VecT& x_) noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_integral<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT& operator>>=(const VecT& x_) noexcept;
 
-        template <typename U = T, typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
+        template <typename U = T, typename enable_if<is_integral<U>{}>::type* = nullptr>
         ROCWMMA_HOST_DEVICE inline VecT& operator<<=(const VecT& x_) noexcept;
 
         ROCWMMA_HOST_DEVICE
@@ -395,6 +400,7 @@ ROCWMMA_REGISTER_HIP_NON_NATIVE_VECTOR_TYPE_WITH_INC_DEC_OPS_AS_FLOAT(rocwmma::b
 ROCWMMA_REGISTER_HIP_NON_NATIVE_VECTOR_TYPE_WITH_INC_DEC_OPS_AS_FLOAT(rocwmma::bfloat16_t, 512);
 
 #include "type_traits.hpp"
+#include "utility/get.hpp"
 
 namespace rocwmma
 {
@@ -434,223 +440,8 @@ namespace rocwmma
             return VecSize;
         }
     };
+} // namespace rocwmma
 
-    namespace detail
-    {
-        template <typename... Ts>
-        struct first_type;
-
-        template <typename T, typename... Ts>
-        struct first_type<T, Ts...>
-        {
-            using type = T;
-        };
-
-        template <typename... Ts>
-        using first_type_t = typename first_type<Ts...>::type;
-
-        template <typename... Ts>
-        struct is_same_type;
-
-        template <typename T>
-        struct is_same_type<T> : std::true_type
-        {
-        };
-
-        template <typename T, typename U, typename... Ts>
-        struct is_same_type<T, U, Ts...>
-            : std::conditional_t<std::is_same<T, U>{}, is_same_type<U, Ts...>, std::false_type>
-        {
-        };
-
-        template <typename... Ts>
-        constexpr bool is_same_type_v = is_same_type<Ts...>::value;
-    }
-
-    ///////////////////////////////////////////////////////////////////
-    ///           HIP_vector_type<T, N> utility overrides           ///
-    ///                                                             ///
-    /// Note: HIP_vector_type<T, N> uses vector extensions.         ///
-    /// Element-wise access of vectors in constexpr is forbidden.   ///
-    ///////////////////////////////////////////////////////////////////
-    template <uint32_t Idx, typename DataT, uint32_t VecSize>
-    ROCWMMA_HOST_DEVICE constexpr inline DataT& get(HIP_vector_type<DataT, VecSize>& v)
-    {
-        return reinterpret_cast<DataT*>(&v.data)[Idx];
-    }
-
-    template <uint32_t Idx, typename DataT, uint32_t VecSize>
-    ROCWMMA_HOST_DEVICE constexpr inline DataT get(HIP_vector_type<DataT, VecSize> const& v)
-    {
-        return v.data[Idx];
-    }
-
-    template <typename DataT>
-    ROCWMMA_HOST_DEVICE constexpr inline auto swap(HIP_vector_type<DataT, 2> const& v)
-    {
-        return HIP_vector_type<DataT, 2>{get<1>(v), get<0>(v)};
-    }
-
-    namespace detail
-    {
-        template <typename F, typename DataT, uint32_t Rank, size_t... I>
-        constexpr decltype(auto)
-            apply_impl(F fn, HIP_vector_type<DataT, Rank> const& v, index_sequence<I...>)
-        {
-            return fn(get<I>(v)...);
-        }
-
-    } // namespace detail
-
-    template <typename F, typename DataT, uint32_t Rank>
-    constexpr decltype(auto) apply(F fn, HIP_vector_type<DataT, Rank>& v)
-    {
-        constexpr std::size_t size = VecTraits<std::decay_t<decltype(v)>>::size();
-        return detail::apply_impl(fn, v, detail::make_index_sequence<size>());
-    }
-
-    ///////////////////////////////////////////////////////////////////
-    ///     non_native_vector_base<T, N> utility overrides          ///
-    ///////////////////////////////////////////////////////////////////
-    template <uint32_t Idx, typename DataT, uint32_t VecSize>
-    ROCWMMA_HOST_DEVICE constexpr static inline DataT&
-        get(non_native_vector_base<DataT, VecSize>& v)
-    {
-        return v[Idx];
-    }
-
-    template <uint32_t Idx, typename DataT, uint32_t VecSize>
-    ROCWMMA_HOST_DEVICE constexpr static inline DataT
-        get(non_native_vector_base<DataT, VecSize> const& v)
-    {
-        return v[Idx];
-    }
-
-    namespace detail
-    {
-        template <typename F, typename DataT, uint32_t Rank, size_t... I>
-        constexpr decltype(auto)
-            apply_impl(F fn, non_native_vector_base<DataT, Rank> const& v, index_sequence<I...>)
-        {
-            return fn(get<I>(v)...);
-        }
-
-    } // namespace detail
-
-    template <typename F, typename DataT, uint32_t Rank>
-    constexpr decltype(auto) apply(F fn, non_native_vector_base<DataT, Rank> const& v)
-    {
-        constexpr std::size_t size = VecTraits<std::decay_t<decltype(v)>>::size();
-        return detail::apply_impl(fn, v, detail::make_index_sequence<size>());
-    }
-
-    template <typename... Ts>
-    constexpr decltype(auto) make_vector(Ts&&... ts)
-    {
-        // TODO: When HIP_vector_type becomes constexpr replace with non_native_vector type.
-
-        // Ensure that all the arguments are the same type
-        static_assert(detail::is_same_type_v<std::decay_t<Ts>...>,
-                      "Vector arguments must all be the same type");
-
-        using DataT = typename detail::first_type_t<std::decay_t<Ts>...>;
-        return non_native_vector_base<DataT, sizeof...(Ts)>{std::forward<Ts>(ts)...};
-    }
-
-    namespace detail
-    {
-        template <typename DataT0,
-                  uint32_t Rank0,
-                  size_t... Is0,
-                  typename DataT1,
-                  uint32_t Rank1,
-                  size_t... Is1>
-        constexpr static inline decltype(auto)
-            vector_cat_impl(non_native_vector_base<DataT0, Rank0> const& lhs,
-                            index_sequence<Is0...>,
-                            non_native_vector_base<DataT1, Rank1> const& rhs,
-                            index_sequence<Is1...>)
-        {
-            return make_vector(get<Is0>(lhs)..., get<Is1>(rhs)...);
-        }
-
-    } // namespace detail
-
-    template <typename Lhs, typename Rhs>
-    constexpr decltype(auto) vector_cat(Lhs&& lhs, Rhs&& rhs)
-    {
-        constexpr std::size_t Size0 = VecTraits<std::decay_t<decltype(lhs)>>::size();
-        constexpr std::size_t Size1 = VecTraits<std::decay_t<decltype(rhs)>>::size();
-
-        return detail::vector_cat_impl(std::forward<Lhs>(lhs),
-                                       detail::make_index_sequence<Size0>(),
-                                       std::forward<Rhs>(rhs),
-                                       detail::make_index_sequence<Size1>());
-    }
-
-    namespace detail
-    {
-        template <typename DataT0, typename DataT1, uint32_t Rank, size_t... Is>
-        constexpr static inline decltype(auto)
-            mult_poly_vec_impl(non_native_vector_base<DataT0, Rank> const& lhs,
-                               non_native_vector_base<DataT1, Rank> const& rhs,
-                               index_sequence<Is...>)
-        {
-            return make_vector((get<Is>(lhs) * get<Is>(rhs))...);
-        }
-
-    } // namespace detail
-
-    template <typename DataT0, typename DataT1, uint32_t Rank>
-    constexpr decltype(auto) operator*(non_native_vector_base<DataT0, Rank> const& lhs,
-                                       non_native_vector_base<DataT1, Rank> const& rhs)
-    {
-        return detail::mult_poly_vec_impl(lhs, rhs, detail::make_index_sequence<Rank>());
-    }
-
-    namespace detail
-    {
-        template <class BinOp, typename T, typename... Ts>
-        ROCWMMA_HOST_DEVICE constexpr static inline std::decay_t<T>
-            reduceOp_impl(T&& t, Ts&&... ts) noexcept
-        {
-            using CastT = std::decay_t<T>;
-            if constexpr(sizeof...(Ts) >= 1)
-            {
-                return BinOp::exec(static_cast<CastT>(t),
-                                   reduceOp_impl<BinOp>(std::forward<Ts>(ts)...));
-            }
-            else
-            {
-                return static_cast<CastT>(t);
-            }
-        }
-
-        template <class BinOp, typename VecT, size_t... Is>
-        ROCWMMA_HOST_DEVICE constexpr static inline decltype(auto)
-            vector_reduce_impl(VecT&& v, index_sequence<Is...>) noexcept
-        {
-            return reduceOp_impl<BinOp>(get<Is>(std::forward<VecT>(v))...);
-        }
-
-        // Use with operations that have 1 operands
-        template <class BinOp, typename VecT>
-        ROCWMMA_HOST_DEVICE constexpr static inline decltype(auto)
-            vector_reduce(VecT&& lhs) noexcept
-        {
-            return vector_reduce_impl<BinOp>(
-                std::forward<VecT>(lhs),
-                detail::make_index_sequence<VecTraits<std::decay_t<VecT>>::size()>{});
-        }
-    }
-
-    template <typename VecT>
-    ROCWMMA_HOST_DEVICE constexpr static inline decltype(auto)
-        vector_reduce_and(VecT&& lhs) noexcept
-    {
-        return detail::vector_reduce<detail::BitwiseOp::And>(std::forward<VecT>(lhs));
-    }
-
-} // namespace rocwmma
+#include "utility/vector.hpp"
 
 #endif // ROCWMMA_VECTOR_HPP
diff --git a/library/include/rocwmma/internal/vector_impl.hpp b/library/include/rocwmma/internal/vector_impl.hpp
index d082ee4e..e5cc1692 100644
--- a/library/include/rocwmma/internal/vector_impl.hpp
+++ b/library/include/rocwmma/internal/vector_impl.hpp
@@ -27,6 +27,7 @@
 #ifndef ROCWMMA_VECTOR_IMPL_HPP
 #define ROCWMMA_VECTOR_IMPL_HPP
 
+#include "utility/sequence.hpp"
 #include "vector.hpp"
 
 namespace rocwmma
@@ -69,8 +70,7 @@ namespace rocwmma
             };
             struct Mod
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_integral<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_integral<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline auto exec(TT lhs, TT rhs)
                 {
                     return lhs % rhs;
@@ -78,8 +78,7 @@ namespace rocwmma
             };
             struct Minus
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_signed<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_signed<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline auto exec(TT lhs)
                 {
                     return -lhs;
@@ -92,8 +91,7 @@ namespace rocwmma
         {
             struct And
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_integral<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_integral<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline TT exec(TT lhs, TT rhs)
                 {
                     return lhs & rhs;
@@ -102,8 +100,7 @@ namespace rocwmma
 
             struct Or
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_integral<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_integral<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline TT exec(TT lhs, TT rhs)
                 {
                     return lhs | rhs;
@@ -112,8 +109,7 @@ namespace rocwmma
 
             struct Not
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_integral<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_integral<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline TT exec(TT lhs)
                 {
                     return ~lhs;
@@ -122,8 +118,7 @@ namespace rocwmma
 
             struct Xor
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_integral<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_integral<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline TT exec(TT lhs, TT rhs)
                 {
                     return lhs ^ rhs;
@@ -132,8 +127,7 @@ namespace rocwmma
 
             struct ShiftR
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_integral<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_integral<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline TT exec(TT lhs, TT rhs)
                 {
                     return lhs >> rhs;
@@ -142,8 +136,7 @@ namespace rocwmma
 
             struct ShiftL
             {
-                template <typename TT,
-                          typename std::enable_if<std::is_integral<TT>{}>::type* = nullptr>
+                template <typename TT, typename enable_if<is_integral<TT>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline TT exec(TT lhs, TT rhs)
                 {
                     return lhs >> rhs;
@@ -157,7 +150,7 @@ namespace rocwmma
             struct And
             {
                 template <typename TT,
-                          typename std::enable_if<std::is_convertible<TT, bool>{}>::type* = nullptr>
+                          typename enable_if<is_convertible<TT, bool>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline auto exec(TT lhs, TT rhs)
                 {
                     return lhs && rhs;
@@ -167,7 +160,7 @@ namespace rocwmma
             struct Or
             {
                 template <typename TT,
-                          typename std::enable_if<std::is_convertible<TT, bool>{}>::type* = nullptr>
+                          typename enable_if<is_convertible<TT, bool>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline auto exec(TT lhs, TT rhs)
                 {
                     return lhs || rhs;
@@ -177,7 +170,7 @@ namespace rocwmma
             struct Not
             {
                 template <typename TT,
-                          typename std::enable_if<std::is_convertible<TT, bool>{}>::type* = nullptr>
+                          typename enable_if<is_convertible<TT, bool>{}>::type* = nullptr>
                 ROCWMMA_HOST_DEVICE constexpr static inline auto exec(TT lhs)
                 {
                     return !lhs;
@@ -244,82 +237,6 @@ namespace rocwmma
 
         } // namespace RelationalOp
 
-        template <class IntT, IntT val>
-        struct integral_constant
-        {
-            static constexpr IntT value = val;
-            using value_type            = IntT;
-            using type                  = integral_constant;
-            constexpr operator value_type() const noexcept
-            {
-                return value;
-            }
-            constexpr value_type operator()() const noexcept
-            {
-                return value;
-            }
-        };
-
-        template <typename Int, Int... Ints>
-        struct integer_sequence
-        {
-            using value_type = Int;
-            constexpr integer_sequence() {}
-            static constexpr std::size_t size() noexcept
-            {
-                return sizeof...(Ints);
-            }
-        };
-
-        template <std::size_t... Indices>
-        using index_sequence = integer_sequence<std::size_t, Indices...>;
-
-        namespace
-        {
-            // Merge two integer sequences, adding an offset to the right-hand side.
-            template <typename Offset, typename Lhs, typename Rhs>
-            struct merge;
-
-            template <typename Int, Int Offset, Int... Lhs, Int... Rhs>
-            struct merge<integral_constant<Int, Offset>,
-                         integer_sequence<Int, Lhs...>,
-                         integer_sequence<Int, Rhs...>>
-            {
-                using type = integer_sequence<Int, Lhs..., (Offset + Rhs)...>;
-            };
-
-            template <typename Int, typename N>
-            struct log_make_sequence
-            {
-                using L    = integral_constant<Int, N::value / 2>;
-                using R    = integral_constant<Int, N::value - L::value>;
-                using type = typename merge<L,
-                                            typename log_make_sequence<Int, L>::type,
-                                            typename log_make_sequence<Int, R>::type>::type;
-            };
-
-            // An empty sequence.
-            template <typename Int>
-            struct log_make_sequence<Int, integral_constant<Int, 0>>
-            {
-                using type = integer_sequence<Int>;
-            };
-
-            // A single-element sequence.
-            template <typename Int>
-            struct log_make_sequence<Int, integral_constant<Int, 1>>
-            {
-                using type = integer_sequence<Int, 0>;
-            };
-        }
-
-        template <typename Int, Int N>
-        using make_integer_sequence =
-            typename log_make_sequence<Int, integral_constant<Int, N>>::type;
-
-        template <std::size_t N>
-        using make_index_sequence = make_integer_sequence<std::size_t, N>;
-
         // Helpers for expression expansion, specific to non_native_vector_base
         template <uint32_t... ns>
         using SeqT = integer_sequence<uint32_t, ns...>;
@@ -367,7 +284,7 @@ namespace rocwmma
     // As a solution, Rank == 1 should fall into the ctor(Ts... args) for initializer
     // list construction, and NOT bCast initialization.
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<(std::is_same<U, T>{}) && (Rank > 1)>::type*>
+    template <typename U, typename enable_if<(is_same<U, T>{}) && (Rank > 1)>::type*>
     ROCWMMA_HOST_DEVICE constexpr non_native_vector_base<T, Rank>::non_native_vector_base(
         T x_) noexcept
         : non_native_vector_base(detail::template bCast<VecT>(x_, detail::Seq<Rank>{}))
@@ -378,7 +295,7 @@ namespace rocwmma
     // Default template depth is currently not deep enough to
     // support vector sizes of 512
     template <typename T, unsigned int Rank>
-    template <typename... Ts, typename U, typename std::enable_if<(sizeof...(Ts) == Rank)>::type*>
+    template <typename... Ts, typename U, typename enable_if<(sizeof...(Ts) == Rank)>::type*>
     ROCWMMA_HOST_DEVICE constexpr non_native_vector_base<T, Rank>::non_native_vector_base(
         Ts... args) noexcept
         : d{static_cast<T>(args)...}
@@ -460,7 +377,7 @@ namespace rocwmma
     }
 
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_integral<U>{}>::type*>
+    template <typename U, typename enable_if<is_integral<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto
         non_native_vector_base<T, Rank>::operator%=(const VecT& x_) noexcept -> VecT&
     {
@@ -468,7 +385,7 @@ namespace rocwmma
     }
 
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_signed<U>{}>::type*>
+    template <typename U, typename enable_if<is_signed<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto non_native_vector_base<T, Rank>::operator-() const noexcept
         -> VecT
     {
@@ -477,7 +394,7 @@ namespace rocwmma
 
     // @cond
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_integral<U>{}>::type*>
+    template <typename U, typename enable_if<is_integral<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto
         non_native_vector_base<T, Rank>::operator&=(const VecT& x_) noexcept -> VecT&
     {
@@ -486,7 +403,7 @@ namespace rocwmma
     // @endcond
 
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_integral<U>{}>::type*>
+    template <typename U, typename enable_if<is_integral<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto
         non_native_vector_base<T, Rank>::operator|=(const VecT& x_) noexcept -> VecT&
     {
@@ -494,7 +411,7 @@ namespace rocwmma
     }
 
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_integral<U>{}>::type*>
+    template <typename U, typename enable_if<is_integral<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto non_native_vector_base<T, Rank>::operator~() const noexcept
         -> VecT
     {
@@ -502,7 +419,7 @@ namespace rocwmma
     }
 
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_integral<U>{}>::type*>
+    template <typename U, typename enable_if<is_integral<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto
         non_native_vector_base<T, Rank>::operator^=(const VecT& x_) noexcept -> VecT&
     {
@@ -510,7 +427,7 @@ namespace rocwmma
     }
 
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_integral<U>{}>::type*>
+    template <typename U, typename enable_if<is_integral<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto
         non_native_vector_base<T, Rank>::operator>>=(const VecT& x_) noexcept -> VecT&
     {
@@ -518,7 +435,7 @@ namespace rocwmma
     }
 
     template <typename T, unsigned int Rank>
-    template <typename U, typename std::enable_if<std::is_integral<U>{}>::type*>
+    template <typename U, typename enable_if<is_integral<U>{}>::type*>
     ROCWMMA_HOST_DEVICE inline auto
         non_native_vector_base<T, Rank>::operator<<=(const VecT& x_) noexcept -> VecT&
     {
@@ -660,44 +577,43 @@ namespace rocwmma
 /// OR native vector extension. The latter doesn't have the required built-in broadcast.     ///
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-#define ROCWMMA_REGISTER_HIP_VECTOR_BASE(TYPE, RANK, STORAGE_IMPL)                             \
-    template <>                                                                                \
-    struct HIP_vector_base<TYPE, RANK>                                                         \
-    {                                                                                          \
-        STORAGE_IMPL(TYPE, RANK);                                                              \
-                                                                                               \
-        using value_type = TYPE;                                                               \
-                                                                                               \
-        ROCWMMA_HOST_DEVICE                                                                    \
-        HIP_vector_base() = default;                                                           \
-        template <typename... ArgsT,                                                           \
-                  typename U                                                 = TYPE,           \
-                  typename std::enable_if<(sizeof...(ArgsT) == RANK)>::type* = nullptr>        \
-        ROCWMMA_HOST_DEVICE constexpr HIP_vector_base(ArgsT... args) noexcept                  \
-            : data{args...}                                                                    \
-        {                                                                                      \
-        }                                                                                      \
-                                                                                               \
-        template <                                                                             \
-            typename U                                                              = TYPE,    \
-            typename std::enable_if<(std::is_same<U, TYPE>{}) && (RANK > 1)>::type* = nullptr> \
-        ROCWMMA_HOST_DEVICE constexpr explicit HIP_vector_base(TYPE val) noexcept              \
-            : HIP_vector_base(rocwmma::detail::template bCast<HIP_vector_base>(                \
-                val, rocwmma::detail::Seq<RANK>{}))                                            \
-        {                                                                                      \
-        }                                                                                      \
-                                                                                               \
-        ROCWMMA_HOST_DEVICE                                                                    \
-        constexpr HIP_vector_base(const HIP_vector_base&) = default;                           \
-                                                                                               \
-        ROCWMMA_HOST_DEVICE                                                                    \
-        constexpr HIP_vector_base(HIP_vector_base&&) = default;                                \
-                                                                                               \
-        ROCWMMA_HOST_DEVICE                                                                    \
-        ~HIP_vector_base() = default;                                                          \
-                                                                                               \
-        ROCWMMA_HOST_DEVICE                                                                    \
-        HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept = default;              \
+#define ROCWMMA_REGISTER_HIP_VECTOR_BASE(TYPE, RANK, STORAGE_IMPL)                              \
+    template <>                                                                                 \
+    struct HIP_vector_base<TYPE, RANK>                                                          \
+    {                                                                                           \
+        STORAGE_IMPL(TYPE, RANK);                                                               \
+                                                                                                \
+        using value_type = TYPE;                                                                \
+                                                                                                \
+        ROCWMMA_HOST_DEVICE                                                                     \
+        HIP_vector_base() = default;                                                            \
+        template <typename... ArgsT,                                                            \
+                  typename U                                        = TYPE,                     \
+                  rocwmma::enable_if_t<(sizeof...(ArgsT) == RANK)>* = nullptr>                  \
+        ROCWMMA_HOST_DEVICE constexpr HIP_vector_base(ArgsT... args) noexcept                   \
+            : data{args...}                                                                     \
+        {                                                                                       \
+        }                                                                                       \
+                                                                                                \
+        template <typename U                                                         = TYPE,    \
+                  rocwmma::enable_if_t<(rocwmma::is_same<U, TYPE>{}) && (RANK > 1)>* = nullptr> \
+        ROCWMMA_HOST_DEVICE constexpr explicit HIP_vector_base(TYPE val) noexcept               \
+            : HIP_vector_base(rocwmma::detail::template bCast<HIP_vector_base>(                 \
+                val, rocwmma::detail::Seq<RANK>{}))                                             \
+        {                                                                                       \
+        }                                                                                       \
+                                                                                                \
+        ROCWMMA_HOST_DEVICE                                                                     \
+        constexpr HIP_vector_base(const HIP_vector_base&) = default;                            \
+                                                                                                \
+        ROCWMMA_HOST_DEVICE                                                                     \
+        constexpr HIP_vector_base(HIP_vector_base&&) = default;                                 \
+                                                                                                \
+        ROCWMMA_HOST_DEVICE                                                                     \
+        ~HIP_vector_base() = default;                                                           \
+                                                                                                \
+        ROCWMMA_HOST_DEVICE                                                                     \
+        HIP_vector_base& operator=(const HIP_vector_base& x_) noexcept = default;               \
     };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/library/include/rocwmma/internal/vector_util.hpp b/library/include/rocwmma/internal/vector_util.hpp
new file mode 100644
index 00000000..abc56c4f
--- /dev/null
+++ b/library/include/rocwmma/internal/vector_util.hpp
@@ -0,0 +1,129 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_VECTOR_UTIL_HPP
+#define ROCWMMA_VECTOR_UTIL_HPP
+
+#include "types.hpp"
+#include "vector.hpp"
+
+namespace rocwmma
+{
+    //! Extracts the first (lo) half of elements from a given vector
+    /*!
+      \param v Vector to extract the lo elements from.
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto extractLo(VecT<DataT, VecSize> const& v);
+
+    //! Extracts the second (hi) half of elements from a given vector
+    /*!
+      \param v Vector to extract the hi elements from.
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto extractHi(VecT<DataT, VecSize> const& v);
+
+    //! Extracts the even elements from a given vector
+    /*!
+      \param v Vector to extract the even elements from.
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_HOST_DEVICE constexpr static inline auto extractEven(VecT<DataT, VecSize> const& v);
+
+    //! Extracts the odd elements from a given vector
+    /*!
+      \param v Vector to extract the odd elements from.
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto extractOdd(VecT<DataT, VecSize> const& v);
+
+    //! Re-orders vector elements such that even elements are concatenated with odd elements.
+    /*!
+      \param v Vector to reorder elements from.
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto reorderEvenOdd(VecT<DataT, VecSize> const& v);
+
+    //! Re-orders vector elements such that odd elements are concatenated with even elements.
+    /*!
+      \param v Vector to reorder elements from.
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto reorderOddEven(VecT<DataT, VecSize> const& v);
+
+    //! Concatenates the contents of two vectors together in order.
+    /*!
+      \param v0 First vector to concatenate
+      \param v1 Second vector to concatenate
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto concat(VecT<DataT, VecSize> const& v0,
+                                                       VecT<DataT, VecSize> const& v1);
+
+    //! Alternates selecting even elements from the first vector and odd elements from the second vector.
+    //! Analogous to a zipper.
+    //! E.g.
+    //! v0     = [0, 1, 2, 3]
+    //! v1     = [4, 5, 6, 7]
+    //! result = [0, 5, 2, 7]
+    /*!
+      \param v0 Vector from which even elements are alternately selected
+      \param v1 Vector from which odd elements are alternately selected
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto zip(VecT<DataT, VecSize> const& v0,
+                                                    VecT<DataT, VecSize> const& v1);
+
+    //! Alternates selecting the first (lo) half of elements from each vector
+    //! E.g.
+    //! v0     = [0, 1, 2, 3]
+    //! v1     = [4, 5, 6, 7]
+    //! result = [0, 4, 1, 5]
+    /*!
+      \param v0 Vector from which lo elements are alternately selected
+      \param v1 Vector from which lo elements are alternately selected
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto unpackLo(VecT<DataT, VecSize> const& v0,
+                                                         VecT<DataT, VecSize> const& v1);
+
+    //! Alternates selecting the second (hi) half of elements from each vector
+    //! E.g.
+    //! v0     = [0, 1, 2, 3]
+    //! v1     = [4, 5, 6, 7]
+    //! result = [2, 6, 3, 7]
+    /*!
+      \param v0 Vector from which hi elements are alternately selected
+      \param v1 Vector from which hi elements are alternately selected
+    */
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto unpackHi(VecT<DataT, VecSize> const& v0,
+                                                         VecT<DataT, VecSize> const& v1);
+} // namespace rocwmma
+
+#include "vector_util_impl.hpp"
+
+#endif // ROCWMMA_VECTOR_UTIL_HPP
diff --git a/library/include/rocwmma/internal/vector_util_impl.hpp b/library/include/rocwmma/internal/vector_util_impl.hpp
new file mode 100644
index 00000000..7d5d8396
--- /dev/null
+++ b/library/include/rocwmma/internal/vector_util_impl.hpp
@@ -0,0 +1,441 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_VECTOR_UTIL_IMPL_HPP
+#define ROCWMMA_VECTOR_UTIL_IMPL_HPP
+
+#include "blend.hpp"
+#include "types.hpp"
+#include "vector.hpp"
+
+namespace rocwmma
+{
+    namespace detail
+    {
+        template <uint32_t N>
+        using Number = integral_constant<int32_t, N>;
+
+        // Static for_each style generator for any vector class template of <DataT, VecSize>,
+        // either VecT or non_native_vector_base.
+        // The incoming functor is called as f(Number<I>{}, args...) once for every index I
+        // supplied by detail::Seq<VecSize>, in sequence.
+        // The values returned by those calls initialize the new VecT<DataT, VecSize>.
+        template <template <typename, uint32_t> class VecT, typename DataT, uint32_t VecSize>
+        struct vector_generator
+        {
+            static_assert(VecSize > 0, "VectorSize must be at least 1");
+
+            ROCWMMA_HOST_DEVICE constexpr vector_generator() {}
+
+            // Public entry point. F signature: F(Number<Iter>, args...)
+            template <class F, typename... ArgsT>
+            ROCWMMA_HOST_DEVICE constexpr auto operator()(F f, ArgsT&&... args) const
+            {
+                // Build the number sequence to be expanded by the private overload below.
+                return operator()(f, detail::Seq<VecSize>{}, forward<ArgsT>(args)...);
+            }
+            // Implementation overload: receives the index sequence as a deduced pack.
+        private:
+            template <class F, uint32_t... Indices, typename... ArgsT>
+            ROCWMMA_HOST_DEVICE constexpr auto
+                operator()(F f, detail::SeqT<Indices...>, ArgsT&&... args) const
+            {
+                // Call f once per index; the results brace-initialize the new vector.
+                // NOTE(review): args are forwarded again on every call; rvalue args may be reused.
+                return VecT<DataT, VecSize>{
+                    (f(Number<Indices>{}, forward<ArgsT>(args)...))...};
+            }
+        };
+    }
+    // Generator bound to the VecT vector class; see detail::vector_generator for the call contract.
+    template <typename DataT, uint32_t VecSize>
+    struct vector_generator : public detail::vector_generator<VecT, DataT, VecSize>
+    {
+    };
+    // Concatenates v0 followed by v1 into a single vector of 2 * VecSize elements.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto concat(VecT<DataT, VecSize> const& v0,
+                                                       VecT<DataT, VecSize> const& v1)
+    {
+        auto concat = [](auto&& idx, auto&& v0, auto&& v1) {
+            constexpr auto Index = decay_t<decltype(idx)>::value;
+            if constexpr(Index < VecSize) { return get<Index>(v0); }
+            else { return get<Index - VecSize>(v1); } // instantiated only when in range
+        };
+        return vector_generator<DataT, VecSize * 2u>()(concat, v0, v1);
+    }
+    // Returns a vector of VecSize / 2 elements copied from the leading (lo) elements of v.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto extractLo(VecT<DataT, VecSize> const& v)
+    {
+        if constexpr(VecSize > 1)
+        {
+            auto lo = [](auto&& idx, auto&& v) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return get<Index>(v);
+            };
+
+            return vector_generator<DataT, VecSize / 2u>()(lo, v);
+        }
+        // Self-forwarding case: a one-element vector is returned as-is
+        else
+        {
+            return v;
+        }
+    }
+    // Returns VecSize / 2 elements of v starting at VecSize / 2; for VecSize == 1 returns v.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto extractHi(VecT<DataT, VecSize> const& v)
+    {
+        if constexpr(VecSize > 1)
+        {
+            auto hi = [](auto&& idx, auto&& v) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return get<Index + VecSize / 2u>(v);
+            };
+
+            return vector_generator<DataT, VecSize / 2u>()(hi, v);
+        }
+        else
+        {
+            return v;
+        }
+    }
+
+} // namespace rocwmma
+
+#include "pack_util.hpp"
+
+namespace rocwmma
+{
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_HOST_DEVICE constexpr static inline auto extractEven(VecT<DataT, VecSize> const& v)
+    {
+        using PackUtil   = PackUtil<DataT>;
+        using PackTraits = typename PackUtil::Traits;
+        // Keeps the even-indexed elements of v; the result has VecSize / 2 elements.
+        // NOTE(review): host+device here while sibling extractOdd is device-only - confirm.
+        // Special case: sub-dword data with >= 2 packed elements, combined pairwise via Blend.
+        constexpr auto ElementSize   = sizeof(DataT);
+        constexpr auto PackedVecSize = VecSize / PackTraits::PackRatio;
+        if constexpr(ElementSize < 4u && PackedVecSize >= 2u)
+        {
+            auto evens = [](auto&& idx, auto&& v) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (ElementSize == 2u) ? Blend::ExtractWordEven::exec(get<Index * 2u>(v),
+                                                                          get<Index * 2u + 1u>(v))
+                                           : Blend::ExtractByteEven::exec(get<Index * 2u>(v),
+                                                                          get<Index * 2u + 1u>(v));
+            };
+
+            // Pack, extract (two packed inputs -> one packed output) and unpack
+            using PackedT = typename PackTraits::PackedT;
+            auto packed   = PackUtil::paddedPack(v);
+            auto result   = vector_generator<PackedT, PackedVecSize / 2u>()(evens, packed);
+            return PackUtil::template paddedUnpack<VecSize / 2u>(result);
+        }
+        // General case:
+        // Re-arrangement of dword+ data sizes isn't super costly and can
+        // be achieved with a simple static shuffle (element 2 * Index of v).
+        else if constexpr(VecSize > 1)
+        {
+            auto evens = [](auto&& idx, auto&& v) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return get<Index * 2>(v);
+            };
+
+            return vector_generator<DataT, VecSize / 2u>()(evens, v);
+        }
+        // Forwarding case: vector size is 1, the input is returned unchanged
+        else
+        {
+            return v;
+        }
+    }
+    // Keeps the odd-indexed elements of v; the result has VecSize / 2 elements.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto extractOdd(VecT<DataT, VecSize> const& v)
+    {
+        using PackUtil   = PackUtil<DataT>;
+        using PackTraits = typename PackUtil::Traits;
+
+        // Special case: Sub-dword data sizes with minimum 2 packed elements.
+        // Each pair of packed elements is combined into one by a Blend extract-odd op.
+        constexpr auto ElementSize   = sizeof(DataT);
+        constexpr auto PackedVecSize = VecSize / PackTraits::PackRatio;
+        if constexpr(ElementSize < 4u && PackedVecSize >= 2u)
+        {
+            auto odds = [](auto&& idx, auto&& v) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (ElementSize == 2u) ? Blend::ExtractWordOdd::exec(get<Index * 2u>(v),
+                                                                         get<Index * 2u + 1u>(v))
+                                           : Blend::ExtractByteOdd::exec(get<Index * 2u>(v),
+                                                                         get<Index * 2u + 1u>(v));
+            };
+
+            // Pack, extract (two packed inputs -> one packed output) and unpack
+            using PackedT = typename PackTraits::PackedT;
+            auto packed   = PackUtil::paddedPack(v);
+            auto result   = vector_generator<PackedT, PackedVecSize / 2u>()(odds, packed);
+            return PackUtil::template paddedUnpack<VecSize / 2u>(result);
+        }
+        // General case:
+        // Re-arrangement of dword+ data sizes isn't super costly and can
+        // be achieved with a simple static shuffle (element 2 * Index + 1 of v).
+        else if constexpr(VecSize > 1)
+        {
+            auto odds = [](auto&& idx, auto&& v) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return get<Index * 2 + 1>(v);
+            };
+
+            return vector_generator<DataT, VecSize / 2u>()(odds, v);
+        }
+        // Forwarding case: vector size is 1, the input is returned unchanged
+        else
+        {
+            return v;
+        }
+    }
+    // Reorders v so that its even-indexed elements come first, followed by the odd-indexed ones.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto reorderEvenOdd(VecT<DataT, VecSize> const& v)
+    {
+        using PackUtil   = PackUtil<DataT>;
+        using PackTraits = typename PackUtil::Traits;
+
+        // Special case: Sub-dword data sizes, exactly one packed element.
+        // Even elements followed by odd elements are produced by a single Blend op.
+        constexpr auto ElementSize   = sizeof(DataT);
+        constexpr auto PackedVecSize = VecSize / PackTraits::PackRatio;
+        if constexpr(ElementSize < 4u && PackedVecSize == 1)
+        {
+            auto evenOdds = [](auto&& idx, auto&& v) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (ElementSize == 2u)
+                           ? Blend::ExtractWordEvenOdd::exec(get<Index>(v), get<Index>(v))
+                           : Blend::ExtractByteEvenOdd::exec(get<Index>(v), get<Index>(v));
+            };
+
+            // Pack, extract (same packed element used for both inputs) and unpack
+            using PackedT = typename PackTraits::PackedT;
+            auto packed   = PackUtil::paddedPack(v);
+            auto result   = vector_generator<PackedT, PackedVecSize>()(evenOdds, packed);
+            return PackUtil::template paddedUnpack<VecSize>(result);
+        }
+        // General case: Concatenate evens followed by odds
+        else if constexpr(VecSize > 1)
+        {
+            return concat(extractEven(v), extractOdd(v));
+        }
+        // Forwarding case: return self
+        else
+        {
+            return v;
+        }
+    }
+    // Reorders v so that its odd-indexed elements come first, followed by the even-indexed ones.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto reorderOddEven(VecT<DataT, VecSize> const& v)
+    {
+        using PackUtil   = PackUtil<DataT>;
+        using PackTraits = typename PackUtil::Traits;
+
+        // Special case: Sub-dword data sizes, at most one packed element.
+        // The reorder is performed on the packed data with Blend ops.
+        constexpr auto ElementSize   = sizeof(DataT);
+        constexpr auto PackedVecSize = VecSize / PackTraits::PackRatio;
+        if constexpr(ElementSize < 4u && PackedVecSize <= 1)
+        {
+            using PackedT = typename PackTraits::PackedT;
+
+            // Exactly one packed element
+            if constexpr(PackedVecSize == 1)
+            {
+                auto oddEvens = [](auto&& idx, auto&& v) {
+                    constexpr auto Index = decay_t<decltype(idx)>::value;
+                    return (ElementSize == 2u)
+                               ? Blend::ExtractWordOddEven::exec(get<Index>(v), get<Index>(v))
+                               : Blend::ExtractByteOddEven::exec(get<Index>(v), get<Index>(v));
+                };
+
+                // Pack, extract (same packed element used for both inputs) and unpack
+                auto packed = PackUtil::paddedPack(v);
+                auto result = vector_generator<PackedT, PackedVecSize>()(oddEvens, packed);
+                return PackUtil::template paddedUnpack<VecSize>(result);
+            }
+            // Corner case: two single-byte elements (less than one packed element), swap them
+            else if constexpr(ElementSize == 1 && VecSize == 2)
+            {
+                auto oddEvens = [](auto&& idx, auto&& v) {
+                    // Manually swap bytes with the <1, 0, 3, 2> byte permutation
+                    using SwapBytes = Blend::Driver<BlendImpl::Ops::PermByte<1u, 0u, 3u, 2u>>;
+
+                    constexpr auto Index = decay_t<decltype(idx)>::value;
+                    return SwapBytes::exec(get<Index>(v), get<Index>(v));
+                };
+
+                // Pack (padded up to one packed element), swap and unpack
+                auto packed = PackUtil::paddedPack(v);
+                auto result = vector_generator<PackedT, 1u>()(oddEvens, packed);
+                return PackUtil::template paddedUnpack<VecSize>(result);
+            }
+            // Remaining sizes (ElementSize = 1 or 2 with VecSize = 1): return the input as-is
+            else
+            {
+                return v;
+            }
+        }
+        // General case: Concatenate odds followed by evens
+        else if constexpr(VecSize > 1)
+        {
+            return concat(extractOdd(v), extractEven(v));
+        }
+        // Forwarding case: return self
+        else
+        {
+            return v;
+        }
+    }
+    // Builds a vector that takes even-indexed elements from v0 and odd-indexed elements from v1.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto zip(VecT<DataT, VecSize> const& v0,
+                                                    VecT<DataT, VecSize> const& v1)
+    {
+        using PackUtil   = PackUtil<DataT>;
+        using PackTraits = typename PackUtil::Traits;
+
+        // Special case: Sub-dword data sizes
+        // Matching packed elements of v0 and v1 are zipped with a Blend op.
+        constexpr auto ElementSize   = sizeof(DataT);
+        constexpr auto PackedVecSize = max(VecSize / PackTraits::PackRatio, 1u);
+        if constexpr(ElementSize < 4u)
+        {
+            auto zip = [](auto&& idx, auto&& v0, auto&& v1) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (ElementSize == 2u) ? Blend::ZipWord::exec(get<Index>(v0), get<Index>(v1))
+                                           : Blend::ZipByte::exec(get<Index>(v0), get<Index>(v1));
+            };
+
+            // Pack both inputs, zip and unpack
+            using PackedT = typename PackTraits::PackedT;
+            auto packed0  = PackUtil::paddedPack(v0);
+            auto packed1  = PackUtil::paddedPack(v1);
+            auto result   = vector_generator<PackedT, PackedVecSize>()(zip, packed0, packed1);
+            return PackUtil::template paddedUnpack<VecSize>(result);
+        }
+        else
+        {
+            auto zip = [](auto&& idx, auto&& v0, auto&& v1) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (Index % 2u == 0u) ? get<Index>(v0) : get<Index>(v1);
+            };
+
+            return vector_generator<DataT, VecSize>()(zip, v0, v1);
+        }
+    }
+    // Interleaves the lo halves of v0 and v1: v0 feeds even result slots, v1 the odd ones.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto unpackLo(VecT<DataT, VecSize> const& v0,
+                                                         VecT<DataT, VecSize> const& v1)
+    {
+        using PackUtil   = PackUtil<DataT>;
+        using PackTraits = typename PackUtil::Traits;
+
+        // Special case: Sub-dword data sizes. The unpack op is applied per packed element.
+        // NOTE(review): for PackedVecSize > 1 this may not match the dword path - confirm.
+        constexpr auto ElementSize   = sizeof(DataT);
+        constexpr auto PackedVecSize = max(VecSize / PackTraits::PackRatio, 1u);
+        if constexpr(ElementSize < 4u)
+        {
+            auto unpackLo = [](auto&& idx, auto&& v0, auto&& v1) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (ElementSize == 2u)
+                           ? Blend::UnpackWordLo::exec(get<Index>(v0), get<Index>(v1))
+                           : Blend::UnpackByteLo::exec(get<Index>(v0), get<Index>(v1));
+            };
+
+            // Pack both inputs, unpack-lo and unpack to elements
+            using PackedT = typename PackTraits::PackedT;
+            auto packed0  = PackUtil::paddedPack(v0);
+            auto packed1  = PackUtil::paddedPack(v1);
+            auto result   = vector_generator<PackedT, PackedVecSize>()(unpackLo, packed0, packed1);
+            return PackUtil::template paddedUnpack<VecSize>(result);
+        }
+        else
+        {
+            auto unpackLo = [](auto&& idx, auto&& v0, auto&& v1) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (Index % 2u == 0u) ? get<Index / 2u>(v0) : get<Index / 2u>(v1);
+            };
+
+            return vector_generator<DataT, VecSize>()(unpackLo, v0, v1);
+        }
+    }
+    // Interleaves the hi halves of v0 and v1: v0 feeds even result slots, v1 the odd ones.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE constexpr static inline auto unpackHi(VecT<DataT, VecSize> const& v0,
+                                                         VecT<DataT, VecSize> const& v1)
+    {
+        using PackUtil   = PackUtil<DataT>;
+        using PackTraits = typename PackUtil::Traits;
+
+        // Special case: Sub-dword data sizes. The unpack op is applied per packed element.
+        // NOTE(review): for PackedVecSize > 1 or padded packs this may differ from the dword path.
+        constexpr auto ElementSize   = sizeof(DataT);
+        constexpr auto PackedVecSize = max(VecSize / PackTraits::PackRatio, 1u);
+        if constexpr(ElementSize < 4u)
+        {
+            auto unpackHi = [](auto&& idx, auto&& v0, auto&& v1) {
+                constexpr auto Index = decay_t<decltype(idx)>::value;
+                return (ElementSize == 2u)
+                           ? Blend::UnpackWordHi::exec(get<Index>(v0), get<Index>(v1))
+                           : Blend::UnpackByteHi::exec(get<Index>(v0), get<Index>(v1));
+            };
+
+            // Pack both inputs, unpack-hi and unpack to elements
+            using PackedT = typename PackTraits::PackedT;
+            auto packed0  = PackUtil::paddedPack(v0);
+            auto packed1  = PackUtil::paddedPack(v1);
+            auto result   = vector_generator<PackedT, PackedVecSize>()(unpackHi, packed0, packed1);
+            return PackUtil::template paddedUnpack<VecSize>(result);
+        }
+        else
+        {
+            auto unpackHi = [](auto&& idx, auto&& v0, auto&& v1) {
+                constexpr auto startIdx = VecSize / 2u;
+                constexpr auto Index    = decay_t<decltype(idx)>::value;
+                return (Index % 2u == 0u) ? get<startIdx + Index / 2u>(v0)
+                                          : get<startIdx + Index / 2u>(v1);
+            };
+
+            return vector_generator<DataT, VecSize>()(unpackHi, v0, v1);
+        }
+    }
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_VECTOR_UTIL_IMPL_HPP
diff --git a/library/include/rocwmma/internal/wmma.hpp b/library/include/rocwmma/internal/wmma.hpp
index f5d0f1aa..ed80f890 100644
--- a/library/include/rocwmma/internal/wmma.hpp
+++ b/library/include/rocwmma/internal/wmma.hpp
@@ -70,18 +70,18 @@ namespace rocwmma
         BlockM,
         BlockN,
         BlockK,
-        typename std::enable_if<
-            ((std::is_same<InputT, float16_t>::value && std::is_same<ComputeT, float16_t>::value)
-             || (std::is_same<InputT, float16_t>::value && std::is_same<ComputeT, float32_t>::value)
-             || (std::is_same<InputT, hfloat16_t>::value
-                 && std::is_same<ComputeT, hfloat16_t>::value)
-             || (std::is_same<InputT, hfloat16_t>::value
-                 && std::is_same<ComputeT, float32_t>::value)
-             || (std::is_same<InputT, bfloat16_t>::value
-                 && std::is_same<ComputeT, bfloat16_t>::value)
-             || (std::is_same<InputT, bfloat16_t>::value
-                 && std::is_same<ComputeT, float32_t>::value)
-             || (std::is_same<InputT, int8_t>::value && std::is_same<ComputeT, int32_t>::value))
+        typename enable_if<
+            ((is_same<InputT, float16_t>::value && is_same<ComputeT, float16_t>::value)
+             || (is_same<InputT, float16_t>::value && is_same<ComputeT, float32_t>::value)
+             || (is_same<InputT, hfloat16_t>::value
+                 && is_same<ComputeT, hfloat16_t>::value)
+             || (is_same<InputT, hfloat16_t>::value
+                 && is_same<ComputeT, float32_t>::value)
+             || (is_same<InputT, bfloat16_t>::value
+                 && is_same<ComputeT, bfloat16_t>::value)
+             || (is_same<InputT, bfloat16_t>::value
+                 && is_same<ComputeT, float32_t>::value)
+             || (is_same<InputT, int8_t>::value && is_same<ComputeT, int32_t>::value))
             && (BlockM == 16) && (BlockN == 16) && (BlockK >= 16) // 16 block size only
             >::type>
     {
diff --git a/library/include/rocwmma/rocwmma_coop_impl.hpp b/library/include/rocwmma/rocwmma_coop_impl.hpp
index c7f6789c..a8eff041 100644
--- a/library/include/rocwmma/rocwmma_coop_impl.hpp
+++ b/library/include/rocwmma/rocwmma_coop_impl.hpp
@@ -66,16 +66,16 @@ namespace rocwmma
                               uint32_t waveCount)
     {
 
-        using FragT  = typename std::decay_t<decltype(frag)>;
+        using FragT  = decay_t<decltype(frag)>;
         using Loader = typename GetCoopIOConfig_t<FragT>::Loader;
 
         // Sanity checks
-        static_assert(!std::is_same<DataLayout, void>::value,
+        static_assert(!is_same<DataLayout, void>::value,
                       "Must provide layout information. Either statically assign data layout in "
                       "fragment declaration or use the run-time function overload.");
 
         static_assert(
-            std::is_same<typename FragT::Traits::AccessT, typename Loader::Traits::OutputT>::value,
+            is_same<typename FragT::Traits::AccessT, typename Loader::Traits::OutputT>::value,
             "Fragment access and coop load output types do not match");
 
         // Load and implicit pack
@@ -95,7 +95,7 @@ namespace rocwmma
                               const DataT*                                                  data,
                               uint32_t                                                      ldm)
     {
-        using FragT       = typename std::decay_t<decltype(frag)>;
+        using FragT       = decay_t<decltype(frag)>;
         using MappingUtil = GetMappingUtil_t<FragT>;
 
         // Default: all waves participate in 'row major' order
@@ -138,16 +138,16 @@ namespace rocwmma
                               uint32_t                                                      ldm,
                               uint32_t waveIndex)
     {
-        using FragT  = typename std::decay_t<decltype(frag)>;
+        using FragT  = decay_t<decltype(frag)>;
         using Loader = typename GetCoopIOConfig_t<FragT, WaveCount>::Loader;
 
         // Sanity checks
-        static_assert(!std::is_same<DataLayout, void>::value,
+        static_assert(!is_same<DataLayout, void>::value,
                       "Must provide layout information. Either statically assign data layout in "
                       "fragment declaration or use the run-time function overload.");
 
         static_assert(
-            std::is_same<typename FragT::Traits::AccessT, typename Loader::Traits::OutputT>::value,
+            is_same<typename FragT::Traits::AccessT, typename Loader::Traits::OutputT>::value,
             "Fragment access and coop load output types do not match");
 
         // Load and implicit pack
@@ -187,16 +187,16 @@ namespace rocwmma
         uint32_t                                                            waveIndex,
         uint32_t                                                            waveCount)
     {
-        using FragT  = typename std::decay_t<decltype(frag)>;
+        using FragT  = decay_t<decltype(frag)>;
         using Storer = typename GetCoopIOConfig_t<FragT>::Storer;
 
         // Sanity checks
-        static_assert(!std::is_same<DataLayout, void>::value,
+        static_assert(!is_same<DataLayout, void>::value,
                       "Must provide data layout. Either statically assign data layout in "
                       "fragment declaration or use the run-time function overload.");
 
         static_assert(
-            std::is_same<typename FragT::Traits::AccessT, typename Storer::Traits::InputT>::value,
+            is_same<typename FragT::Traits::AccessT, typename Storer::Traits::InputT>::value,
             "Fragment access and coop store input types do not match");
 
         // Implicit unpack and store
@@ -216,7 +216,7 @@ namespace rocwmma
         fragment<MatrixT, BlockM, BlockN, BlockK, DataT, DataLayout> const& frag,
         uint32_t                                                            ldm)
     {
-        using FragT       = typename std::decay<decltype(frag)>::type;
+        using FragT       = decay_t<decltype(frag)>;
         using MappingUtil = GetMappingUtil_t<FragT>;
 
         // Default: all waves participate in 'row major' order
@@ -262,16 +262,16 @@ namespace rocwmma
         uint32_t                                                            waveIndex)
     {
 
-        using FragT  = typename std::decay_t<decltype(frag)>;
+        using FragT  = decay_t<decltype(frag)>;
         using Storer = typename GetCoopIOConfig_t<FragT, WaveCount>::Storer;
 
         // Sanity checks
-        static_assert(!std::is_same<DataLayout, void>::value,
+        static_assert(!is_same<DataLayout, void>::value,
                       "Must provide data layout. Either statically assign data layout in "
                       "fragment declaration or use the run-time function overload.");
 
         static_assert(
-            std::is_same<typename FragT::Traits::AccessT, typename Storer::Traits::InputT>::value,
+            is_same<typename FragT::Traits::AccessT, typename Storer::Traits::InputT>::value,
             "Fragment access and coop stor input types do not match");
 
         // Implicit unpack and store
diff --git a/library/include/rocwmma/rocwmma_impl.hpp b/library/include/rocwmma/rocwmma_impl.hpp
index 21939782..ddabcb45 100644
--- a/library/include/rocwmma/rocwmma_impl.hpp
+++ b/library/include/rocwmma/rocwmma_impl.hpp
@@ -52,6 +52,7 @@
 #include "internal/utils.hpp"
 #include "internal/vector.hpp"
 #include "internal/vector_iterator.hpp"
+#include "internal/vector_util.hpp"
 #include "internal/wmma.hpp"
 
 namespace rocwmma
@@ -214,11 +215,11 @@ namespace rocwmma
         fill_fragment(fragment<MatrixT, BlockM, BlockN, BlockK, DataT, DataLayout>& frag,
                       DataT                                                         value)
     {
-        using FragT       = typename std::decay_t<decltype(frag)>;
+        using FragT       = decay_t<decltype(frag)>;
         using Broadcaster = typename GetIOConfig_t<FragT>::Broadcaster;
 
         // Sanity check
-        static_assert(std::is_same<typename Broadcaster::Traits::BroadcastT,
+        static_assert(is_same<typename Broadcaster::Traits::BroadcastT,
                                    typename FragT::Traits::AccessT>::value,
                       "Broadcast input and fragment access types do not match");
 
@@ -236,16 +237,16 @@ namespace rocwmma
                          const DataT*                                                  data,
                          uint32_t                                                      ldm)
     {
-        using FragT  = typename std::decay_t<decltype(frag)>;
+        using FragT  = decay_t<decltype(frag)>;
         using Loader = typename GetIOConfig_t<FragT>::Loader;
 
         // Sanity checks
-        static_assert(!std::is_same<DataLayout, void>::value,
+        static_assert(!is_same<DataLayout, void>::value,
                       "Must provide layout information. Either statically assign data layout in "
                       "fragment declaration or use the run-time function overload.");
 
         static_assert(
-            std::is_same<typename FragT::Traits::AccessT, typename Loader::Traits::OutputT>::value,
+            is_same<typename FragT::Traits::AccessT, typename Loader::Traits::OutputT>::value,
             "Fragment access and load output types do not match");
 
         // Load then implicit pack
@@ -283,16 +284,16 @@ namespace rocwmma
                           fragment<MatrixT, BlockM, BlockN, BlockK, DataT, DataLayout> const& frag,
                           uint32_t                                                            ldm)
     {
-        using FragT  = typename std::decay_t<decltype(frag)>;
+        using FragT  = decay_t<decltype(frag)>;
         using Storer = typename GetIOConfig_t<FragT>::Storer;
 
         // Sanity check
-        static_assert(!std::is_same<DataLayout, void>::value,
+        static_assert(!is_same<DataLayout, void>::value,
                       "Must provide data layout. Either statically assign data layout in "
                       "fragment declaration or use the run-time function overload.");
 
         static_assert(
-            std::is_same<typename FragT::Traits::AccessT, typename Storer::Traits::InputT>::value,
+            is_same<typename FragT::Traits::AccessT, typename Storer::Traits::InputT>::value,
             "Fragment access and store input types do not match");
 
         // Implicit unpack and then store
@@ -335,13 +336,13 @@ namespace rocwmma
                  fragment<matrix_b, BlockM, BlockN, BlockK, InputT, LayoutB> const&      b,
                  fragment<accumulator, BlockM, BlockN, BlockK, ComputeT, LayoutC> const& c)
     {
-        using FragA = typename std::decay_t<decltype(a)>;
-        using FragB = typename std::decay_t<decltype(b)>;
+        using FragA = decay_t<decltype(a)>;
+        using FragB = decay_t<decltype(b)>;
 
         // Sanity check
         // static_assert(detail::MfmaCheck<FragA, FragB>::value,
         //              "A and B fragment layouts must be orthogonal");
-        using MMA = typename std::conditional_t<ROCWMMA_ARCH_GFX9,
+        using MMA = conditional_t<ROCWMMA_ARCH_GFX9,
                                                 Mfma<InputT, ComputeT, BlockM, BlockN, BlockK>,
                                                 Wmma<InputT, ComputeT, BlockM, BlockN, BlockK>>;
 
diff --git a/library/include/rocwmma/rocwmma_transforms_impl.hpp b/library/include/rocwmma/rocwmma_transforms_impl.hpp
index 8979f508..10695774 100644
--- a/library/include/rocwmma/rocwmma_transforms_impl.hpp
+++ b/library/include/rocwmma/rocwmma_transforms_impl.hpp
@@ -35,7 +35,7 @@ namespace rocwmma
         ///
         template <typename LhsFrag, typename RhsFrag>
         struct ConsistencyCheck
-            : public std::conditional_t<
+            : public conditional_t<
                   MatrixLayout::detail::ConsistencyCheck<
                       typename GetIOConfig_t<LhsFrag>::IOLayout::MatrixLayout,
                       typename GetIOConfig_t<RhsFrag>::IOLayout::MatrixLayout>::value
@@ -45,14 +45,14 @@ namespace rocwmma
                       && MatrixLayout::detail::ConsistencyCheck<
                           typename GetCoopIOConfig_t<LhsFrag, 4u>::IOLayout::MatrixLayout,
                           typename GetCoopIOConfig_t<RhsFrag, 4u>::IOLayout::MatrixLayout>::value,
-                  std::true_type,
-                  std::false_type>
+                  true_type,
+                  false_type>
         {
         };
 
         template <typename LhsFrag, typename RhsFrag>
         struct OrthogonalCheck
-            : public std::conditional_t<
+            : public conditional_t<
                   MatrixLayout::detail::OrthogonalCheck<
                       typename GetIOConfig_t<LhsFrag>::IOLayout::MatrixLayout,
                       typename GetIOConfig_t<RhsFrag>::IOLayout::MatrixLayout>::value
@@ -62,8 +62,8 @@ namespace rocwmma
                       && MatrixLayout::detail::OrthogonalCheck<
                           typename GetCoopIOConfig_t<LhsFrag, 4u>::IOLayout::MatrixLayout,
                           typename GetCoopIOConfig_t<RhsFrag, 4u>::IOLayout::MatrixLayout>::value,
-                  std::true_type,
-                  std::false_type>
+                  true_type,
+                  false_type>
         {
         };
 
@@ -213,15 +213,15 @@ namespace rocwmma
     template <typename FragT>
     ROCWMMA_DEVICE static inline decltype(auto) applyTranspose(FragT&& frag)
     {
-        return detail::template ApplyTranspose<std::decay_t<FragT>>::exec(
-            std::forward<FragT>(frag));
+        return detail::template ApplyTranspose<decay_t<FragT>>::exec(
+            forward<FragT>(frag));
     }
 
     template <typename DataLayoutT, typename FragT>
     ROCWMMA_DEVICE static inline decltype(auto) applyDataLayout(FragT&& frag)
     {
-        return detail::template ApplyDataLayout<std::decay_t<FragT>, DataLayoutT>::exec(
-            std::forward<FragT>(frag));
+        return detail::template ApplyDataLayout<decay_t<FragT>, DataLayoutT>::exec(
+            forward<FragT>(frag));
     }
 
 } // namespace rocwmma
diff --git a/test/dlrm/device/common.hpp b/test/dlrm/device/common.hpp
index a05f6832..daa7710a 100644
--- a/test/dlrm/device/common.hpp
+++ b/test/dlrm/device/common.hpp
@@ -33,13 +33,13 @@ namespace rocwmma
 {
 
 #if !ROCWMMA_TESTS_NO_HALF
-    __device__ inline bool is_same(half a, half b)
+    __device__ inline bool is_equal(half a, half b)
     {
         return __heq(a, b);
     }
 #endif // !ROCWMMA_NO_HALF
 
-    __device__ inline bool is_same(float a, float b)
+    __device__ inline bool is_equal(float a, float b)
     {
         return a == b;
     }
@@ -77,7 +77,7 @@ namespace rocwmma
         size_t end      = (num_elm * (tid + 1)) / nthreads;
         for(size_t i = start; i < end; i++)
         {
-            if(!is_same(a[i], b[i]))
+            if(!is_equal(a[i], b[i]))
             {
                 float a_    = (float)a[i];
                 float b_    = (float)b[i];
diff --git a/test/gemm/gemm_PGR1_LB2_MP0_MB_CP/test/ad_hoc_test.cpp b/test/gemm/gemm_PGR1_LB2_MP0_MB_CP/test/ad_hoc_test.cpp
index 2f01fed2..7faf1916 100644
--- a/test/gemm/gemm_PGR1_LB2_MP0_MB_CP/test/ad_hoc_test.cpp
+++ b/test/gemm/gemm_PGR1_LB2_MP0_MB_CP/test/ad_hoc_test.cpp
@@ -49,7 +49,7 @@ namespace rocwmma
         // Types: ALL + double
         // Block Sizes: 16 x 16 x BlockK
         // Layouts: NT
-        using Types      = std::tuple<std::tuple<float16_t, float32_t, float32_t>>;
+        using Types      = std::tuple<std::tuple<bfloat16_t, float32_t, float32_t>>;
         using BlockSizes = std::tuple<std::tuple<I<16>, I<16>, I<16>>>;
         using Layouts    = std::tuple<
             std::tuple<col_major, row_major, row_major>>; //typename Base::TestLayoutsNT;
@@ -90,12 +90,12 @@ namespace rocwmma
                 //{64, 64, 1024},
                 //         {32, 64, 1024},
                 // {64, 32, 1024},
-                // {256, 256, 1024},
+                {256, 256, 1024},
                 //{1024, 1024, 1024},
                 //{64, 64, 64},
                 //{128, 128, 128},
                 //{2048, 2048, 2048},
-                {4096, 4096, 4096},
+                // {4096, 4096, 4096},
                 //{8192, 8192, 8192}
 
             };
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 7ee55fce..71409fe9 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -57,6 +57,8 @@ add_subdirectory(load_store_matrix_coop_sync_test)
 add_subdirectory(fill_fragment_test)
 add_subdirectory(vector_iterator_test)
 add_subdirectory(vector_test)
+add_subdirectory(vector_util_test)
+add_subdirectory(pack_util_test)
 add_subdirectory(io_traits_test)
 add_subdirectory(cross_lane_ops_test)
 add_subdirectory(io_shape_test)
diff --git a/test/unit/pack_util_test/CMakeLists.txt b/test/unit/pack_util_test/CMakeLists.txt
new file mode 100644
index 00000000..a835c73d
--- /dev/null
+++ b/test/unit/pack_util_test/CMakeLists.txt
@@ -0,0 +1,34 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+
+# Put this test's own directory ahead of the inherited test include paths
+set(ROCWMMA_TEST_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${ROCWMMA_TEST_INCLUDE_DIRS})
+
+# Sources: the common unit-test sources plus this test's single translation unit
+set(PackUtilTestSources ${UnitCommonSources}
+                        ${CMAKE_CURRENT_SOURCE_DIR}/test/pack_util.cpp)
+
+add_rocwmma_unit_test(pack_util_test ${PackUtilTestSources})
diff --git a/test/unit/pack_util_test/detail/pack_util.hpp b/test/unit/pack_util_test/detail/pack_util.hpp
new file mode 100644
index 00000000..395ce881
--- /dev/null
+++ b/test/unit/pack_util_test/detail/pack_util.hpp
@@ -0,0 +1,106 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_DETAIL_PACK_UTIL_TEST_HPP
+#define ROCWMMA_DETAIL_PACK_UTIL_TEST_HPP
+
+#include "device/pack_util.hpp"
+#include "unit_kernel_base.hpp"
+
+namespace rocwmma
+{
+
+    /// Host-side harness for the pack-util device test kernel.
+    /// BlockM, BlockN and the data layout passed to the base are redundant for this test.
+    template <uint32_t VecSize, typename DataT>
+    struct PackUtilKernel final : public UnitKernelBase<1, 1, DataT, col_major>
+    {
+    private:
+        using Base = UnitKernelBase<1, 1, DataT, col_major>;
+
+    public:
+        PackUtilKernel()        = default;
+        ~PackUtilKernel() final = default;
+
+        /// Resizes storage for the problem, then pre-loads the first output element with
+        /// ERROR_VALUE on host and device so a result the kernel never writes reads as failure.
+        void setupImpl(typename Base::DataStorage::ProblemSize const& probsize) final
+        {
+            auto& storage = Base::DataStorage::instance();
+            storage->resizeStorage(probsize);
+
+            storage->hostOut().get()[0] = static_cast<DataT>(ERROR_VALUE);
+            storage->copyData(storage->deviceOut(), storage->hostOut(), 1);
+        }
+
+        /// Copies the single result element back from the device and records a pass only if
+        /// it equals SUCCESS_VALUE.
+        void validateResultsImpl() final
+        {
+            auto& storage = Base::DataStorage::instance();
+
+            // Refresh the host copy of the kernel's one-element verdict
+            storage->copyData(storage->hostOut(), storage->deviceOut(), 1);
+
+            Base::mValidationResult = (storage->hostOut().get()[0] == DataT(SUCCESS_VALUE));
+        }
+
+        /// Returns the device entry point instantiated for this DataT and VecSize.
+        typename Base::KernelFunc kernelImpl() const final
+        {
+            return typename Base::KernelFunc(packUtilTest<DataT, VecSize>);
+        }
+    };
+
+    /// GeneratorImpl: maps a tuple of compile-time test parameters to a PackUtilKernel.
+    struct PackUtilGenerator
+    {
+        // Positions of each parameter within the test-parameter tuple
+        enum : uint32_t
+        {
+            VecSize = 0,
+            DataT   = 1,
+        };
+
+        using ResultT = std::shared_ptr<KernelI>;
+
+        /// Builds the kernel selected by the tuple's element types; the tuple's value
+        /// itself is never read.
+        template <typename... Ts>
+        static ResultT generate(std::tuple<Ts...> testParams)
+        {
+            using TestParamsT = std::tuple<Ts...>;
+            using VecSizeT    = std::tuple_element_t<VecSize, TestParamsT>;
+            using DataTypeT   = std::tuple_element_t<DataT, TestParamsT>;
+            using KernelT     = PackUtilKernel<VecSizeT::value, DataTypeT>;
+
+            return std::make_shared<KernelT>();
+        }
+    };
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_DETAIL_PACK_UTIL_TEST_HPP
diff --git a/test/unit/pack_util_test/device/pack_util.hpp b/test/unit/pack_util_test/device/pack_util.hpp
new file mode 100644
index 00000000..18354586
--- /dev/null
+++ b/test/unit/pack_util_test/device/pack_util.hpp
@@ -0,0 +1,103 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_DEVICE_PACK_UTIL_TEST_HPP
+#define ROCWMMA_DEVICE_PACK_UTIL_TEST_HPP
+
+#include <rocwmma/rocwmma.hpp>
+
+static constexpr uint32_t ERROR_VALUE   = 7u;
+static constexpr uint32_t SUCCESS_VALUE = 0u;
+
+namespace rocwmma
+{
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline DataT get(VecT<DataT, VecSize> const& v, uint32_t idx)
+    {
+        return v.data[idx]; // runtime-indexed read of element idx; no bounds check is performed
+    }
+
+    /// Builds a vector whose element at each generated index is that index converted to DataT.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline auto generateSeqVec()
+    {
+        // The generator hands each position over as a type exposing a static ::value
+        auto indexAsData = [](auto&& idx) {
+            return static_cast<DataT>(std::decay_t<decltype(idx)>::value);
+        };
+        return vector_generator<DataT, VecSize>()(indexAsData);
+    }
+
+    /// Basic check: the sequential vector must hold i at every position i. This does not
+    /// call PackUtil itself. Returns true on any mismatch, false otherwise.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool packUtilTestBasic()
+    {
+        auto seq = generateSeqVec<DataT, VecSize>();
+
+        bool mismatch = false;
+        for(uint32_t pos = 0u; pos < VecSize; ++pos)
+        {
+            mismatch |= (get(seq, pos) != static_cast<DataT>(pos));
+        }
+        return mismatch;
+    }
+
+    /// Device entry point: every thread runs the checks and adds its failure flag to a
+    /// __shared__ counter; thread (0,0,0) of block (0,0,0) then stores the verdict in out[0].
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_KERNEL void packUtilTest(uint32_t     m,
+                                     uint32_t     n,
+                                     DataT const* in,
+                                     DataT*       out,
+                                     uint32_t     ld,
+                                     DataT        param1,
+                                     DataT        param2)
+    {
+        __shared__ int32_t failCount;
+        failCount = 0;
+        synchronize_workgroup();
+
+        // Add tests here; || short-circuits, so tests after the first failure are skipped
+        bool failed = false;
+        failed      = failed || packUtilTestBasic<DataT, VecSize>();
+
+        atomicAdd(&failCount, static_cast<int32_t>(failed));
+
+        // Every contribution must land before the counter is read
+        synchronize_workgroup();
+
+        bool const isFirstThread = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
+        bool const isFirstBlock  = (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0);
+        if(isFirstThread && isFirstBlock)
+        {
+            out[0] = static_cast<DataT>(failCount == 0 ? SUCCESS_VALUE : ERROR_VALUE);
+        }
+    }
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_DEVICE_PACK_UTIL_TEST_HPP
diff --git a/test/unit/pack_util_test/test/pack_util.cpp b/test/unit/pack_util_test/test/pack_util.cpp
new file mode 100644
index 00000000..33b6fe3e
--- /dev/null
+++ b/test/unit/pack_util_test/test/pack_util.cpp
@@ -0,0 +1,99 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <tuple>
+#include <type_traits>
+
+#include "detail/pack_util.hpp"
+#include "kernel_generator.hpp"
+#include "unit_test.hpp"
+
+namespace rocwmma
+{
+
+    struct TestParams : public UnitTestParams
+    {
+        using Base = UnitTestParams;
+
+        // Types: the Base::TestTypes16 list
+        using Types = typename Base::TestTypes16;
+
+        // Vector Sizes.
+        // Test up to VecSize = 64. Anything bigger is impractical.
+        using VecSizes = std::tuple<I<1>, I<2>, I<4>, I<8>, I<16>, I<32>, I<64>>;
+
+        using KernelParams = typename CombineLists<VecSizes, Types>::Result;
+
+        // Assemble the kernel generator
+        // Kernel: PackUtil
+        using GeneratorImpl   = PackUtilGenerator;
+        using KernelGenerator = KernelGenerator<KernelParams, GeneratorImpl>;
+
+        // Sanity check: the generator's result type must be the kernel type the test base expects
+        static_assert(std::is_same<typename GeneratorImpl::ResultT, typename Base::KernelT>::value,
+                      "Kernels from this generator do not match testing interface");
+
+        static inline std::vector<ThreadBlockT> threadBlocks()
+        {
+            auto warpSize = HipDevice::instance()->warpSize();
+            // clang-format off
+            return { {warpSize, 1} }; // single configuration: warpSize x 1
+            // clang-format on
+        }
+
+        static inline std::vector<ProblemSizeT> problemSizes()
+        {
+            // clang-format off
+            return { {1, 1} }; // single 1 x 1 problem; the kernel only writes out[0]
+            // clang-format on
+        }
+
+        static inline typename KernelGenerator::ResultT kernels()
+        {
+            return KernelGenerator::generate();
+        }
+    };
+
+} // namespace rocwmma
+
+// Dedicated fixture type so this suite's parameterization is registered under its own name
+class PackUtilTest : public rocwmma::UnitTest
+{
+};
+
+TEST_P(PackUtilTest, RunKernel)
+{
+    this->RunKernel();
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    KernelTests,
+    PackUtilTest,
+    ::testing::Combine(::testing::ValuesIn(rocwmma::TestParams::kernels()),
+                       ::testing::ValuesIn(rocwmma::TestParams::threadBlocks()),
+                       ::testing::ValuesIn(rocwmma::TestParams::problemSizes()),
+                       ::testing::ValuesIn(rocwmma::TestParams::param1s()), // not defined in TestParams;
+                       ::testing::ValuesIn(rocwmma::TestParams::param2s()))); // presumably from UnitTestParams
diff --git a/test/unit/vector_util_test/CMakeLists.txt b/test/unit/vector_util_test/CMakeLists.txt
new file mode 100644
index 00000000..9f019a44
--- /dev/null
+++ b/test/unit/vector_util_test/CMakeLists.txt
@@ -0,0 +1,34 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+
+# Put this test's own directory ahead of the inherited test include paths
+set(ROCWMMA_TEST_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} ${ROCWMMA_TEST_INCLUDE_DIRS})
+
+# Sources: the common unit-test sources plus this test's single translation unit
+set(VectorUtilTestSources ${UnitCommonSources}
+                          ${CMAKE_CURRENT_SOURCE_DIR}/test/vector_util.cpp)
+
+add_rocwmma_unit_test(vector_util_test ${VectorUtilTestSources})
diff --git a/test/unit/vector_util_test/detail/vector_util.hpp b/test/unit/vector_util_test/detail/vector_util.hpp
new file mode 100644
index 00000000..6cea730c
--- /dev/null
+++ b/test/unit/vector_util_test/detail/vector_util.hpp
@@ -0,0 +1,106 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_DETAIL_VECTOR_UTIL_TEST_HPP
+#define ROCWMMA_DETAIL_VECTOR_UTIL_TEST_HPP
+
+#include "device/vector_util.hpp"
+#include "unit_kernel_base.hpp"
+
+namespace rocwmma
+{
+
+    /// Host-side harness for the vector-util device test kernel.
+    /// BlockM, BlockN and the data layout passed to the base are redundant for this test.
+    template <uint32_t VecSize, typename DataT>
+    struct VectorUtilKernel final : public UnitKernelBase<1, 1, DataT, col_major>
+    {
+    private:
+        using Base = UnitKernelBase<1, 1, DataT, col_major>;
+
+    public:
+        VectorUtilKernel()        = default;
+        ~VectorUtilKernel() final = default;
+
+        /// Resizes storage for the problem, then pre-loads the first output element with
+        /// ERROR_VALUE on host and device so a result the kernel never writes reads as failure.
+        void setupImpl(typename Base::DataStorage::ProblemSize const& probsize) final
+        {
+            auto& storage = Base::DataStorage::instance();
+            storage->resizeStorage(probsize);
+
+            storage->hostOut().get()[0] = static_cast<DataT>(ERROR_VALUE);
+            storage->copyData(storage->deviceOut(), storage->hostOut(), 1);
+        }
+
+        /// Copies the single result element back from the device and records a pass only if
+        /// it equals SUCCESS_VALUE.
+        void validateResultsImpl() final
+        {
+            auto& storage = Base::DataStorage::instance();
+
+            // Refresh the host copy of the kernel's one-element verdict
+            storage->copyData(storage->hostOut(), storage->deviceOut(), 1);
+
+            Base::mValidationResult = (storage->hostOut().get()[0] == DataT(SUCCESS_VALUE));
+        }
+
+        /// Returns the device entry point instantiated for this DataT and VecSize.
+        typename Base::KernelFunc kernelImpl() const final
+        {
+            return typename Base::KernelFunc(vectorUtilTest<DataT, VecSize>);
+        }
+    };
+
+    /// GeneratorImpl: maps a tuple of compile-time test parameters to a VectorUtilKernel.
+    struct VectorUtilGenerator
+    {
+        // Positions of each parameter within the test-parameter tuple
+        enum : uint32_t
+        {
+            VecSize = 0,
+            DataT   = 1,
+        };
+
+        using ResultT = std::shared_ptr<KernelI>;
+
+        /// Builds the kernel selected by the tuple's element types; the tuple's value
+        /// itself is never read.
+        template <typename... Ts>
+        static ResultT generate(std::tuple<Ts...> testParams)
+        {
+            using TestParamsT = std::tuple<Ts...>;
+            using VecSizeT    = std::tuple_element_t<VecSize, TestParamsT>;
+            using DataTypeT   = std::tuple_element_t<DataT, TestParamsT>;
+            using KernelT     = VectorUtilKernel<VecSizeT::value, DataTypeT>;
+
+            return std::make_shared<KernelT>();
+        }
+    };
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_DETAIL_VECTOR_UTIL_TEST_HPP
diff --git a/test/unit/vector_util_test/device/vector_util.hpp b/test/unit/vector_util_test/device/vector_util.hpp
new file mode 100644
index 00000000..1b082386
--- /dev/null
+++ b/test/unit/vector_util_test/device/vector_util.hpp
@@ -0,0 +1,296 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#ifndef ROCWMMA_DEVICE_VECTOR_UTIL_TEST_HPP
+#define ROCWMMA_DEVICE_VECTOR_UTIL_TEST_HPP
+
+#include <rocwmma/rocwmma.hpp>
+
+static constexpr uint32_t ERROR_VALUE   = 7u;
+static constexpr uint32_t SUCCESS_VALUE = 0u;
+
+namespace rocwmma
+{
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline DataT get(VecT<DataT, VecSize> const& v, uint32_t idx)
+    {
+        return v.data[idx]; // runtime-indexed read of element idx; no bounds check is performed
+    }
+
+    /// Builds a vector whose element at each generated index is that index converted to DataT.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline auto generateSeqVec()
+    {
+        // The generator hands each position over as a type exposing a static ::value
+        auto indexAsData = [](auto&& idx) {
+            return static_cast<DataT>(std::decay_t<decltype(idx)>::value);
+        };
+        return vector_generator<DataT, VecSize>()(indexAsData);
+    }
+
+    /// Checks vector_generator with an index-only functor: the sequential vector must hold
+    /// i at every position i. Returns true on any mismatch, false otherwise.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool vectorGeneratorTestBasic()
+    {
+        auto seq = generateSeqVec<DataT, VecSize>();
+
+        bool mismatch = false;
+        for(uint32_t pos = 0u; pos < VecSize; ++pos)
+        {
+            mismatch |= (get(seq, pos) != static_cast<DataT>(pos));
+        }
+        return mismatch;
+    }
+
+    /// Checks vector_generator with forwarded arguments: summing the vectors built from 1 and
+    /// from 2 element by element must give 1 + 2 at every position. Returns true on failure.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool vectorGeneratorTestWithArgs()
+    {
+        auto addAt = [](auto&& idx, auto&& lhs, auto&& rhs) {
+            constexpr auto Pos = std::decay_t<decltype(idx)>::value;
+            return get<Pos>(lhs) + get<Pos>(rhs);
+        };
+
+        auto ones = VecT<DataT, VecSize>{static_cast<DataT>(1.0f)};
+        auto twos = VecT<DataT, VecSize>{static_cast<DataT>(2.0f)};
+        auto sums = vector_generator<DataT, VecSize>()(addAt, ones, twos);
+
+        bool failed = false;
+        for(uint32_t pos = 0u; pos < VecSize; ++pos)
+        {
+            // Expected value is formed with the same DataT arithmetic as the functor
+            failed |= (get(sums, pos) != (static_cast<DataT>(1.0f) + static_cast<DataT>(2.0f)));
+        }
+        return failed;
+    }
+
+    /// Checks extractEven() on the sequential vector: result element k must be 2 * k.
+    /// For VecSize == 1 the input must come back unchanged. Returns true on failure.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool extractEvenTest()
+    {
+        auto src    = generateSeqVec<DataT, VecSize>();
+        auto evens  = extractEven(src);
+        bool failed = false;
+
+        if constexpr(VecSize <= 1)
+        {
+            // Nothing to split: expect the input back
+            failed |= (src != evens);
+        }
+        else
+        {
+            // Only the first VecSize / 2 result elements are checked: 0, 2, 4, ...
+            for(uint32_t k = 0u; k < VecSize / 2; ++k)
+            {
+                failed |= (get(evens, k) != static_cast<DataT>(k * 2));
+            }
+        }
+        return failed;
+    }
+
+    /// Checks extractOdd() on the sequential vector: result element k must be 2 * k + 1.
+    /// For VecSize == 1 the input must come back unchanged. Returns true on failure.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool extractOddTest()
+    {
+        auto src    = generateSeqVec<DataT, VecSize>();
+        auto odds   = extractOdd(src);
+        bool failed = false;
+
+        if constexpr(VecSize <= 1)
+        {
+            // Nothing to split: expect the input back
+            failed |= (src != odds);
+        }
+        else
+        {
+            // Only the first VecSize / 2 result elements are checked: 1, 3, 5, ...
+            for(uint32_t k = 0u; k < VecSize / 2; ++k)
+            {
+                failed |= (get(odds, k) != static_cast<DataT>(k * 2 + 1));
+            }
+        }
+        return failed;
+    }
+
+    /// Checks reorderEvenOdd() on the sequential vector: the first half of the result must hold
+    /// 0, 2, 4, ... and the second half 1, 3, 5, .... Returns true on failure, false on success.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool reorderEvenOddTest()
+    {
+        bool err = false;
+
+        auto v   = generateSeqVec<DataT, VecSize>();
+        auto res = reorderEvenOdd(v);
+
+        // General case: compare every position against its expected source value
+        if constexpr(VecSize > 1)
+        {
+            for(uint32_t i = 0; i < VecSize; i++)
+            {
+                if(i < VecSize / 2)
+                {
+                    err |= (get(res, i) != static_cast<DataT>(i * 2));
+                }
+                else
+                {
+                    err |= (get(res, i) != static_cast<DataT>((i - VecSize / 2) * 2 + 1));
+                }
+            }
+        }
+        // VecSize == 1: nothing to reorder, expect the input back
+        else
+        {
+            err |= (v != res);
+        }
+
+        return err;
+    }
+
+    /// Checks reorderOddEven() on the sequential vector: the first half of the result must hold
+    /// 1, 3, 5, ... and the second half 0, 2, 4, .... Returns true on failure, false on success.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool reorderOddEvenTest()
+    {
+        bool err = false;
+
+        auto v   = generateSeqVec<DataT, VecSize>();
+        auto res = reorderOddEven(v);
+
+        // General case: compare every position against its expected source value
+        if constexpr(VecSize > 1)
+        {
+            for(uint32_t i = 0; i < VecSize; i++)
+            {
+                if(i < VecSize / 2)
+                {
+                    err |= (get(res, i) != static_cast<DataT>(i * 2 + 1));
+                }
+                else
+                {
+                    err |= (get(res, i) != static_cast<DataT>((i - VecSize / 2) * 2));
+                }
+            }
+        }
+        // VecSize == 1: nothing to reorder, expect the input back
+        else
+        {
+            err |= (v != res);
+        }
+
+        return err;
+    }
+
+    /// Checks zip(v0, v1) where v0 is the sequential vector and v1 is v0 + VecSize: even
+    /// positions must hold i (from v0), odd positions i + VecSize (from v1). True on failure.
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_DEVICE static inline bool zipTest()
+    {
+        bool err = false;
+
+        auto v0  = generateSeqVec<DataT, VecSize>();
+        auto v1  = generateSeqVec<DataT, VecSize>() + static_cast<DataT>(VecSize);
+        auto res = zip(v0, v1);
+
+        for(uint32_t i = 0; i < VecSize; i++)
+        {
+            if(i % 2u == 0)
+            {
+                err |= (get(res, i) != static_cast<DataT>(i));
+            }
+            else
+            {
+                err |= (get(res, i) != static_cast<DataT>(i + VecSize));
+            }
+        }
+
+        // if (err) // debug dump, intentionally disabled
+        // {
+        //     if constexpr (std::is_integral_v<DataT>)
+        //     {
+        //         if(threadIdx.x == 0)
+        //         {
+        //             printf("dataSize: %d\n", (int)sizeof(DataT));
+        //             printf("v0\n");
+        //             for(uint32_t i = 0; i < VecSize; i++)
+        //             {
+        //                 printf("i[%d] = %d\n", i, get(v0, i));
+        //             }
+        //             printf("RES\n");
+        //             for(uint32_t i = 0; i < VecSize; i++)
+        //             {
+        //                 printf("i[%d] = %d\n", i, get(res, i));
+        //             }
+        //         }
+        //     }
+        // }
+
+        return err;
+    }
+
+    /// Device entry point: every thread runs the checks and adds its failure flag to a
+    /// __shared__ counter; thread (0,0,0) of block (0,0,0) then stores the verdict in out[0].
+    template <typename DataT, uint32_t VecSize>
+    ROCWMMA_KERNEL void vectorUtilTest(uint32_t     m,
+                                       uint32_t     n,
+                                       DataT const* in,
+                                       DataT*       out,
+                                       uint32_t     ld,
+                                       DataT        param1,
+                                       DataT        param2)
+    {
+        __shared__ int32_t failCount;
+        failCount = 0;
+        synchronize_workgroup();
+
+        // || short-circuits, so once a check fails the remaining checks are skipped
+        bool failed = false;
+        failed      = failed || vectorGeneratorTestBasic<DataT, VecSize>();
+        failed      = failed || vectorGeneratorTestWithArgs<DataT, VecSize>();
+        failed      = failed || extractEvenTest<DataT, VecSize>();
+        failed      = failed || extractOddTest<DataT, VecSize>();
+        failed      = failed || reorderEvenOddTest<DataT, VecSize>();
+        failed      = failed || reorderOddEvenTest<DataT, VecSize>();
+        failed      = failed || zipTest<DataT, VecSize>();
+
+        atomicAdd(&failCount, static_cast<int32_t>(failed));
+        // Every contribution must land before the counter is read
+        synchronize_workgroup();
+
+        bool const isFirstThread = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
+        bool const isFirstBlock  = (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0);
+        if(isFirstThread && isFirstBlock)
+        {
+            out[0] = static_cast<DataT>(failCount == 0 ? SUCCESS_VALUE : ERROR_VALUE);
+        }
+    }
+
+} // namespace rocwmma
+
+#endif // ROCWMMA_DEVICE_VECTOR_UTIL_TEST_HPP
diff --git a/test/unit/vector_util_test/test/vector_util.cpp b/test/unit/vector_util_test/test/vector_util.cpp
new file mode 100644
index 00000000..abe98202
--- /dev/null
+++ b/test/unit/vector_util_test/test/vector_util.cpp
@@ -0,0 +1,99 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (C) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include <tuple>
+#include <type_traits>
+
+#include "detail/vector_util.hpp"
+#include "kernel_generator.hpp"
+#include "unit_test.hpp"
+
+namespace rocwmma
+{
+
+    struct TestParams : UnitTestParams
+    {
+        using Base = UnitTestParams;
+
+        // Element types under test (Base::TestTypes16)
+        using Types = typename Base::TestTypes16;
+
+        // Vector widths under test: powers of two from 1 through 64.
+        // Anything wider is considered impractical to test.
+        using VecSizes = std::tuple<I<1>, I<2>, I<4>, I<8>, I<16>, I<32>, I<64>>;
+
+        using KernelParams = typename CombineLists<VecSizes, Types>::Result;
+
+        // Kernel generator assembly
+        // Kernel: VectorUtil
+        using GeneratorImpl   = VectorUtilGenerator;
+        using KernelGenerator = KernelGenerator<KernelParams, GeneratorImpl>;
+
+        // Generated kernels must have the type the base test interface expects
+        static_assert(std::is_same<typename GeneratorImpl::ResultT, typename Base::KernelT>::value,
+                      "Kernels from this generator do not match testing interface");
+
+        static inline std::vector<ThreadBlockT> threadBlocks()
+        {
+            auto const lanes = HipDevice::instance()->warpSize();
+            // clang-format off
+            return { {lanes, 1} };
+            // clang-format on
+        }
+
+        static inline std::vector<ProblemSizeT> problemSizes()
+        {
+            // clang-format off
+            return { {1, 1} };
+            // clang-format on
+        }
+
+        static inline typename KernelGenerator::ResultT kernels()
+        {
+            return KernelGenerator::generate();
+        }
+    };
+
+} // namespace rocwmma
+
+// Fixture for the vector_util tests; adds no members of its own to rocwmma::UnitTest
+class VectorUtilTest : public rocwmma::UnitTest
+{
+};
+
+TEST_P(VectorUtilTest, RunKernel)
+{
+    this->RunKernel(); // not declared in VectorUtilTest; resolves to a base-class member
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    KernelTests, // instantiation name prefix
+    VectorUtilTest, // parameterized fixture being instantiated
+    ::testing::Combine(::testing::ValuesIn(rocwmma::TestParams::kernels()),
+                       ::testing::ValuesIn(rocwmma::TestParams::threadBlocks()),
+                       ::testing::ValuesIn(rocwmma::TestParams::problemSizes()),
+                       ::testing::ValuesIn(rocwmma::TestParams::param1s()), // param1s/param2s: presumably inherited from UnitTestParams - confirm
+                       ::testing::ValuesIn(rocwmma::TestParams::param2s())));