diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp
index 3b0ce7988f576..cf00f3d01f381 100644
--- a/sycl/include/sycl/detail/generic_type_traits.hpp
+++ b/sycl/include/sycl/detail/generic_type_traits.hpp
@@ -252,6 +252,16 @@ inline constexpr bool is_genfloatptr_marray_v =
     (IsDecorated == access::decorated::yes ||
      IsDecorated == access::decorated::no);
 
+template <typename T>
+using is_byte_t = typename
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+    std::is_same<T, std::byte>;
+#else
+    std::false_type;
+#endif
+
+template <typename T> inline constexpr bool is_byte_v = is_byte_t<T>::value;
+
 template <typename T>
 using make_floating_point_t = make_type_t<T, gtl::scalar_floating_list>;
 
@@ -332,6 +342,8 @@ template <typename T> auto convertToOpenCLType(T &&x) {
                                                    std::declval<ElemTy>()))>,
                             no_ref::size()>;
 #ifdef __SYCL_DEVICE_ONLY__
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
     // TODO: for some mysterious reasons on NonUniformGroups E2E tests fail if
     // we use the "else" version only. I suspect that's an issues with
     // non-uniform groups implementation.
@@ -340,6 +352,10 @@ template <typename T> auto convertToOpenCLType(T &&x) {
     else
       return static_cast<typename MatchingVec::vector_t>(
           x.template as<MatchingVec>());
+#else  // __INTEL_PREVIEW_BREAKING_CHANGES
+    return sycl::bit_cast<typename MatchingVec::vector_t>(x);
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
+
 #else
     return x.template as<MatchingVec>();
 #endif
diff --git a/sycl/include/sycl/detail/vector_convert.hpp b/sycl/include/sycl/detail/vector_convert.hpp
index c018fce5bcfa3..6552daa560e9a 100644
--- a/sycl/include/sycl/detail/vector_convert.hpp
+++ b/sycl/include/sycl/detail/vector_convert.hpp
@@ -558,6 +558,15 @@ NativeToT convertImpl(NativeFromT Value) {
   }
 }
 
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+template <typename FromT, typename ToT, sycl::rounding_mode RoundingMode,
+          int VecSize, typename NativeFromT, typename NativeToT>
+auto ConvertImpl(std::byte val) {
+  return convertImpl<FromT, ToT, RoundingMode, VecSize, NativeFromT, NativeToT>(
+      (std::int8_t)val);
+}
+#endif
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 3a16dcd244b4c..6792262ec21a0 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -32,6 +32,7 @@ bfloat16 bitsToBfloat16(const Bfloat16StorageT Value);
 
 // sycl::vec support
 namespace bf16 {
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 #ifdef __SYCL_DEVICE_ONLY__
 using Vec2StorageT = Bfloat16StorageT __attribute__((ext_vector_type(2)));
 using Vec3StorageT = Bfloat16StorageT __attribute__((ext_vector_type(3)));
@@ -45,6 +46,7 @@ using Vec4StorageT = std::array<Bfloat16StorageT, 4>;
 using Vec8StorageT = std::array<Bfloat16StorageT, 8>;
 using Vec16StorageT = std::array<Bfloat16StorageT, 16>;
 #endif
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
 } // namespace bf16
 } // namespace detail
 
diff --git a/sycl/include/sycl/half_type.hpp b/sycl/include/sycl/half_type.hpp
index 951146f2cdfbb..799ff9fb186e9 100644
--- a/sycl/include/sycl/half_type.hpp
+++ b/sycl/include/sycl/half_type.hpp
@@ -249,11 +249,14 @@ using StorageT = _Float16;
 using BIsRepresentationT = _Float16;
 using VecElemT = _Float16;
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 using Vec2StorageT = VecElemT __attribute__((ext_vector_type(2)));
 using Vec3StorageT = VecElemT __attribute__((ext_vector_type(3)));
 using Vec4StorageT = VecElemT __attribute__((ext_vector_type(4)));
 using Vec8StorageT = VecElemT __attribute__((ext_vector_type(8)));
 using Vec16StorageT = VecElemT __attribute__((ext_vector_type(16)));
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
+
 #else // SYCL_DEVICE_ONLY
 using StorageT = detail::host_half_impl::half;
 // No need to extract underlying data type for built-in functions operating on
@@ -261,6 +264,7 @@ using StorageT = detail::host_half_impl::half;
 using BIsRepresentationT = half;
 using VecElemT = half;
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 // On the host side we cannot use OpenCL cl_half# types as an underlying type
 // for vec because they are actually defined as an integer type under the
 // hood. As a result half values will be converted to the integer and passed
@@ -270,6 +274,8 @@ using Vec3StorageT = std::array<VecElemT, 3>;
 using Vec4StorageT = std::array<VecElemT, 4>;
 using Vec8StorageT = std::array<VecElemT, 8>;
 using Vec16StorageT = std::array<VecElemT, 16>;
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
+
 #endif // SYCL_DEVICE_ONLY
 
 #ifndef __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/sycl/vector_preview.hpp b/sycl/include/sycl/vector_preview.hpp
index f1bf7fcfcc24d..1d38fc08b287d 100644
--- a/sycl/include/sycl/vector_preview.hpp
+++ b/sycl/include/sycl/vector_preview.hpp
@@ -26,10 +26,6 @@
 #error "SYCL device compiler is built without ext_vector_type support"
 #endif
 
-#if defined(__SYCL_DEVICE_ONLY__)
-#define __SYCL_USE_EXT_VECTOR_TYPE__
-#endif
-
 #include <sycl/access/access.hpp>              // for decorated, address_space
 #include <sycl/aliases.hpp>                    // for half, cl_char, cl_int
 #include <sycl/detail/common.hpp>              // for ArrayCreator, RepeatV...
@@ -45,8 +41,9 @@
 
 #include <sycl/ext/oneapi/bfloat16.hpp> // bfloat16
 
+#include <algorithm>   // for std::min
 #include <array>       // for array
-#include <assert.h>    // for assert
+#include <cassert>     // for assert
 #include <cstddef>     // for size_t, NULL, byte
 #include <cstdint>     // for uint8_t, int16_t, int...
 #include <functional>  // for divides, multiplies
@@ -86,81 +83,10 @@ struct elem {
 };
 
 namespace detail {
-// select_apply_cl_t selects from T8/T16/T32/T64 basing on
-// sizeof(_IN).  expected to handle scalar types in _IN.
-template <typename _IN, typename T8, typename T16, typename T32, typename T64>
-using select_apply_cl_t = std::conditional_t<
-    sizeof(_IN) == 1, T8,
-    std::conditional_t<sizeof(_IN) == 2, T16,
-                       std::conditional_t<sizeof(_IN) == 4, T32, T64>>>;
-
-template <typename T> struct vec_helper {
-  using RetType = T;
-  static constexpr RetType get(T value) { return value; }
-  static constexpr RetType set(T value) { return value; }
-};
-template <> struct vec_helper<bool> {
-  using RetType = select_apply_cl_t<bool, std::int8_t, std::int16_t,
-                                    std::int32_t, std::int64_t>;
-  static constexpr RetType get(bool value) { return value; }
-  static constexpr RetType set(bool value) { return value; }
-};
-
-template <> struct vec_helper<sycl::ext::oneapi::bfloat16> {
-  using RetType = sycl::ext::oneapi::bfloat16;
-  using BFloat16StorageT = sycl::ext::oneapi::detail::Bfloat16StorageT;
-  static constexpr RetType get(BFloat16StorageT value) {
-#if defined(__SYCL_BITCAST_IS_CONSTEXPR)
-    return sycl::bit_cast<RetType>(value);
-#else
-    // awkward workaround. sycl::bit_cast isn't constexpr in older GCC
-    // C++20 will give us both std::bit_cast and constexpr reinterpet for void*
-    // but neither available yet.
-    union {
-      sycl::ext::oneapi::bfloat16 bf16;
-      sycl::ext::oneapi::detail::Bfloat16StorageT storage;
-    } result = {};
-    result.storage = value;
-    return result.bf16;
-#endif
-  }
-
-  static constexpr RetType get(RetType value) { return value; }
-
-  static constexpr BFloat16StorageT set(RetType value) {
-#if defined(__SYCL_BITCAST_IS_CONSTEXPR)
-    return sycl::bit_cast<BFloat16StorageT>(value);
-#else
-    union {
-      sycl::ext::oneapi::bfloat16 bf16;
-      sycl::ext::oneapi::detail::Bfloat16StorageT storage;
-    } result = {};
-    result.bf16 = value;
-    return result.storage;
-#endif
-  }
-};
-
-#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-template <> struct vec_helper<std::byte> {
-  using RetType = std::uint8_t;
-  static constexpr RetType get(std::byte value) { return (RetType)value; }
-  static constexpr RetType set(std::byte value) { return (RetType)value; }
-  static constexpr std::byte get(std::uint8_t value) {
-    return (std::byte)value;
-  }
-  static constexpr std::byte set(std::uint8_t value) {
-    return (std::byte)value;
-  }
-};
-#endif
-
 template <typename VecT, typename OperationLeftT, typename OperationRightT,
           template <typename> class OperationCurrentT, int... Indexes>
 class SwizzleOp;
 
-template <typename T, int N, typename V = void> struct VecStorage;
-
 // Element type for relational operator return value.
 template <typename DataT>
 using rel_t = typename std::conditional_t<
@@ -178,8 +104,18 @@ using rel_t = typename std::conditional_t<
 template <typename T> class GetOp {
 public:
   using DataT = T;
-  DataT getValue(size_t) const { return (DataT)0; }
-  DataT operator()(DataT, DataT) { return (DataT)0; }
+  DataT getValue(size_t) const {
+    if constexpr (std::is_same_v<DataT, sycl::detail::host_half_impl::half>)
+      return DataT{0.0f};
+    else
+      return (DataT)0;
+  }
+  DataT operator()(DataT, DataT) {
+    if constexpr (std::is_same_v<DataT, sycl::detail::host_half_impl::half>)
+      return DataT{0.0f};
+    else
+      return (DataT)0;
+  }
 };
 
 // Forward declarations
@@ -188,212 +124,53 @@ class RoundedRangeKernel;
 template <typename TransformedArgType, int Dims, typename KernelType>
 class RoundedRangeKernelWithKH;
 
-// Vectors of size 1 are handled separately and therefore 1 is not included in
-// the check below.
-constexpr bool isValidVectorSize(int N) {
-  return N == 2 || N == 3 || N == 4 || N == 8 || N == 16;
-}
-template <typename T, int N, typename V> struct VecStorage {
-  static_assert(
-      isValidVectorSize(N) || N == 1,
-      "Incorrect number of elements for sycl::vec: only 1, 2, 3, 4, 8 "
-      "or 16 are supported");
-  static_assert(!std::is_same_v<V, void>, "Incorrect data type for sycl::vec");
-};
-
-#ifdef __SYCL_DEVICE_ONLY__
-// device always has ext vector support, but for huge vectors
-// we switch to std::array, so that we can use a smaller alignment (64)
-// this is to support MSVC, which has a max of 64 for direct params.
-template <typename T, int N> struct VecStorageImpl {
-  static constexpr size_t Num = (N == 3) ? 4 : N;
-  static constexpr size_t Sz = Num * sizeof(T);
-  using DataType =
-      typename std::conditional<Sz <= 64, T __attribute__((ext_vector_type(N))),
-                                std::array<T, Num>>::type;
-  using VectorDataType = T __attribute__((ext_vector_type(N)));
-};
-#else  // __SYCL_DEVICE_ONLY__
-template <typename T, int N> struct VecStorageImpl {
-  using DataType = std::array<T, (N == 3) ? 4 : N>;
-};
-#endif // __SYCL_DEVICE_ONLY__
-
-// Single element bool
-template <> struct VecStorage<bool, 1, void> {
-  using DataType = bool;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = bool;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Multiple element bool
-template <int N>
-struct VecStorage<bool, N, typename std::enable_if_t<isValidVectorSize(N)>> {
-  using DataType =
-      typename VecStorageImpl<select_apply_cl_t<bool, std::int8_t, std::int16_t,
-                                                std::int32_t, std::int64_t>,
-                              N>::DataType;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType =
-      typename VecStorageImpl<select_apply_cl_t<bool, std::int8_t, std::int16_t,
-                                                std::int32_t, std::int64_t>,
-                              N>::VectorDataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-// Single element byte. Multiple elements will propagate through a later
-// specialization.
-template <> struct VecStorage<std::byte, 1, void> {
-  using DataType = std::int8_t;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = std::int8_t;
-#endif // __SYCL_DEVICE_ONLY__
-};
-#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-
-// Single element signed integers
-template <typename T>
-struct VecStorage<T, 1, typename std::enable_if_t<is_sigeninteger_v<T>>> {
-  using DataType = T;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = DataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Single element unsigned integers
-template <typename T>
-struct VecStorage<T, 1, typename std::enable_if_t<is_sugeninteger_v<T>>> {
-  using DataType = T;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = DataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Single element floating-point (except half/bfloat16)
+// OpenCL data type to convert to.
 template <typename T>
-struct VecStorage<
-    T, 1,
-    typename std::enable_if_t<!is_half_or_bf16_v<T> && is_sgenfloat_v<T>>> {
-  using DataType = T;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = DataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-// Multiple elements signed/unsigned integers and floating-point (except
-// half/bfloat16)
-template <typename T, int N>
-struct VecStorage<
-    T, N,
-    typename std::enable_if_t<isValidVectorSize(N) &&
-                              (is_sgeninteger_v<T> ||
-                               (is_sgenfloat_v<T> && !is_half_or_bf16_v<T>))>> {
-  using DataType =
-      typename VecStorageImpl<typename VecStorage<T, 1>::DataType, N>::DataType;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType =
-      typename VecStorageImpl<typename VecStorage<T, 1>::DataType,
-                              N>::VectorDataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Single element half
-template <> struct VecStorage<half, 1, void> {
-  using DataType = sycl::detail::half_impl::StorageT;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = sycl::detail::half_impl::StorageT;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Multiple elements half
-#if defined(__SYCL_DEVICE_ONLY__)
-#define __SYCL_DEFINE_HALF_VECSTORAGE(Num)                                     \
-  template <> struct VecStorage<half, Num, void> {                             \
-    using DataType = sycl::detail::half_impl::Vec##Num##StorageT;              \
-    using VectorDataType = sycl::detail::half_impl::Vec##Num##StorageT;        \
-  };
-#else // defined(__SYCL_DEVICE_ONLY__)
-#define __SYCL_DEFINE_HALF_VECSTORAGE(Num)                                     \
-  template <> struct VecStorage<half, Num, void> {                             \
-    using DataType = sycl::detail::half_impl::Vec##Num##StorageT;              \
-  };
-#endif // defined(__SYCL_DEVICE_ONLY__)
-
-__SYCL_DEFINE_HALF_VECSTORAGE(2)
-__SYCL_DEFINE_HALF_VECSTORAGE(3)
-__SYCL_DEFINE_HALF_VECSTORAGE(4)
-__SYCL_DEFINE_HALF_VECSTORAGE(8)
-__SYCL_DEFINE_HALF_VECSTORAGE(16)
-#undef __SYCL_DEFINE_HALF_VECSTORAGE
-
-// Single element bfloat16
-template <> struct VecStorage<sycl::ext::oneapi::bfloat16, 1, void> {
-  using DataType = sycl::ext::oneapi::detail::Bfloat16StorageT;
-  // using VectorDataType = sycl::ext::oneapi::bfloat16;
-  using VectorDataType = sycl::ext::oneapi::detail::Bfloat16StorageT;
-};
-// Multiple elements bfloat16
-#define __SYCL_DEFINE_BF16_VECSTORAGE(Num)                                     \
-  template <> struct VecStorage<sycl::ext::oneapi::bfloat16, Num, void> {      \
-    using DataType = sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT;      \
-    using VectorDataType =                                                     \
-        sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT;                   \
-  };
-__SYCL_DEFINE_BF16_VECSTORAGE(2)
-__SYCL_DEFINE_BF16_VECSTORAGE(3)
-__SYCL_DEFINE_BF16_VECSTORAGE(4)
-__SYCL_DEFINE_BF16_VECSTORAGE(8)
-__SYCL_DEFINE_BF16_VECSTORAGE(16)
-#undef __SYCL_DEFINE_BF16_VECSTORAGE
+// clang-format off
+using element_type_for_vector_t = typename map_type<
+    T,
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+    std::byte, /*->*/ std::uint8_t,
+#endif
+    bool, /*->*/ std::int8_t,
+    sycl::half, /*->*/ sycl::detail::half_impl::StorageT,
+    sycl::ext::oneapi::bfloat16, /*->*/ sycl::ext::oneapi::detail::Bfloat16StorageT,
+    T, /*->*/ T>::type;
+// clang-format on
 } // namespace detail
 
-template <typename T> using vec_data = detail::vec_helper<T>;
+///////////////////////// class sycl::vec /////////////////////////
+// Provides a cross-platform vector class template that works efficiently on
+// SYCL devices as well as in host C++ code.
+template <typename DataT, int NumElements> class vec {
 
-template <typename T>
-using vec_data_t = typename detail::vec_helper<T>::RetType;
+  static_assert(NumElements == 1 || NumElements == 2 || NumElements == 3 ||
+                    NumElements == 4 || NumElements == 8 || NumElements == 16,
+                "Invalid number of elements for sycl::vec: only 1, 2, 3, 4, 8 "
+                "or 16 are supported");
+  static_assert(sizeof(bool) == sizeof(int8_t), "bool size is not 1 byte");
 
-///////////////////////// class sycl::vec /////////////////////////
-/// Provides a cross-patform vector class template that works efficiently on
-/// SYCL devices as well as in host C++ code.
-///
-/// \ingroup sycl_api
-template <typename Type, int NumElements> class vec {
-  using DataT = Type;
+  static constexpr size_t AdjustedNum = (NumElements == 3) ? 4 : NumElements;
 
   // This represent type of underlying value. There should be only one field
   // in the class, so vec<float, 16> should be equal to float16 in memory.
-  using DataType = typename detail::VecStorage<DataT, NumElements>::DataType;
+  using DataType = std::array<DataT, AdjustedNum>;
 
-  static constexpr bool IsHostHalf =
-      std::is_same_v<DataT, sycl::detail::half_impl::half> &&
-      std::is_same_v<sycl::detail::half_impl::StorageT,
-                     sycl::detail::host_half_impl::half>;
+public:
+#ifdef __SYCL_DEVICE_ONLY__
+  // Type used for passing sycl::vec to SPIRV builtins.
+  // We cannot use ext_vector_type(1) as it's not supported by SPIRV
+  // plugins (CTS fails).
+  using vector_t = typename std::conditional_t<
+      NumElements == 1, detail::element_type_for_vector_t<DataT>,
+      detail::element_type_for_vector_t<DataT> __attribute__((
+          ext_vector_type(NumElements)))>;
+#endif // __SYCL_DEVICE_ONLY__
 
+private:
   static constexpr bool IsBfloat16 =
       std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>;
 
-  static constexpr size_t AdjustedNum = (NumElements == 3) ? 4 : NumElements;
-  static constexpr size_t Sz = sizeof(DataT) * AdjustedNum;
-  static constexpr bool IsSizeGreaterThanMaxAlign =
-      (Sz > detail::MaxVecAlignment);
-
-  // TODO: There is no support for vector half type on host yet.
-  // Also, when Sz is greater than alignment, we use std::array instead of
-  // vector extension. This is for MSVC compatibility, which has a max alignment
-  // of 64 for direct params. If we drop MSVC, we can have alignment the same as
-  // size and use vector extensions for all sizes.
-  static constexpr bool IsUsingArrayOnDevice =
-      (IsHostHalf || IsBfloat16 || IsSizeGreaterThanMaxAlign);
-
-#if defined(__SYCL_DEVICE_ONLY__)
-  static constexpr bool NativeVec = NumElements > 1 && !IsUsingArrayOnDevice;
-  static constexpr bool IsUsingArrayOnHost = false; // not compiling for host.
-#else
-  static constexpr bool NativeVec = false;
-  static constexpr bool IsUsingArrayOnHost = true; // host always std::array.
-#endif
-
   static constexpr int getNumElements() { return NumElements; }
 
   // SizeChecker is needed for vec(const argTN &... args) ctor to validate args.
@@ -411,7 +188,7 @@ template <typename Type, int NumElements> class vec {
   template <typename DataT_, typename T, std::size_t... Is>
   static constexpr std::array<DataT_, sizeof...(Is)>
   VecToArray(const vec<T, sizeof...(Is)> &V, std::index_sequence<Is...>) {
-    return {static_cast<DataT_>(V.getValue(Is))...};
+    return {static_cast<DataT_>(V[Is])...};
   }
   template <typename DataT_, typename T, int N, typename T2, typename T3,
             template <typename> class T4, int... T5, std::size_t... Is>
@@ -446,7 +223,9 @@ template <typename Type, int NumElements> class vec {
   }
   template <typename DataT_, typename T>
   static constexpr auto FlattenVecArgHelper(const T &A) {
-    return std::array<DataT_, 1>{vec_data<DataT_>::get(static_cast<DataT_>(A))};
+    // static_cast required to avoid narrowing conversion warning
+    // when T = unsigned long int and DataT_ = int.
+    return std::array<DataT_, 1>{static_cast<DataT_>(A)};
   }
   template <typename DataT_, typename T> struct FlattenVecArg {
     constexpr auto operator()(const T &A) const {
@@ -541,205 +320,83 @@ template <typename Type, int NumElements> class vec {
   using EnableIfSuitableNumElements =
       typename std::enable_if_t<SizeChecker<0, NumElements, argTN...>::value>;
 
-  template <size_t... Is>
-  constexpr vec(const std::array<vec_data_t<DataT>, NumElements> &Arr,
-                std::index_sequence<Is...>)
-      : m_Data{([&](vec_data_t<DataT> v) constexpr {
-          if constexpr (std::is_same_v<sycl::ext::oneapi::bfloat16, DataT>)
-            return v.value;
-          else
-            return vec_data_t<DataT>(static_cast<DataT>(v));
-        })(Arr[Is])...} {}
-
 public:
+  // Aliases required by the SYCL specification to keep sycl::vec consistent
+  // with marray and buffer.
   using element_type = DataT;
   using value_type = DataT;
   using rel_t = detail::rel_t<DataT>;
-#ifdef __SYCL_DEVICE_ONLY__
-  using vector_t =
-      typename detail::VecStorage<DataT, NumElements>::VectorDataType;
-#endif // __SYCL_DEVICE_ONLY__
 
+  /****************** Constructors **************/
   vec() = default;
-
   constexpr vec(const vec &Rhs) = default;
   constexpr vec(vec &&Rhs) = default;
 
-  constexpr vec &operator=(const vec &Rhs) = default;
-
-  // W/o this, things like "vec<char,*> = vec<signed char, *>" doesn't work.
-  template <typename Ty = DataT>
-  typename std::enable_if_t<!std::is_same_v<Ty, rel_t> &&
-                                std::is_convertible_v<vec_data_t<Ty>, rel_t>,
-                            vec &>
-  operator=(const vec<rel_t, NumElements> &Rhs) {
-    *this = Rhs.template as<vec>();
-    return *this;
-  }
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  template <typename T = void>
-  using EnableIfNotHostHalf = typename std::enable_if_t<!IsHostHalf, T>;
-
-  template <typename T = void>
-  using EnableIfHostHalf = typename std::enable_if_t<IsHostHalf, T>;
-
-  template <typename T = void>
-  using EnableIfUsingArrayOnDevice =
-      typename std::enable_if_t<IsUsingArrayOnDevice, T>;
-
-  template <typename T = void>
-  using EnableIfNotUsingArrayOnDevice =
-      typename std::enable_if_t<!IsUsingArrayOnDevice, T>;
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  template <typename T = void>
-  using EnableIfUsingArray =
-      typename std::enable_if_t<IsUsingArrayOnDevice || IsUsingArrayOnHost, T>;
-
-  template <typename T = void>
-  using EnableIfNotUsingArray =
-      typename std::enable_if_t<!IsUsingArrayOnDevice && !IsUsingArrayOnHost,
-                                T>;
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-
-  template <typename Ty = DataT>
-  explicit constexpr vec(const EnableIfNotUsingArrayOnDevice<Ty> &arg)
-      : m_Data{DataType(vec_data<Ty>::get(arg))} {}
+private:
+  // Implementation detail for the next public ctor.
+  template <size_t... Is>
+  constexpr vec(const std::array<DataT, NumElements> &Arr,
+                std::index_sequence<Is...>)
+      : m_Data{Arr[Is]...} {}
 
-  template <typename Ty = DataT>
-  typename std::enable_if_t<
-      std::is_fundamental_v<vec_data_t<Ty>> ||
-          detail::is_half_or_bf16_v<typename std::remove_const_t<Ty>>,
-      vec &>
-  operator=(const EnableIfNotUsingArrayOnDevice<Ty> &Rhs) {
-    m_Data = (DataType)vec_data<Ty>::get(Rhs);
-    return *this;
-  }
+public:
+  explicit constexpr vec(const DataT &arg)
+      : vec{detail::RepeatValue<NumElements>(arg),
+            std::make_index_sequence<NumElements>()} {}
 
-  template <typename Ty = DataT>
-  explicit constexpr vec(const EnableIfUsingArrayOnDevice<Ty> &arg)
-      : vec{detail::RepeatValue<NumElements>(
-                static_cast<vec_data_t<DataT>>(arg)),
+  // Constructor from values of base type or vec of base type. Checks that
+  // base types match and that NumElements == sum of lengths of args.
+  template <typename... argTN, typename = EnableIfSuitableTypes<argTN...>,
+            typename = EnableIfSuitableNumElements<argTN...>>
+  constexpr vec(const argTN &...args)
+      : vec{VecArgArrayCreator<DataT, argTN...>::Create(args...),
             std::make_index_sequence<NumElements>()} {}
 
+  /****************** Assignment Operators **************/
+  constexpr vec &operator=(const vec &Rhs) = default;
+
+  // Template required to prevent ambiguous overload with the copy assignment
+  // when NumElements == 1. The template prevents implicit conversion from
+  // vec<_, 1> to DataT.
   template <typename Ty = DataT>
   typename std::enable_if_t<
-      std::is_fundamental_v<vec_data_t<Ty>> ||
+      std::is_fundamental_v<Ty> ||
           detail::is_half_or_bf16_v<typename std::remove_const_t<Ty>>,
       vec &>
-  operator=(const EnableIfUsingArrayOnDevice<Ty> &Rhs) {
-    for (int i = 0; i < NumElements; ++i) {
-      setValue(i, Rhs);
-    }
+  operator=(const DataT &Rhs) {
+    *this = vec{Rhs};
     return *this;
   }
-#else  // __SYCL_USE_EXT_VECTOR_TYPE__
-  explicit constexpr vec(const DataT &arg)
-      : vec{detail::RepeatValue<NumElements>(
-                static_cast<vec_data_t<DataT>>(arg)),
-            std::make_index_sequence<NumElements>()} {}
 
+  // W/o this, things like "vec<char,*> = vec<signed char, *>" don't work.
   template <typename Ty = DataT>
   typename std::enable_if_t<
-      std::is_fundamental_v<vec_data_t<Ty>> ||
-          detail::is_half_or_bf16_v<typename std::remove_const_t<Ty>>,
-      vec &>
-  operator=(const DataT &Rhs) {
-    for (int i = 0; i < NumElements; ++i) {
-      setValue(i, Rhs);
-    }
+      !std::is_same_v<Ty, rel_t> && std::is_convertible_v<Ty, rel_t>, vec &>
+  operator=(const vec<rel_t, NumElements> &Rhs) {
+    *this = Rhs.template as<vec>();
     return *this;
   }
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  // Optimized naive constructors with NumElements of DataT values.
-  // We don't expect compilers to optimize vararg recursive functions well.
-
-  // Helper type to make specific constructors available only for specific
-  // number of elements.
-  template <int IdxNum, typename T = void>
-  using EnableIfMultipleElems = typename std::enable_if_t<
-      std::is_convertible_v<T, DataT> && NumElements == IdxNum, DataT>;
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<2, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<3, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<4, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2,
-                const Ty Arg3)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2), vec_data<Ty>::get(Arg3)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<8, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2,
-                const DataT Arg3, const DataT Arg4, const DataT Arg5,
-                const DataT Arg6, const DataT Arg7)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2), vec_data<Ty>::get(Arg3),
-               vec_data<Ty>::get(Arg4), vec_data<Ty>::get(Arg5),
-               vec_data<Ty>::get(Arg6), vec_data<Ty>::get(Arg7)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<16, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2,
-                const DataT Arg3, const DataT Arg4, const DataT Arg5,
-                const DataT Arg6, const DataT Arg7, const DataT Arg8,
-                const DataT Arg9, const DataT ArgA, const DataT ArgB,
-                const DataT ArgC, const DataT ArgD, const DataT ArgE,
-                const DataT ArgF)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2), vec_data<Ty>::get(Arg3),
-               vec_data<Ty>::get(Arg4), vec_data<Ty>::get(Arg5),
-               vec_data<Ty>::get(Arg6), vec_data<Ty>::get(Arg7),
-               vec_data<Ty>::get(Arg8), vec_data<Ty>::get(Arg9),
-               vec_data<Ty>::get(ArgA), vec_data<Ty>::get(ArgB),
-               vec_data<Ty>::get(ArgC), vec_data<Ty>::get(ArgD),
-               vec_data<Ty>::get(ArgE), vec_data<Ty>::get(ArgF)} {}
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  // Constructor from values of base type or vec of base type. Checks that
-  // base types are match and that the NumElements == sum of lengths of args.
-  template <typename... argTN, typename = EnableIfSuitableTypes<argTN...>,
-            typename = EnableIfSuitableNumElements<argTN...>>
-  constexpr vec(const argTN &...args)
-      : vec{VecArgArrayCreator<vec_data_t<DataT>, argTN...>::Create(args...),
-            std::make_index_sequence<NumElements>()} {}
 
 #ifdef __SYCL_DEVICE_ONLY__
-  template <typename vector_t_ = vector_t,
-            typename =
-                typename std::enable_if_t<std::is_same_v<vector_t_, vector_t> &&
-                                          !std::is_same_v<vector_t_, DataT>>>
-  constexpr vec(vector_t openclVector) {
-    if constexpr (!IsUsingArrayOnDevice) {
-      m_Data = openclVector;
-    } else {
-      m_Data = bit_cast<DataType>(openclVector);
-    }
-  }
-
-  operator vector_t() const {
-    if constexpr (!IsUsingArrayOnDevice) {
-      return m_Data;
-    } else {
-      auto ptr = bit_cast<const vector_t *>((&m_Data)->data());
-      return *ptr;
-    }
-  }
+  template <
+      typename vector_t_ = vector_t,
+      typename = typename std::enable_if_t<std::is_same_v<vector_t_, vector_t>>>
+  constexpr vec(vector_t_ openclVector) {
+    m_Data = sycl::bit_cast<DataType>(openclVector);
+  }
+
+  /* @SYCL2020
+   * Available only when: compiled for the device.
+   * Converts this SYCL vec instance to the underlying backend-native vector
+   * type defined by vector_t.
+   */
+  operator vector_t() const { return sycl::bit_cast<vector_t>(m_Data); }
 #endif // __SYCL_DEVICE_ONLY__
 
   // Available only when: NumElements == 1
   template <int N = NumElements>
   operator typename std::enable_if_t<N == 1, DataT>() const {
-    return vec_data<DataT>::get(m_Data);
+    return m_Data[0];
   }
 
   __SYCL2020_DEPRECATED("get_count() is deprecated, please use size() instead")
@@ -750,86 +407,90 @@ template <typename Type, int NumElements> class vec {
   static constexpr size_t get_size() { return byte_size(); }
   static constexpr size_t byte_size() noexcept { return sizeof(m_Data); }
 
+  // We interpret bool as int8_t, std::byte as uint8_t for conversion to other
+  // types.
+  // clang-format off
+  template <typename T>
+  using ConvertBoolAndByteT = typename detail::map_type<
+      T,
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+      std::byte, /*->*/ std::uint8_t,
+#endif
+      bool, /*->*/ std::int8_t,
+      T, /*->*/ T>::type;
+  // clang-format on
+
   // convertImpl can't be called with the same From and To types and therefore
   // we need this version of convert which is mostly no-op.
   template <typename convertT,
             rounding_mode roundingMode = rounding_mode::automatic>
-  std::enable_if_t<
-      std::is_same_v<vec_data_t<DataT>, vec_data_t<convertT>> ||
-          std::is_same_v<detail::ConvertToOpenCLType_t<vec_data_t<DataT>>,
-                         detail::ConvertToOpenCLType_t<vec_data_t<convertT>>>,
-      vec<convertT, NumElements>>
-  convert() const {
-    static_assert(std::is_integral_v<vec_data_t<convertT>> ||
-                      detail::is_floating_point<convertT>::value,
+  vec<convertT, NumElements> convert() const {
+
+    using T = ConvertBoolAndByteT<DataT>;
+    using R = ConvertBoolAndByteT<convertT>;
+    static_assert(std::is_integral_v<R> || detail::is_floating_point<R>::value,
                   "Unsupported convertT");
-    if constexpr (!std::is_same_v<DataT, convertT>) {
-      // Dummy conversion for cases like vec<signed char> -> vec<char>
-      vec<convertT, NumElements> Result;
+
+    using OpenCLT = detail::ConvertToOpenCLType_t<T>;
+    using OpenCLR = detail::ConvertToOpenCLType_t<R>;
+    vec<convertT, NumElements> Result;
+
+    // For conversion between bool -> signed char and byte -> uint8_t.
+    if constexpr (!std::is_same_v<DataT, convertT> &&
+                  (std::is_same_v<OpenCLT, OpenCLR> || std::is_same_v<T, R>)) {
       for (size_t I = 0; I < NumElements; ++I)
         Result.setValue(I, static_cast<convertT>(getValue(I)));
-
       return Result;
-    } else {
-      // No conversion necessary
+    } else if constexpr (std::is_same_v<DataT, convertT>) {
       return *this;
-    }
-  }
+    } else {
 
-  template <typename convertT,
-            rounding_mode roundingMode = rounding_mode::automatic>
-  std::enable_if_t<
-      !std::is_same_v<vec_data_t<DataT>, vec_data_t<convertT>> &&
-          !std::is_same_v<detail::ConvertToOpenCLType_t<vec_data_t<DataT>>,
-                          detail::ConvertToOpenCLType_t<vec_data_t<convertT>>>,
-      vec<convertT, NumElements>>
-  convert() const {
-    static_assert(std::is_integral_v<vec_data_t<convertT>> ||
-                      detail::is_floating_point<convertT>::value,
-                  "Unsupported convertT");
-    using T = vec_data_t<DataT>;
-    using R = vec_data_t<convertT>;
-    using OpenCLT = detail::ConvertToOpenCLType_t<T>;
-    using OpenCLR = detail::ConvertToOpenCLType_t<R>;
-    vec<convertT, NumElements> Result;
+#ifdef __SYCL_DEVICE_ONLY__
+      using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements)));
+      using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements)));
+
+      auto NativeVector = sycl::bit_cast<vector_t>(*this);
+      using ConvertTVecType = typename vec<convertT, NumElements>::vector_t;
 
-#if defined(__SYCL_DEVICE_ONLY__)
-    using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements)));
-    using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements)));
-    // Whole vector conversion can only be done, if:
-    constexpr bool canUseNativeVectorConvert =
+      // Whole vector conversion can only be done, if:
+      constexpr bool canUseNativeVectorConvert =
 #ifdef __NVPTX__
-        // - we are not on CUDA, see intel/llvm#11840
-        false &&
+          // - we are not on CUDA, see intel/llvm#11840
+          false &&
 #endif
-        // - both vectors are represented using native vector types;
-        NativeVec && vec<convertT, NumElements>::NativeVec &&
-        // - vec storage has an equivalent OpenCL native vector it is implicitly
-        //   convertible to. There are some corner cases where it is not the
-        //   case with char, long and long long types.
-        std::is_convertible_v<decltype(m_Data), OpenCLVecT> &&
-        std::is_convertible_v<decltype(Result.m_Data), OpenCLVecR> &&
-        // - it is not a signed to unsigned (or vice versa) conversion
-        //   see comments within 'convertImpl' for more details;
-        !detail::is_sint_to_from_uint<T, R>::value &&
-        // - destination type is not bool. bool is stored as integer under the
-        //   hood and therefore conversion to bool looks like conversion between
-        //   two integer types. Since bit pattern for true and false is not
-        //   defined, there is no guarantee that integer conversion yields
-        //   right results here;
-        !std::is_same_v<convertT, bool>;
-    if constexpr (canUseNativeVectorConvert) {
-      Result.m_Data = detail::convertImpl<T, R, roundingMode, NumElements,
-                                          OpenCLVecT, OpenCLVecR>(m_Data);
-    } else
-#endif // defined(__SYCL_DEVICE_ONLY__)
-    {
-      // Otherwise, we fallback to per-element conversion:
-      for (size_t I = 0; I < NumElements; ++I) {
-        Result.setValue(
-            I, vec_data<convertT>::get(
-                   detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
-                       vec_data<DataT>::get(getValue(I)))));
+          NumElements > 1 &&
+          // - vec storage has an equivalent OpenCL native vector it is
+          //   implicitly convertible to. There are some corner cases where it
+          //   is not the case with char, long and long long types.
+          std::is_convertible_v<vector_t, OpenCLVecT> &&
+          std::is_convertible_v<ConvertTVecType, OpenCLVecR> &&
+          // - it is not a signed to unsigned (or vice versa) conversion
+          //   see comments within 'convertImpl' for more details;
+          !detail::is_sint_to_from_uint<T, R>::value &&
+          // - destination type is not bool. bool is stored as integer under the
+          //   hood and therefore conversion to bool looks like conversion
+          //   between two integer types. Since bit pattern for true and false
+          //   is not defined, there is no guarantee that integer conversion
+          //   yields right results here;
+          !std::is_same_v<convertT, bool>;
+
+      if constexpr (canUseNativeVectorConvert) {
+        Result.m_Data = sycl::bit_cast<decltype(Result.m_Data)>(
+            detail::convertImpl<T, R, roundingMode, NumElements, OpenCLVecT,
+                                OpenCLVecR>(NativeVector));
+      } else
+#endif // __SYCL_DEVICE_ONLY__
+      {
+        // Otherwise, we fallback to per-element conversion:
+        for (size_t I = 0; I < NumElements; ++I) {
+          auto val =
+              detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
+                  getValue(I));
+          if constexpr (detail::is_byte_t<convertT>::value)
+            Result.setValue(I, static_cast<convertT>(val));
+          else
+            Result.setValue(I, val);
+        }
       }
     }
 
@@ -859,58 +520,10 @@ template <typename Type, int NumElements> class vec {
     return this;
   }
 
-  // ext_vector_type is used as an underlying type for sycl::vec on device.
-  // The problem is that for clang vector types the return of operator[] is a
-  // temporary and not a reference to the element in the vector. In practice
-  // reinterpret_cast<DataT *>(&m_Data)[i]; is working. According to
-  // http://llvm.org/docs/GetElementPtr.html#can-gep-index-into-vector-elements
-  // this is not disallowed now. But could probably be disallowed in the future.
-  // That is why tests are added to check that behavior of the compiler has
-  // not changed.
-  //
   // Implement operator [] in the same way for host and device.
-  // TODO: change host side implementation when underlying type for host side
-  // will be changed to std::array.
-  // NOTE: aliasing the incompatible types of bfloat16 may lead to problems if
-  // aggressively optimized. Specializing with noinline to avoid as workaround.
-
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                            const DataT &>
-  operator[](int i) const {
-    return reinterpret_cast<const DataT *>(&m_Data)[i];
-  }
-
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                            DataT &>
-  operator[](int i) {
-    return reinterpret_cast<DataT *>(&m_Data)[i];
-  }
-
-#ifdef _MSC_VER
-#define __SYCL_NOINLINE_BF16 __declspec(noinline)
-#else
-#define __SYCL_NOINLINE_BF16 __attribute__((noinline))
-#endif
-
-  template <typename T = DataT>
-  __SYCL_NOINLINE_BF16
-      typename std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                                const DataT &>
-      operator[](int i) const {
-    return reinterpret_cast<const DataT *>(&m_Data)[i];
-  }
-
-  template <typename T = DataT>
-  __SYCL_NOINLINE_BF16
-      typename std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                                DataT &>
-      operator[](int i) {
-    return reinterpret_cast<DataT *>(&m_Data)[i];
-  }
+  const DataT &operator[](int i) const { return m_Data[i]; }
 
-#undef __SYCL_NOINLINE_BF16
+  DataT &operator[](int i) { return m_Data[i]; }
 
   // Begin hi/lo, even/odd, xyzw, and rgba swizzles.
 private:
@@ -961,7 +574,7 @@ template <typename Type, int NumElements> class vec {
              multi_ptr<DataT, Space, DecorateAddress> Ptr) const {
     for (int I = 0; I < NumElements; I++) {
       *multi_ptr<DataT, Space, DecorateAddress>(Ptr + Offset * NumElements +
-                                                I) = getValue(I);
+                                                I) = m_Data[I];
     }
   }
   template <int Dimensions, access::mode Mode,
@@ -976,97 +589,132 @@ template <typename Type, int NumElements> class vec {
     store(Offset, MultiPtr);
   }
 
-  void ConvertToDataT() {
+#ifdef __SYCL_DEVICE_ONLY__
+  // Required only for vec<bool>: normalizes each element to true/false.
+  inline void ConvertToDataT() {
     for (size_t i = 0; i < NumElements; ++i) {
-      DataT tmp = getValue(i);
-      setValue(i, tmp);
+      m_Data[i] = bit_cast<int8_t>(m_Data[i]) != 0;
     }
   }
+#endif
 
-#ifdef __SYCL_BINOP
-#error "Undefine __SYCL_BINOP macro"
+  /******************* sycl::vec math operations ***********************/
+#if defined(__SYCL_BINOP) || defined(BINOP_BASE)
+#error "Undefine __SYCL_BINOP and BINOP_BASE macro"
 #endif
 
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
-  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
+#ifdef __SYCL_DEVICE_ONLY__
+#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND)                             \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec> operator BINOP(                \
+      const vec & Lhs, const vec & Rhs) {                                      \
     vec Ret;                                                                   \
-    if constexpr (IsUsingArrayOnDevice) {                                      \
+    if constexpr (IsBfloat16) {                                                \
       for (size_t I = 0; I < NumElements; ++I) {                               \
-        Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I)));              \
+        Ret[I] = Lhs[I] BINOP Rhs[I];                                          \
       }                                                                        \
     } else {                                                                   \
-      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
-      if constexpr (std::is_same_v<Type, bool> && CONVERT) {                   \
+      vector_t ExtVecLhs = sycl::bit_cast<vector_t>(Lhs);                      \
+      vector_t ExtVecRhs = sycl::bit_cast<vector_t>(Rhs);                      \
+      Ret = vec<DataT, NumElements>(ExtVecLhs BINOP ExtVecRhs);                \
+      if constexpr (std::is_same_v<DataT, bool> && CONVERT) {                  \
         Ret.ConvertToDataT();                                                  \
       }                                                                        \
     }                                                                          \
     return Ret;                                                                \
-  }                                                                            \
-  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
-    return Lhs BINOP vec(Rhs);                                                 \
-  }                                                                            \
-  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
-    return vec(Lhs) BINOP Rhs;                                                 \
-  }                                                                            \
-  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
-    Lhs = Lhs BINOP Rhs;                                                       \
-    return Lhs;                                                                \
-  }                                                                            \
-  template <int Num = NumElements>                                             \
-  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
-      vec & Lhs, const DataT & Rhs) {                                          \
-    Lhs = Lhs BINOP vec(Rhs);                                                  \
-    return Lhs;                                                                \
   }
+#else // __SYCL_DEVICE_ONLY__
 
-#else // __SYCL_USE_EXT_VECTOR_TYPE__
-
-#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
-  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
+#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND)                             \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec> operator BINOP(                \
+      const vec & Lhs, const vec & Rhs) {                                      \
     vec Ret{};                                                                 \
-    if constexpr (NativeVec)                                                   \
-      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
-    else                                                                       \
-      for (size_t I = 0; I < NumElements; ++I)                                 \
-        Ret.setValue(I, (DataT)(vec_data<DataT>::get(Lhs.getValue(             \
-                            I)) BINOP vec_data<DataT>::get(Rhs.getValue(I)))); \
+    for (size_t I = 0; I < NumElements; ++I) {                                 \
+      Ret[I] = Lhs[I] BINOP Rhs[I];                                            \
+    }                                                                          \
     return Ret;                                                                \
-  }                                                                            \
-  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
+  }
+#endif // __SYCL_DEVICE_ONLY__
+
+#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT, COND)                           \
+  BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND)                                   \
+                                                                               \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec> operator BINOP(                \
+      const vec & Lhs, const DataT & Rhs) {                                    \
     return Lhs BINOP vec(Rhs);                                                 \
   }                                                                            \
-  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec> operator BINOP(                \
+      const DataT & Lhs, const vec & Rhs) {                                    \
     return vec(Lhs) BINOP Rhs;                                                 \
   }                                                                            \
-  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec> &operator OPASSIGN(            \
+      vec & Lhs, const vec & Rhs) {                                            \
     Lhs = Lhs BINOP Rhs;                                                       \
     return Lhs;                                                                \
   }                                                                            \
-  template <int Num = NumElements>                                             \
-  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
-      vec & Lhs, const DataT & Rhs) {                                          \
+  template <int Num = NumElements, typename T = DataT>                         \
+  friend typename std::enable_if_t<(Num != 1) && (COND), vec &>                \
+  operator OPASSIGN(vec & Lhs, const DataT & Rhs) {                            \
     Lhs = Lhs BINOP vec(Rhs);                                                  \
     return Lhs;                                                                \
   }
 
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
+  // std::byte is not an arithmetic type.
+  __SYCL_BINOP(+, +=, true, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(-, -=, true, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(*, *=, false, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(/, /=, false, (!detail::is_byte_v<T>))
+
+  // The following OPs are available only when: DataT != cl_float &&
+  // DataT != cl_double && DataT != cl_half && DataT != BF16.
+  __SYCL_BINOP(%, %=, false,
+               (!detail::is_vgenfloat_v<T> && (!detail::is_byte_v<T>)))
+  // Bitwise operations are allowed for std::byte.
+  __SYCL_BINOP(|, |=, false, (!detail::is_vgenfloat_v<T>))
+  __SYCL_BINOP(&, &=, false, (!detail::is_vgenfloat_v<T>))
+  __SYCL_BINOP(^, ^=, false, (!detail::is_vgenfloat_v<T>))
+  __SYCL_BINOP(>>, >>=, false,
+               (!detail::is_vgenfloat_v<T> && (!detail::is_byte_v<T>)))
+  __SYCL_BINOP(<<, <<=, true,
+               (!detail::is_vgenfloat_v<T> && (!detail::is_byte_v<T>)))
+
+#undef BINOP_BASE
+#undef __SYCL_BINOP
 
-  __SYCL_BINOP(+, +=, true)
-  __SYCL_BINOP(-, -=, true)
-  __SYCL_BINOP(*, *=, false)
-  __SYCL_BINOP(/, /=, false)
+  // Special <<, >> operators for std::byte.
+  // std::byte is not an arithmetic type and it only supports the following
+  // overloads of >> and << operators.
+  //
+  // 1 template <class IntegerType>
+  //   constexpr std::byte operator<<( std::byte b, IntegerType shift )
+  //   noexcept;
+  // 2 template <class IntegerType>
+  //   constexpr std::byte operator>>( std::byte b, IntegerType shift )
+  //   noexcept;
+#define __SYCL_SHIFT_BYTE(OP, OPASSIGN)                                        \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(detail::is_byte_v<T>), vec> operator OP(   \
+      const vec & Lhs, int shift) {                                            \
+    vec Ret;                                                                   \
+    for (size_t I = 0; I < NumElements; ++I) {                                 \
+      Ret[I] = Lhs[I] OP shift;                                                \
+    }                                                                          \
+    return Ret;                                                                \
+  }                                                                            \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(detail::is_byte_v<T>), vec &>              \
+  operator OPASSIGN(vec & Lhs, int shift) {                                    \
+    Lhs = Lhs OP shift;                                                        \
+    return Lhs;                                                                \
+  }
 
-  // TODO: The following OPs are available only when: DataT != cl_float &&
-  // DataT != cl_double && DataT != cl_half
-  __SYCL_BINOP(%, %=, false)
-  __SYCL_BINOP(|, |=, false)
-  __SYCL_BINOP(&, &=, false)
-  __SYCL_BINOP(^, ^=, false)
-  __SYCL_BINOP(>>, >>=, false)
-  __SYCL_BINOP(<<, <<=, true)
-#undef __SYCL_BINOP
-#undef __SYCL_BINOP_HELP
+  __SYCL_SHIFT_BYTE(<<, <<=)
+  __SYCL_SHIFT_BYTE(>>, >>=)
+#undef __SYCL_SHIFT_BYTE
 
   // Note: vec<>/SwizzleOp logical value is 0/-1 logic, as opposed to 0/1 logic.
   // As far as CTS validation is concerned, 0/-1 logic also applies when
@@ -1075,299 +723,227 @@ template <typename Type, int NumElements> class vec {
   // TODO: Determine if vec<, NumElements=1> is needed at all, remove this
   // inconsistency if not by disallowing one-element vectors (as in OpenCL)
 
-#ifdef __SYCL_RELLOGOP
-#error "Undefine __SYCL_RELLOGOP macro"
+#if defined(__SYCL_RELLOGOP) || defined(RELLOGOP_BASE)
+#error "Undefine __SYCL_RELLOGOP and RELLOGOP_BASE macro."
 #endif
-// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined
-// by SYCL device compiler only.
+
 #ifdef __SYCL_DEVICE_ONLY__
-#define __SYCL_RELLOGOP(RELLOGOP)                                              \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const vec & Rhs) {          \
+#define RELLOGOP_BASE(RELLOGOP, COND)                                          \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec<rel_t, NumElements>>            \
+  operator RELLOGOP(const vec & Lhs, const vec & Rhs) {                        \
     vec<rel_t, NumElements> Ret{};                                             \
-    /* This special case is needed since there are no standard operator||   */ \
-    /* or operator&& functions for std::array.                              */ \
-    if constexpr (IsUsingArrayOnDevice &&                                      \
-                  (std::string_view(#RELLOGOP) == "||" ||                      \
-                   std::string_view(#RELLOGOP) == "&&")) {                     \
+    /* ext_vector_type does not support bfloat16, so for these   */            \
+    /* we do element-by-element operation on the underlying std::array.  */    \
+    if constexpr (IsBfloat16) {                                                \
       for (size_t I = 0; I < NumElements; ++I) {                               \
         /* We cannot use SetValue here as the operator is not a friend of*/    \
         /* Ret on Windows. */                                                  \
-        Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                    \
-            Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I)))); \
+        Ret[I] = static_cast<rel_t>(-(Lhs[I] RELLOGOP Rhs[I]));                \
       }                                                                        \
     } else {                                                                   \
+      vector_t ExtVecLhs = sycl::bit_cast<vector_t>(Lhs);                      \
+      vector_t ExtVecRhs = sycl::bit_cast<vector_t>(Rhs);                      \
+      /* Cast required to convert unsigned char ext_vec_type to */             \
+      /* char ext_vec_type. */                                                 \
       Ret = vec<rel_t, NumElements>(                                           \
           (typename vec<rel_t, NumElements>::vector_t)(                        \
-              Lhs.m_Data RELLOGOP Rhs.m_Data));                                \
-      if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/           \
+              ExtVecLhs RELLOGOP ExtVecRhs));                                  \
+      /* NumElements == 1 uses a scalar, so map the 0/1 result to 0/-1. */     \
+      if constexpr (NumElements == 1) {                                        \
         Ret *= -1;                                                             \
+      }                                                                        \
     }                                                                          \
     return Ret;                                                                \
-  }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const DataT & Rhs) {        \
-    return Lhs RELLOGOP vec(Rhs);                                              \
-  }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
-                                                   const vec & Rhs) {          \
-    return vec(Lhs) RELLOGOP Rhs;                                              \
   }
-
-#else
-#define __SYCL_RELLOGOP(RELLOGOP)                                              \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const vec & Rhs) {          \
+#else // __SYCL_DEVICE_ONLY__
+#define RELLOGOP_BASE(RELLOGOP, COND)                                          \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec<rel_t, NumElements>>            \
+  operator RELLOGOP(const vec & Lhs, const vec & Rhs) {                        \
+                                                                               \
     vec<rel_t, NumElements> Ret{};                                             \
     for (size_t I = 0; I < NumElements; ++I) {                                 \
-      /* We cannot use SetValue here as the operator is not a friend of*/      \
-      /* Ret on Windows. */                                                    \
-      Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                      \
-          Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));   \
+      Ret[I] = static_cast<rel_t>(-(Lhs[I] RELLOGOP Rhs[I]));                  \
     }                                                                          \
     return Ret;                                                                \
-  }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const DataT & Rhs) {        \
+  }
+#endif
+
+#define __SYCL_RELLOGOP(RELLOGOP, COND)                                        \
+  RELLOGOP_BASE(RELLOGOP, COND)                                                \
+                                                                               \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec<rel_t, NumElements>>            \
+  operator RELLOGOP(const vec & Lhs, const DataT & Rhs) {                      \
     return Lhs RELLOGOP vec(Rhs);                                              \
   }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
-                                                   const vec & Rhs) {          \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec<rel_t, NumElements>>            \
+  operator RELLOGOP(const DataT & Lhs, const vec & Rhs) {                      \
     return vec(Lhs) RELLOGOP Rhs;                                              \
   }
-#endif
 
-  __SYCL_RELLOGOP(==)
-  __SYCL_RELLOGOP(!=)
-  __SYCL_RELLOGOP(>)
-  __SYCL_RELLOGOP(<)
-  __SYCL_RELLOGOP(>=)
-  __SYCL_RELLOGOP(<=)
-  // TODO: limit to integral types.
-  __SYCL_RELLOGOP(&&)
-  __SYCL_RELLOGOP(||)
+  // OP is: ==, !=, <, >, <=, >=, &&, ||
+  // vec<RET, NumElements> operatorOP(const vec<DataT, NumElements> &Rhs) const;
+  // vec<RET, NumElements> operatorOP(const DataT &Rhs) const;
+  __SYCL_RELLOGOP(==, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(!=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(>, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(<, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(>=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(<=, (!detail::is_byte_v<T>))
+
+  // Only available to integral types.
+  __SYCL_RELLOGOP(&&, (!detail::is_vgenfloat_v<T>) && (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(||, (!detail::is_vgenfloat_v<T>) && (!detail::is_byte_v<T>))
 #undef __SYCL_RELLOGOP
 
+#undef RELLOGOP_BASE // Helper of __SYCL_RELLOGOP; must not leak to includers.
+// ++ and -- are unavailable for std::byte. FIXME: also disallow them for bool
+// after https://github.com/KhronosGroup/SYCL-CTS/issues/896 gets fixed.
 #ifdef __SYCL_UOP
 #error "Undefine __SYCL_UOP macro"
 #endif
-#define __SYCL_UOP(UOP, OPASSIGN)                                              \
-  friend vec &operator UOP(vec & Rhs) {                                        \
-    Rhs OPASSIGN vec_data<DataT>::get(1);                                      \
+#define __SYCL_UOP(UOP, OPASSIGN, COND)                                        \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec &> operator UOP(vec & Rhs) {    \
+    Rhs OPASSIGN DataT{1};                                                     \
     return Rhs;                                                                \
   }                                                                            \
-  friend vec operator UOP(vec &Lhs, int) {                                     \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec> operator UOP(vec & Lhs, int) { \
     vec Ret(Lhs);                                                              \
-    Lhs OPASSIGN vec_data<DataT>::get(1);                                      \
+    Lhs OPASSIGN DataT{1};                                                     \
     return Ret;                                                                \
   }
 
-  __SYCL_UOP(++, +=)
-  __SYCL_UOP(--, -=)
+  __SYCL_UOP(++, +=, (!detail::is_byte_v<T>))
+  __SYCL_UOP(--, -=, (!detail::is_byte_v<T>))
 #undef __SYCL_UOP
 
   // operator~() available only when: dataT != float && dataT != double
   // && dataT != half
-  friend vec operator~(const vec &Rhs) {
-    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      vec Ret{};
-      for (size_t I = 0; I < NumElements; ++I) {
-        Ret.setValue(I, ~Rhs.getValue(I));
-      }
-      return Ret;
-    } else {
-      vec Ret{(typename vec::DataType) ~Rhs.m_Data};
-      if constexpr (std::is_same_v<Type, bool>) {
-        Ret.ConvertToDataT();
-      }
-      return Ret;
+  template <typename T = DataT>
+  friend typename std::enable_if_t<!detail::is_vgenfloat_v<T>, vec>
+  operator~(const vec &Rhs) {
+#ifdef __SYCL_DEVICE_ONLY__
+    auto extVec = sycl::bit_cast<vector_t>(Rhs);
+    vec Ret{~extVec};
+    if constexpr (std::is_same_v<DataT, bool>) {
+      Ret.ConvertToDataT();
     }
+    return Ret;
+#else
+    vec Ret{};
+    for (size_t I = 0; I < NumElements; ++I) {
+      Ret[I] = ~Rhs[I];
+    }
+    return Ret;
+#endif
   }
 
-  // operator!
-  friend vec<detail::rel_t<DataT>, NumElements> operator!(const vec &Rhs) {
-    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      vec Ret{};
+  // operator!. Not available for std::byte.
+  template <typename T = DataT>
+  friend typename std::enable_if_t<(!detail::is_byte_v<T>),
+                                   vec<detail::rel_t<DataT>, NumElements>>
+  operator!(const vec &Rhs) {
+#ifdef __SYCL_DEVICE_ONLY__
+    if constexpr (!std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>) {
+      auto extVec = sycl::bit_cast<vector_t>(Rhs);
+      vec<detail::rel_t<DataT>, NumElements> Ret{
+          (typename vec<rel_t, NumElements>::vector_t) !extVec};
+      return Ret;
+    } else
+#endif // __SYCL_DEVICE_ONLY__
+    {
+      vec<detail::rel_t<DataT>, NumElements> Ret{};
       for (size_t I = 0; I < NumElements; ++I) {
-#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-        // std::byte neither supports ! unary op or casting, so special handling
-        // is needed. And, worse, Windows has a conflict with 'byte'.
-        if constexpr (std::is_same_v<std::byte, DataT>) {
-          Ret.setValue(I, std::byte{!vec_data<DataT>::get(Rhs.getValue(I))});
-        } else
-#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-        {
-          Ret.setValue(I, !vec_data<DataT>::get(Rhs.getValue(I)));
-        }
+        // static_cast will work here as the ! operator yields 0 or 1, so the
+        // product with -1 is either 0 or -1.
+        Ret[I] = static_cast<detail::rel_t<DataT>>(-1 * (!Rhs[I]));
       }
-      return Ret.template as<vec<detail::rel_t<DataT>, NumElements>>();
-    } else {
-      return vec{(typename vec<DataT, NumElements>::DataType) !Rhs.m_Data}
-          .template as<vec<detail::rel_t<DataT>, NumElements>>();
+      return Ret;
     }
   }
 
-  // operator +
-  friend vec operator+(const vec &Lhs) {
-    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      vec Ret{};
-      for (size_t I = 0; I < NumElements; ++I)
-        Ret.setValue(
-            I, vec_data<DataT>::get(+vec_data<DataT>::get(Lhs.getValue(I))));
-      return Ret;
-    } else {
-      return vec{+Lhs.m_Data};
-    }
+  // operator +. Not available for std::byte as it is not an arithmetic type.
+  template <typename T = DataT>
+  friend typename std::enable_if_t<(!detail::is_byte_v<T>), vec>
+  operator+(const vec &Lhs) {
+#ifdef __SYCL_DEVICE_ONLY__
+    auto extVec = sycl::bit_cast<vector_t>(Lhs);
+    return vec{+extVec};
+#else
+    vec Ret{};
+    for (size_t I = 0; I < NumElements; ++I)
+      Ret[I] = +Lhs[I];
+    return Ret;
+#endif
   }
 
-  // operator -
-  friend vec operator-(const vec &Lhs) {
+  // operator -. Not available for std::byte as it is not an arithmetic type.
+  template <typename T = DataT>
+  friend typename std::enable_if_t<(!detail::is_byte_v<T>), vec>
+  operator-(const vec &Lhs) {
     namespace oneapi = sycl::ext::oneapi;
     vec Ret{};
-    if constexpr (IsBfloat16 && NumElements == 1) {
-      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data);
-      oneapi::bfloat16 w = -v;
-      Ret.m_Data = oneapi::detail::bfloat16ToBits(w);
-    } else if constexpr (IsBfloat16) {
-      for (size_t I = 0; I < NumElements; I++) {
-        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]);
-        oneapi::bfloat16 w = -v;
-        Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w);
-      }
-    } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      for (size_t I = 0; I < NumElements; ++I)
-        Ret.setValue(
-            I, vec_data<DataT>::get(-vec_data<DataT>::get(Lhs.getValue(I))));
-      return Ret;
+    if constexpr (IsBfloat16) {
+      for (size_t I = 0; I < NumElements; I++)
+        Ret[I] = -Lhs[I];
     } else {
-      Ret = vec{-Lhs.m_Data};
-      if constexpr (std::is_same_v<Type, bool>) {
+#ifndef __SYCL_DEVICE_ONLY__
+      for (size_t I = 0; I < NumElements; ++I)
+        Ret[I] = -Lhs[I];
+#else
+      auto extVec = sycl::bit_cast<vector_t>(Lhs);
+      Ret = vec{-extVec};
+      if constexpr (std::is_same_v<DataT, bool>) {
         Ret.ConvertToDataT();
       }
-      return Ret;
+#endif
     }
+    return Ret;
   }
 
-  // OP is: &&, ||
-  // vec<RET, NumElements> operatorOP(const vec<DataT, NumElements> &Rhs) const;
-  // vec<RET, NumElements> operatorOP(const DataT &Rhs) const;
-
-  // OP is: ==, !=, <, >, <=, >=
-  // vec<RET, NumElements> operatorOP(const vec<DataT, NumElements> &Rhs) const;
-  // vec<RET, NumElements> operatorOP(const DataT &Rhs) const;
 private:
-  // Generic method that execute "Operation" on underlying values.
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  template <template <typename> class Operation,
-            typename Ty = vec<DataT, NumElements>>
-  vec<DataT, NumElements>
-  operatorHelper(const EnableIfNotUsingArrayOnDevice<Ty> &Rhs) const {
-    vec<DataT, NumElements> Result;
-    Operation<DataType> Op;
-    Result.m_Data = Op(m_Data, Rhs.m_Data);
-    return Result;
-  }
-
-  template <template <typename> class Operation,
-            typename Ty = vec<DataT, NumElements>>
-  vec<DataT, NumElements>
-  operatorHelper(const EnableIfUsingArrayOnDevice<Ty> &Rhs) const {
-    vec<DataT, NumElements> Result;
-    Operation<DataT> Op;
-    for (size_t I = 0; I < NumElements; ++I) {
-      Result.setValue(I, Op(Rhs.getValue(I), getValue(I)));
-    }
-    return Result;
-  }
-#else  // __SYCL_USE_EXT_VECTOR_TYPE__
-  template <template <typename> class Operation>
-  vec<DataT, NumElements>
-  operatorHelper(const vec<DataT, NumElements> &Rhs) const {
-    vec<DataT, NumElements> Result;
-    Operation<DataT> Op;
-    for (size_t I = 0; I < NumElements; ++I) {
-      Result.setValue(I, Op(Rhs.getValue(I), getValue(I)));
-    }
-    return Result;
-  }
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
   // setValue and getValue should be able to operate on different underlying
   // types: enum cl_float#N , builtin vector float#N, builtin type float.
-  // These versions are for N > 1.
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr void setValue(EnableIfNotHostHalf<Ty> Index, const DataT &Value,
-                          int) {
-    m_Data[Index] = vec_data<DataT>::set(Value);
-  }
+  constexpr auto getValue(int Index) const {
 
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr DataT getValue(EnableIfNotHostHalf<Ty> Index, int) const {
-    return vec_data<DataT>::get(m_Data[Index]);
-  }
-
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr void setValue(EnableIfHostHalf<Ty> Index, const DataT &Value, int) {
-    m_Data.s[Index] = vec_data<DataT>::set(Value);
-  }
-
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr DataT getValue(EnableIfHostHalf<Ty> Index, int) const {
-    return vec_data<DataT>::get(m_Data.s[Index]);
-  }
-#else  // __SYCL_USE_EXT_VECTOR_TYPE__
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr void setValue(int Index, const DataT &Value, int) {
-    m_Data[Index] = vec_data<DataT>::set(Value);
-  }
-
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr DataT getValue(int Index, int) const {
-    return vec_data<DataT>::get(m_Data[Index]);
-  }
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  // N==1 versions, used by host and device. Shouldn't trailing type be int?
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 == Num>>
-  constexpr void setValue(int, const DataT &Value, float) {
-    m_Data = vec_data<DataT>::set(Value);
-  }
+    using RetType =
+        typename std::conditional_t<detail::is_byte_v<DataT>, int8_t,
+#ifdef __SYCL_DEVICE_ONLY__
+                                    detail::element_type_for_vector_t<DataT>
+#else
+                                    DataT
+#endif
+                                    >;
 
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 == Num>>
-  DataT getValue(int, float) const {
-    return vec_data<DataT>::get(m_Data);
+#ifdef __SYCL_DEVICE_ONLY__
+    if constexpr (std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>)
+      return sycl::bit_cast<RetType>(m_Data[Index]);
+    else
+#endif
+      return static_cast<RetType>(m_Data[Index]);
   }
 
-  // setValue and getValue.
-  // The "api" functions used by BINOP etc.  These versions just dispatch
-  // using additional int or float arg to disambiguate vec<1> vs. vec<N>
-  // Special proxies as specialization is not allowed in class scope.
   constexpr void setValue(int Index, const DataT &Value) {
-    if (NumElements == 1)
-      setValue(Index, Value, 0);
+#ifdef __SYCL_DEVICE_ONLY__
+    if constexpr (std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>)
+      m_Data[Index] = sycl::bit_cast<DataT>(Value);
     else
-      setValue(Index, Value, 0.f);
-  }
-
-  DataT getValue(int Index) const {
-    return (NumElements == 1) ? getValue(Index, 0) : getValue(Index, 0.f);
+#endif
+      m_Data[Index] = Value;
   }
 
   // fields
-
-  // Alignment is the same as size, to a maximum size of 64.
-  // detail::vector_alignment will return that value.
-  alignas(detail::vector_alignment<DataT, NumElements>::value) DataType m_Data;
+  // Alignment is the same as size, to a maximum size of 64. SPEC requires
+  // "The elements of an instance of the SYCL vec class template are stored
+  // in memory sequentially and contiguously and are aligned to the size of
+  // the element type in bytes multiplied by the number of elements."
+  static constexpr int alignment = std::min((size_t)64, sizeof(DataType));
+  alignas(alignment) DataType m_Data;
 
   // friends
   template <typename T1, typename T2, typename T3, template <typename> class T4,
@@ -1535,13 +1111,13 @@ class SwizzleOp {
   template <typename T>
   using EnableIfScalarType = typename std::enable_if_t<
       std::is_convertible_v<DataT, T> &&
-      (std::is_fundamental_v<vec_data_t<T>> ||
+      (std::is_fundamental_v<T> ||
        detail::is_half_or_bf16_v<typename std::remove_const_t<T>>)>;
 
   template <typename T>
   using EnableIfNoScalarType = typename std::enable_if_t<
       !std::is_convertible_v<DataT, T> ||
-      !(std::is_fundamental_v<vec_data_t<T>> ||
+      !(std::is_fundamental_v<T> ||
         detail::is_half_or_bf16_v<typename std::remove_const_t<T>>)>;
 
   template <int... Indices>
@@ -1661,7 +1237,7 @@ class SwizzleOp {
 
   template <typename T = DataT>
   friend typename std::enable_if_t<
-      std::is_same_v<T, DataT> && std::is_integral_v<vec_data_t<T>>, vec_t>
+      std::is_same_v<T, DataT> && !detail::is_vgenfloat_v<T>, vec_t>
   operator~(const SwizzleOp &Rhs) {
     vec_t Tmp = Rhs;
     return ~Tmp;
@@ -1688,34 +1264,57 @@ class SwizzleOp {
 #ifdef __SYCL_BINOP
 #error "Undefine __SYCL_BINOP macro"
 #endif
-#define __SYCL_BINOP(BINOP)                                                    \
-  friend vec_t operator BINOP(const DataT &Lhs, const SwizzleOp &Rhs) {        \
+#define __SYCL_BINOP(BINOP, COND)                                              \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_t> operator BINOP(              \
+      const DataT & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs BINOP Tmp;                                                      \
   }                                                                            \
-  friend vec_t operator BINOP(const SwizzleOp &Lhs, const DataT &Rhs) {        \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_t> operator BINOP(              \
+      const SwizzleOp & Lhs, const DataT & Rhs) {                              \
     vec_t Tmp = Lhs;                                                           \
     return Tmp BINOP Rhs;                                                      \
   }                                                                            \
-  friend vec_t operator BINOP(const vec_t &Lhs, const SwizzleOp &Rhs) {        \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_t> operator BINOP(              \
+      const vec_t & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs BINOP Tmp;                                                      \
   }                                                                            \
-  friend vec_t operator BINOP(const SwizzleOp &Lhs, const vec_t &Rhs) {        \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_t> operator BINOP(              \
+      const SwizzleOp & Lhs, const vec_t & Rhs) {                              \
     vec_t Tmp = Lhs;                                                           \
     return Tmp BINOP Rhs;                                                      \
   }
 
-  __SYCL_BINOP(+)
-  __SYCL_BINOP(-)
-  __SYCL_BINOP(*)
-  __SYCL_BINOP(/)
-  __SYCL_BINOP(%)
-  __SYCL_BINOP(&)
-  __SYCL_BINOP(|)
-  __SYCL_BINOP(^)
-  __SYCL_BINOP(>>)
-  __SYCL_BINOP(<<)
+  __SYCL_BINOP(+, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(-, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(*, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(/, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(%, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(&, true)
+  __SYCL_BINOP(|, true)
+  __SYCL_BINOP(^, true)
+  // We have special <<, >> operators for std::byte.
+  __SYCL_BINOP(>>, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(<<, (!detail::is_byte_v<T>))
+
+  template <typename T = DataT>
+  friend typename std::enable_if_t<detail::is_byte_v<T>, vec_t>
+  operator>>(const SwizzleOp &Lhs, const int shift) {
+    vec_t Tmp = Lhs;
+    return Tmp >> shift;
+  }
+
+  template <typename T = DataT>
+  friend typename std::enable_if_t<detail::is_byte_v<T>, vec_t>
+  operator<<(const SwizzleOp &Lhs, const int shift) {
+    vec_t Tmp = Lhs;
+    return Tmp << shift;
+  }
 #undef __SYCL_BINOP
 
 // scalar RELLOGOP vec<>
@@ -1724,33 +1323,40 @@ class SwizzleOp {
 #ifdef __SYCL_RELLOGOP
 #error "Undefine __SYCL_RELLOGOP macro"
 #endif
-#define __SYCL_RELLOGOP(RELLOGOP)                                              \
-  friend vec_rel_t operator RELLOGOP(const DataT &Lhs, const SwizzleOp &Rhs) { \
+#define __SYCL_RELLOGOP(RELLOGOP, COND)                                        \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(       \
+      const DataT & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs RELLOGOP Tmp;                                                   \
   }                                                                            \
-  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const DataT &Rhs) { \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(       \
+      const SwizzleOp & Lhs, const DataT & Rhs) {                              \
     vec_t Tmp = Lhs;                                                           \
     return Tmp RELLOGOP Rhs;                                                   \
   }                                                                            \
-  friend vec_rel_t operator RELLOGOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(       \
+      const vec_t & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs RELLOGOP Tmp;                                                   \
   }                                                                            \
-  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \
+  template <typename T = DataT>                                                \
+  friend typename std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(       \
+      const SwizzleOp & Lhs, const vec_t & Rhs) {                              \
     vec_t Tmp = Lhs;                                                           \
     return Tmp RELLOGOP Rhs;                                                   \
   }
 
-  __SYCL_RELLOGOP(==)
-  __SYCL_RELLOGOP(!=)
-  __SYCL_RELLOGOP(>)
-  __SYCL_RELLOGOP(<)
-  __SYCL_RELLOGOP(>=)
-  __SYCL_RELLOGOP(<=)
-  // TODO: limit to integral types.
-  __SYCL_RELLOGOP(&&)
-  __SYCL_RELLOGOP(||)
+  __SYCL_RELLOGOP(==, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(!=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(>, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(<, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(>=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(<=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(&&, (!detail::is_byte_v<T> && !detail::is_vgenfloat_v<T>))
+  __SYCL_RELLOGOP(||, (!detail::is_byte_v<T> && !detail::is_vgenfloat_v<T>))
 #undef __SYCL_RELLOGOP
 
   template <int IdxNum = getNumElements(),
@@ -2140,32 +1746,29 @@ class SwizzleOp {
       std::array<int, getNumElements()> Idxs{Indexes...};
       return m_Vector->getValue(Idxs[Index]);
     }
-    auto Op = OperationCurrentT<vec_data_t<CommonDataT>>();
-    return vec_data<CommonDataT>::get(
-        Op(vec_data<CommonDataT>::get(m_LeftOperation.getValue(Index)),
-           vec_data<CommonDataT>::get(m_RightOperation.getValue(Index))));
+    auto Op = OperationCurrentT<CommonDataT>();
+    return Op(m_LeftOperation.getValue(Index),
+              m_RightOperation.getValue(Index));
   }
 
   template <int IdxNum = getNumElements()>
   DataT getValue(EnableIfMultipleIndexes<IdxNum, size_t> Index) const {
     if (std::is_same_v<OperationCurrentT<DataT>, GetOp<DataT>>) {
       std::array<int, getNumElements()> Idxs{Indexes...};
-      return m_Vector->getValue(Idxs[Index]);
+      // Cast required for int8_t -> std::byte
+      return static_cast<DataT>(m_Vector->getValue(Idxs[Index]));
     }
-    auto Op = OperationCurrentT<vec_data_t<DataT>>();
-    return vec_data<DataT>::get(
-        Op(vec_data<DataT>::get(m_LeftOperation.getValue(Index)),
-           vec_data<DataT>::get(m_RightOperation.getValue(Index))));
+    auto Op = OperationCurrentT<DataT>();
+    return Op(m_LeftOperation.getValue(Index),
+              m_RightOperation.getValue(Index));
   }
 
   template <template <typename> class Operation, typename RhsOperation>
   void operatorHelper(const RhsOperation &Rhs) {
-    Operation<vec_data_t<DataT>> Op;
+    Operation<DataT> Op;
     std::array<int, getNumElements()> Idxs{Indexes...};
     for (size_t I = 0; I < Idxs.size(); ++I) {
-      DataT Res = vec_data<DataT>::get(
-          Op(vec_data<DataT>::get(m_Vector->getValue(Idxs[I])),
-             vec_data<DataT>::get(Rhs.getValue(I))));
+      DataT Res = Op(m_Vector->getValue(Idxs[I]), Rhs.getValue(I));
       m_Vector->setValue(Idxs[I], Res);
     }
   }
diff --git a/sycl/test-e2e/Basic/vector/byte.cpp b/sycl/test-e2e/Basic/vector/byte.cpp
index a2d70d1a0ba31..052e1b14269fd 100644
--- a/sycl/test-e2e/Basic/vector/byte.cpp
+++ b/sycl/test-e2e/Basic/vector/byte.cpp
@@ -18,7 +18,7 @@
 #include <tuple>   // std::ignore
 
 int main() {
-  std::byte bt{7};
+  std::byte bt{2};
   // constructors
   sycl::vec<std::byte, 1> vb1(bt);
   sycl::vec<std::byte, 2> vb2{bt, bt};
@@ -46,8 +46,20 @@ int main() {
     auto cnv = vi2.convert<std::byte>();
     auto cnv2 = vb1.convert<int>();
 
+    assert(cnv[0] == std::byte{1} && cnv[1] == std::byte{1});
+    assert(cnv2[0] == 3);
+
     auto asint = vb2.template as<sycl::vec<int16_t, 1>>();
     auto asbyte = vi2.template as<sycl::vec<std::byte, 8>>();
+
+    // 0000 0010 0000 0010 = 514
+    assert(asint[0] == 514);
+
+    // 0000 0000 0000 0001 0000 0000 0000 0001
+    assert(asbyte[0] == std::byte{1} && asbyte[1] == std::byte{0} &&
+           asbyte[2] == std::byte{0} && asbyte[3] == std::byte{0} &&
+           asbyte[4] == std::byte{1} && asbyte[5] == std::byte{0} &&
+           asbyte[6] == std::byte{0} && asbyte[7] == std::byte{0});
   }
 
   // load() and store()
@@ -77,7 +89,7 @@ int main() {
         .wait();
   }
   assert(std_vec[0] == std::byte{2});
-  assert(std_vec[1] == std::byte{7});
+  assert(std_vec[1] == std::byte{2});
 
   // swizzle
   {
@@ -91,212 +103,276 @@ int main() {
     // hi/lo, even/odd
     sycl::vec<std::byte, 4> vbsw(std::byte{0}, std::byte{1}, std::byte{2},
                                  std::byte{3});
+
     sycl::vec<std::byte, 2> vbswhi = vbsw.hi();
-    assert(vbswhi[0] == std::byte{2});
+    assert(vbswhi[0] == std::byte{2} && vbswhi[1] == std::byte{3});
+
     vbswhi = vbsw.lo();
+    assert(vbswhi[0] == std::byte{0} && vbswhi[1] == std::byte{1});
+
     vbswhi = vbsw.odd();
+    assert(vbswhi[0] == std::byte{1} && vbswhi[1] == std::byte{3});
+
     vbswhi = vbsw.even();
+    assert(vbswhi[0] == std::byte{0} && vbswhi[1] == std::byte{2});
   }
 
   // operatorOP for vec and for swizzle
   {
-    sycl::vec<std::byte, 3> vop1{std::byte{4}, std::byte{9}, std::byte{25}};
-    sycl::vec<std::byte, 3> vop2{std::byte{2}, std::byte{3}, std::byte{5}};
-    sycl::vec<std::byte, 4> vop3{std::byte{5}, std::byte{6}, std::byte{2},
-                                 std::byte{3}};
-
-    // binary op for 2 vec
-    auto vop = vop1 + vop2;
-    assert(vop[0] == std::byte{6});
-    vop = vop1 - vop2;
-    vop = vop1 * vop2;
-    vop = vop1 / vop2;
-    assert(vop[0] == std::byte{2});
-    vop = vop1 % vop2;
-
-    // binary op for 2 swizzle
-    auto swlo = vop3.lo();
-    auto swhi = vop3.hi();
-    auto swplus = swlo + swhi;
-    sycl::vec<std::byte, 2> vec_test = swplus;
-    assert(vec_test.x() == std::byte{7} && vec_test.y() == std::byte{9});
-    auto swominus = swlo - swhi;
-    auto swmul = swlo * swhi;
-    vec_test = swmul;
-    assert(vec_test.x() == std::byte{10} && vec_test.y() == std::byte{18});
-    auto swdiv = swlo / swhi;
-
-    // binary op for 1 vec
-    vop = vop1 + std::byte{3};
-    vop = vop1 - std::byte{3};
-    assert(vop[1] == std::byte{6});
-    vop = vop1 * std::byte{3};
-    vop = vop1 / std::byte{3};
-    vop = vop1 % std::byte{3};
-    assert(vop[0] == std::byte{1});
-
-    vop = std::byte{3} + vop1;
-    assert(vop[0] == std::byte{7});
-    vop = std::byte{3} - vop1;
-    vop = std::byte{3} * vop1;
-    assert(vop[2] == std::byte{75});
-    vop = std::byte{3} / vop1;
-
-    // binary op for 1 swizzle
-    auto swplus1 = swlo + std::byte{3};
-    auto swminus1 = swlo - std::byte{3};
-    vec_test = swminus1;
-    assert(vec_test.x() == std::byte{2} && vec_test.y() == std::byte{3});
-    auto swmul1 = swlo * std::byte{3};
-    auto swdiv1 = swlo / std::byte{3};
-    vec_test = swdiv1;
-    assert(vec_test.x() == std::byte{1} && vec_test.y() == std::byte{2});
-
-    auto swplus2 = std::byte{3} + swlo;
-    vec_test = swplus2;
-    assert(vec_test.x() == std::byte{8} && vec_test.y() == std::byte{9});
-    auto swminus2 = std::byte{3} - swlo;
-    auto swmul2 = std::byte{3} * swlo;
-    vec_test = swmul2;
-    assert(vec_test.x() == std::byte{15} && vec_test.y() == std::byte{18});
-    auto swdiv2 = std::byte{3} / swlo;
-
-    // operatorOP= for 2 vec
-    sycl::vec<std::byte, 3> vbuf{std::byte{4}, std::byte{5}, std::byte{6}};
-    vop = vbuf += vop1;
-    assert(vop[0] == std::byte{8});
-    vop = vbuf -= vop1;
-    vop = vbuf *= vop1;
-    vop = vbuf /= vop1;
-    vop = vbuf %= vop1;
-
-    // operatorOP= for 2 swizzle
-    swlo += swhi;
-    swlo -= swhi;
-    vec_test = swlo;
-    assert(vec_test.x() == std::byte{5} && vec_test.y() == std::byte{6});
-    swlo *= swhi;
-    swlo /= swhi;
-    swlo %= swhi;
-
-    // operatorOP= for 1 vec
-    vop = vop1 += std::byte{3};
-    assert(vop[0] == std::byte{7});
-    vop = vop1 -= std::byte{3};
-    vop = vop1 *= std::byte{3};
-    vop = vop1 /= std::byte{3};
-    vop = vop1 %= std::byte{3};
-
-    // operatorOP= for 1 swizzle
-
-    swlo += std::byte{3};
-    swlo -= std::byte{1};
-    vec_test = swlo;
-    assert(vec_test.x() == std::byte{3} && vec_test.y() == std::byte{2});
-    swlo *= std::byte{3};
-    swlo /= std::byte{3};
-    swlo %= std::byte{3};
-
-    // unary operator++ and -- for vec
-    vop1 = sycl::vec<std::byte, 3>(std::byte{4}, std::byte{9}, std::byte{25});
-    vop1++;
-    vop1--;
-    vop = ++vop1;
-    assert(vop[2] == std::byte{26});
-    --vop1;
-
-    // unary operator++ and -- for swizzle
-    swlo++;
-    swlo--;
-    vec_test = swlo;
-    assert(vec_test.x() == std::byte{0} && vec_test.y() == std::byte{2});
-
-    // logical binary op for 2 vec
-    vop = vop1 & vop2;
-    vop = vop1 | vop2;
-    vop = vop1 ^ vop2;
-
-    // logical binary op for 2 swizzle
-    auto swand = swlo & swhi;
-    auto swor = swlo | swhi;
-    auto swxor = swlo ^ swhi;
-
-    // logical binary op for 1 vec
-    vop = vop1 & std::byte{3};
-    vop = vop1 | std::byte{3};
-    vop = vop1 ^ std::byte{3};
-    vop = std::byte{3} & vop1;
-    vop = std::byte{3} | vop1;
-    vop = std::byte{3} ^ vop1;
-
-    // logical binary op for 1 swizzle
-    auto swand2 = swlo & std::byte{3};
-    auto swor2 = swlo | std::byte{3};
-    auto swxor2 = swlo ^ std::byte{3};
-
-    auto swand3 = std::byte{3} & swlo;
-    auto swor3 = std::byte{3} | swlo;
-    auto swxor3 = std::byte{3} ^ swlo;
-
-    // bit binary op for 2 vec
-    vop = vop1 && vop2;
-    vop = vop1 || vop2;
-    vop = vop1 >> vop2;
-    vop = vop1 << vop2;
-
-    vop = vop1 >> std::byte{3};
-    vop = vop1 << std::byte{3};
-    vop = std::byte{3} >> vop1;
-    vop = std::byte{3} << vop1;
-
-    // bit binary op for 2 swizzle
-    swlo >> swhi;
-    swlo << swhi;
-    swlo >> std::byte{3};
-    swlo << std::byte{3};
-    auto right = std::byte{3} >> swhi;
-    auto left = std::byte{3} << swhi;
-
-    // condition op for 2 vec
-    auto vres = vop1 == vop2;
-    vres = vop1 != vop2;
-    vres = vop1 > vop2;
-    vres = vop1 < vop2;
-    vres = vop1 >= vop2;
-    vres = vop1 <= vop2;
-
-    vres = vop1 == std::byte{3};
-    vres = vop1 != std::byte{3};
-    vres = vop1 > std::byte{3};
-    vres = vop1 < std::byte{3};
-    vres = vop1 >= std::byte{3};
-    vres = vop1 <= std::byte{3};
-
-    vres = std::byte{3} == vop1;
-    vres = std::byte{3} != vop1;
-    vres = std::byte{3} > vop1;
-    vres = std::byte{3} < vop1;
-    vres = std::byte{3} >= vop1;
-    vres = std::byte{3} <= vop1;
-
-    // condition op for 2 swizzle
-    auto swres = swhi == swlo;
-    auto swres1 = swhi != swlo;
-    auto swres2 = swhi > swlo;
-    auto swres3 = swhi < swlo;
-    auto swres4 = swhi >= swlo;
-    auto swres5 = swhi <= swlo;
-    auto swres6 = swhi == std::byte{3};
-    auto swres7 = swhi != std::byte{3};
-    auto swres8 = swhi > std::byte{3};
-    auto swres9 = swhi < std::byte{3};
-    auto swres10 = swhi >= std::byte{3};
-    auto swres11 = swhi <= std::byte{3};
-
-    sycl::vec<std::byte, 3> voptest{std::byte{4}, std::byte{9}, std::byte{25}};
-    auto bitv1 = ~vop3;
-    auto bitv2 = !vop3;
-    auto bitw = ~swhi;
+    sycl::vec<std::byte, 3> VecByte3A{std::byte{4}, std::byte{9},
+                                      std::byte{25}};
+    sycl::vec<std::byte, 3> VecByte3B{std::byte{2}, std::byte{3}, std::byte{5}};
+    sycl::vec<std::byte, 4> VecByte4A{std::byte{5}, std::byte{6}, std::byte{2},
+                                      std::byte{3}};
+
+    // Test bitwise operations on vec<std::byte> and swizzles.
+    // Adding asserts on vec<> operations, and not swizzle operations,
+    // should suffice as swizzles just delegate the operation to the vec<>
+    // class.
+    {
+      auto SwizByte2A = VecByte4A.lo();
+      auto SwizByte2B = VecByte4A.hi();
+
+      // bitwise binary op for 2 vec
+      auto VecByte3And = VecByte3A & VecByte3B;
+      auto VecByte3Or = VecByte3A | VecByte3B;
+      auto VecByte3Xor = VecByte3A ^ VecByte3B;
+      assert(VecByte3And[0] == (VecByte3A[0] & VecByte3B[0]));
+      assert(VecByte3Or[1] == (VecByte3A[1] | VecByte3B[1]));
+      assert(VecByte3Xor[2] == (VecByte3A[2] ^ VecByte3B[2]));
+
+      // bitwise binary op for 2 swizzle
+      auto swand = SwizByte2A & SwizByte2B;
+      auto swor = SwizByte2A | SwizByte2B;
+      auto swxor = SwizByte2A ^ SwizByte2B;
+
+      // Check order of operands for bitwise operators.
+      auto BitWiseAnd1 = VecByte3A & std::byte{3};
+      auto BitWiseOr1 = VecByte3A | std::byte{3};
+      auto BitWiseXor1 = VecByte3A ^ std::byte{3};
+      auto BitWiseAnd2 = std::byte{3} & VecByte3A;
+      auto BitWiseOr2 = std::byte{3} | VecByte3A;
+      auto BitWiseXor2 = std::byte{3} ^ VecByte3A;
+      assert(BitWiseAnd1[0] == BitWiseAnd2[0]);
+      assert(BitWiseOr1[1] == BitWiseOr2[1]);
+      assert(BitWiseXor1[2] == BitWiseXor2[2]);
+
+      // bitwise binary op for 1 swizzle
+      auto swand2 = SwizByte2A & std::byte{3};
+      auto swor2 = SwizByte2A | std::byte{3};
+      auto swxor2 = SwizByte2A ^ std::byte{3};
+
+      auto swand3 = std::byte{3} & SwizByte2A;
+      auto swor3 = std::byte{3} | SwizByte2A;
+      auto swxor3 = std::byte{3} ^ SwizByte2A;
+
+      // bit-wise negation test
+      auto VecByte4Neg = ~VecByte4A;
+      assert(VecByte4Neg[0] == ~VecByte4A[0]);
+
+      auto bitw = ~SwizByte2B;
+    }
+
+// std::byte is not an arithmetic type or a character type, so std::byte and
+// vec<std::byte> should not support arithmetic operations. In the new
+// implementation of vec<> class, the following will be removed.
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+    {
+      // binary op for 2 vec
+      auto vop = VecByte3A + VecByte3B;
+      assert(vop[0] == std::byte{6});
+      vop = VecByte3A - VecByte3B;
+      vop = VecByte3A * VecByte3B;
+      vop = VecByte3A / VecByte3B;
+      assert(vop[0] == std::byte{2});
+      vop = VecByte3A % VecByte3B;
+
+      // binary op for 2 swizzle
+      auto swlo = VecByte4A.lo();
+      auto swhi = VecByte4A.hi();
+      auto swplus = swlo + swhi;
+      sycl::vec<std::byte, 2> vec_test = swplus;
+      assert(vec_test.x() == std::byte{7} && vec_test.y() == std::byte{9});
+      auto swominus = swlo - swhi;
+      auto swmul = swlo * swhi;
+      vec_test = swmul;
+      assert(vec_test.x() == std::byte{10} && vec_test.y() == std::byte{18});
+      auto swdiv = swlo / swhi;
+
+      // binary op for 1 vec
+      vop = VecByte3A + std::byte{3};
+      vop = VecByte3A - std::byte{3};
+      assert(vop[1] == std::byte{6});
+      vop = VecByte3A * std::byte{3};
+      vop = VecByte3A / std::byte{3};
+      vop = VecByte3A % std::byte{3};
+      assert(vop[0] == std::byte{1});
+
+      vop = std::byte{3} + VecByte3A;
+      assert(vop[0] == std::byte{7});
+      vop = std::byte{3} - VecByte3A;
+      vop = std::byte{3} * VecByte3A;
+      assert(vop[2] == std::byte{75});
+      vop = std::byte{3} / VecByte3A;
+
+      // binary op for 1 swizzle
+      auto swplus1 = swlo + std::byte{3};
+      auto swminus1 = swlo - std::byte{3};
+      vec_test = swminus1;
+      assert(vec_test.x() == std::byte{2} && vec_test.y() == std::byte{3});
+      auto swmul1 = swlo * std::byte{3};
+      auto swdiv1 = swlo / std::byte{3};
+      vec_test = swdiv1;
+      assert(vec_test.x() == std::byte{1} && vec_test.y() == std::byte{2});
+
+      auto swplus2 = std::byte{3} + swlo;
+      vec_test = swplus2;
+      assert(vec_test.x() == std::byte{8} && vec_test.y() == std::byte{9});
+      auto swminus2 = std::byte{3} - swlo;
+      auto swmul2 = std::byte{3} * swlo;
+      vec_test = swmul2;
+      assert(vec_test.x() == std::byte{15} && vec_test.y() == std::byte{18});
+      auto swdiv2 = std::byte{3} / swlo;
+
+      // operatorOP= for 2 vec
+      sycl::vec<std::byte, 3> vbuf{std::byte{4}, std::byte{5}, std::byte{6}};
+      vop = vbuf += VecByte3A;
+      assert(vop[0] == std::byte{8});
+      vop = vbuf -= VecByte3A;
+      vop = vbuf *= VecByte3A;
+      vop = vbuf /= VecByte3A;
+      vop = vbuf %= VecByte3A;
+
+      // operatorOP= for 2 swizzle
+      swlo += swhi;
+      swlo -= swhi;
+      vec_test = swlo;
+      assert(vec_test.x() == std::byte{5} && vec_test.y() == std::byte{6});
+      swlo *= swhi;
+      swlo /= swhi;
+      swlo %= swhi;
+
+      // operatorOP= for 1 vec
+      vop = VecByte3A += std::byte{3};
+      assert(vop[0] == std::byte{7});
+      vop = VecByte3A -= std::byte{3};
+      vop = VecByte3A *= std::byte{3};
+      vop = VecByte3A /= std::byte{3};
+      vop = VecByte3A %= std::byte{3};
+
+      // operatorOP= for 1 swizzle
+      swlo += std::byte{3};
+      swlo -= std::byte{1};
+      vec_test = swlo;
+      assert(vec_test.x() == std::byte{3} && vec_test.y() == std::byte{2});
+      swlo *= std::byte{3};
+      swlo /= std::byte{3};
+      swlo %= std::byte{3};
+
+      // unary operator++ and -- for vec
+      VecByte3A =
+          sycl::vec<std::byte, 3>(std::byte{4}, std::byte{9}, std::byte{25});
+      VecByte3A++;
+      VecByte3A--;
+      vop = ++VecByte3A;
+      assert(vop[2] == std::byte{26});
+      --VecByte3A;
+
+      // unary operator++ and -- for swizzle
+      swlo++;
+      swlo--;
+      vec_test = swlo;
+      assert(vec_test.x() == std::byte{0} && vec_test.y() == std::byte{2});
+    }
+
+    // Logical operations on vec<byte> and swizzles.
+    {
+      // condition op for 2 vec
+      auto vres = VecByte3A == VecByte3B;
+      vres = VecByte3A != VecByte3B;
+      vres = VecByte3A > VecByte3B;
+      vres = VecByte3A < VecByte3B;
+      vres = VecByte3A >= VecByte3B;
+      vres = VecByte3A <= VecByte3B;
+
+      vres = VecByte3A == std::byte{3};
+      vres = VecByte3A != std::byte{3};
+      vres = VecByte3A > std::byte{3};
+      vres = VecByte3A < std::byte{3};
+      vres = VecByte3A >= std::byte{3};
+      vres = VecByte3A <= std::byte{3};
+
+      vres = std::byte{3} == VecByte3A;
+      vres = std::byte{3} != VecByte3A;
+      vres = std::byte{3} > VecByte3A;
+      vres = std::byte{3} < VecByte3A;
+      vres = std::byte{3} >= VecByte3A;
+      vres = std::byte{3} <= VecByte3A;
+
+      auto swlo = VecByte4A.lo();
+      auto swhi = VecByte4A.hi();
+
+      // condition op for 2 swizzle
+      auto swres = swhi == swlo;
+      auto swres1 = swhi != swlo;
+      auto swres2 = swhi > swlo;
+      auto swres3 = swhi < swlo;
+      auto swres4 = swhi >= swlo;
+      auto swres5 = swhi <= swlo;
+      auto swres6 = swhi == std::byte{3};
+      auto swres7 = swhi != std::byte{3};
+      auto swres8 = swhi > std::byte{3};
+      auto swres9 = swhi < std::byte{3};
+      auto swres10 = swhi >= std::byte{3};
+      auto swres11 = swhi <= std::byte{3};
+
+      // logical (&&, ||, !) and shift (>>, <<) operations
+      auto vop = VecByte3A && VecByte3B;
+      vop = VecByte3A || VecByte3B;
+
+      auto vop1 = VecByte3A >> VecByte3B;
+      vop1 = VecByte3A << VecByte3B;
+
+      vop1 = VecByte3A >> std::byte{3};
+      vop1 = VecByte3A << std::byte{3};
+      vop1 = std::byte{3} >> VecByte3A;
+      vop1 = std::byte{3} << VecByte3A;
+
+      swlo >> swhi;
+      swlo << swhi;
+      swlo >> std::byte{3};
+      swlo << std::byte{3};
+      auto right = std::byte{3} >> swhi;
+      auto left = std::byte{3} << swhi;
+
+      auto bitv2 = !VecByte4A;
+    }
+#else
+    {
+      // std::byte is not an arithmetic type and it only supports the following
+      // overloads of >> and << operators.
+      //
+      // 1 template <class IntegerType>
+      //   constexpr std::byte operator<<( std::byte b, IntegerType shift )
+      //   noexcept;
+      // 2 template <class IntegerType>
+      //   constexpr std::byte operator>>( std::byte b, IntegerType shift )
+      //   noexcept;
+      auto VecByte3Shift = VecByte3A << 3;
+      assert(VecByte3Shift[0] == VecByte3A[0] << 3 &&
+             VecByte3Shift[1] == VecByte3A[1] << 3 &&
+             VecByte3Shift[2] == VecByte3A[2] << 3);
+
+      VecByte3Shift = VecByte3A >> 1;
+      assert(VecByte3Shift[0] == VecByte3A[0] >> 1 &&
+             VecByte3Shift[1] == VecByte3A[1] >> 1 &&
+             VecByte3Shift[2] == VecByte3A[2] >> 1);
+
+      auto SwizByte3Shift = VecByte4A.lo();
+      SwizByte3Shift >> 3;
+      SwizByte3Shift << 3;
+    }
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
   }
 
   return 0;
diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp
index 2ba7ff130bf72..e07e2548a840c 100644
--- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp
+++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp
@@ -2,9 +2,6 @@
 // RUN: %{build} -fpreview-breaking-changes -o %t.out
 // RUN: %{run} %t.out
 
-// This test currently fails on AMD HIP due to an unresolved memcmp function.
-// XFAIL: hip_amd
-
 // Checks scalar/vec relational operator ordering.
 
 #include "vec_binary_scalar_order.hpp"
diff --git a/sycl/test/abi/layout_vec.cpp b/sycl/test/abi/layout_vec.cpp
index 8f70f2835bf72..3357641ca607a 100644
--- a/sycl/test/abi/layout_vec.cpp
+++ b/sycl/test/abi/layout_vec.cpp
@@ -1,5 +1,8 @@
 // RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s
 // RUN: %clangxx -fsycl -fsycl-device-only -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s
+// RUN: %if preview-breaking-changes-supported %{ %clangxx -fsycl -c -fpreview-breaking-changes -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s -check-prefix FSYCL-PREVIEW-BREAKING-CHANGES-CHECK %}
+// RUN: %if preview-breaking-changes-supported %{ %clangxx -fsycl -fsycl-device-only -c -fpreview-breaking-changes -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s -check-prefix FSYCL-PREVIEW-BREAKING-CHANGES-CHECK %}
+
 // REQUIRES: linux
 // UNSUPPORTED: libcxx
 
@@ -14,6 +17,12 @@ SYCL_EXTERNAL void foo(sycl::vec<int, 4>) {}
 // CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
 // CHECK-NEXT: |  nvsize=16, nvalign=16]
 
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK: 0 | class sycl::vec<int, 4>
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 | struct std::array<int, 4> m_Data
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 | typename _AT_Type::_Type _M_elems
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: |  nvsize=16, nvalign=16]
+
 //--------------------------------------
 
 SYCL_EXTERNAL void foo(sycl::vec<bool, 16>) {}
@@ -22,3 +31,9 @@ SYCL_EXTERNAL void foo(sycl::vec<bool, 16>) {}
 // CHECK-NEXT: 0 |   DataType m_Data
 // CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
 // CHECK-NEXT: |  nvsize=16, nvalign=16]
+
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK: 0 | class sycl::vec<_Bool, 16>
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 | struct std::array<_Bool, 16> m_Data
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 | typename _AT_Type::_Type _M_elems
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: |  nvsize=16, nvalign=16]
diff --git a/sycl/test/check_device_code/vector/vector_math_ops.cpp b/sycl/test/check_device_code/vector/vector_math_ops.cpp
index 5d6521d725341..187641fa508cf 100644
--- a/sycl/test/check_device_code/vector/vector_math_ops.cpp
+++ b/sycl/test/check_device_code/vector/vector_math_ops.cpp
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // NOTE: ..., followed by some manual cleanup.
 
-// RUN: %clangxx -I %sycl_include -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -fsycl-device-only %s -o - | FileCheck %s
+// RUN: %clangxx -I %sycl_include -fpreview-breaking-changes -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -fsycl-device-only %s -o - | FileCheck %s
 
 // This test checks
 // (1) the storage type of sycl::vec on device for all data types, and
@@ -28,17 +28,17 @@ SYCL_EXTERNAL auto TestAdd(vec<int, 2> a, vec<int, 2> b) { return a + b; }
 // CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.0") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META13:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x float>, ptr [[A]], align 16, !noalias [[META14]]
-// CHECK-NEXT:    [[LOADVEC42_I:%.*]] = load <4 x float>, ptr [[B]], align 16, !noalias [[META14]]
-// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x float> [[LOADVEC4_I]], [[LOADVEC42_I]]
-// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META14]]
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x float>, ptr [[A]], align 16, !noalias [[META14]]
+// CHECK-NEXT:    [[LOADVEC4_I6_I:%.*]] = load <4 x float>, ptr [[B]], align 16, !noalias [[META14]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x float> [[LOADVEC4_I_I]], [[LOADVEC4_I6_I]]
+// CHECK-NEXT:    [[EXTRACTVEC_I8_I:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC_I8_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META14]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<float, 3> a, vec<float, 3> b) { return a + b; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIcLi16EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.1") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.1") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.1") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META17:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.2") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META17:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
 // CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META18]]
@@ -51,7 +51,7 @@ SYCL_EXTERNAL auto TestAdd(vec<char, 16> a, vec<char, 16> b) { return a + b; }
 
 // std::byte does not support '+'. Therefore, using bitwise XOR as a substitute.
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestXorN4sycl3_V13vecISt4byteLi8EEES3_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.2") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META21:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.4") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META21:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]])
 // CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 8, !tbaa [[TBAA10]], !noalias [[META22]]
@@ -65,92 +65,72 @@ SYCL_EXTERNAL auto TestXor(vec<std::byte, 8> a, vec<std::byte, 8> b) {
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIbLi4EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.3") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !srcloc [[META25:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable sret(%"class.sycl::_V1::vec.6") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !srcloc [[META25:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META26:![0-9]+]])
 // CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META26]]
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[B]], align 4, !tbaa [[TBAA10]], !noalias [[META26]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    store <4 x i8> [[ADD_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !tbaa [[TBAA10]], !alias.scope [[META26]]
 // CHECK-NEXT:    br label [[FOR_COND_I_I:%.*]]
 // CHECK:       for.cond.i.i:
-// CHECK-NEXT:    [[VECINS_I_I6_I_I:%.*]] = phi <4 x i8> [ [[ADD_I]], [[ENTRY:%.*]] ], [ [[VECINS_I_I_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ]
-// CHECK-NEXT:    [[I_0_I_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I]] ]
+// CHECK-NEXT:    [[I_0_I_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp ult i64 [[I_0_I_I]], 4
-// CHECK-NEXT:    br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V1PLERKNS0_3VECIBLI4EEES4__EXIT:%.*]]
+// CHECK-NEXT:    br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V1PLIBEENST9ENABLE_IFIXNTSR6DETAILE9IS_BYTE_VIT_EENS0_3VECIBLI4EEEE4TYPEERKS5_S9__EXIT:%.*]]
 // CHECK:       for.body.i.i:
-// CHECK-NEXT:    [[CONV_I_I:%.*]] = trunc nuw nsw i64 [[I_0_I_I]] to i32
-// CHECK-NEXT:    [[VECEXT_I_I_I_I:%.*]] = extractelement <4 x i8> [[VECINS_I_I6_I_I]], i32 [[CONV_I_I]]
-// CHECK-NEXT:    [[TOBOOL_I_I_I_I:%.*]] = icmp ne i8 [[VECEXT_I_I_I_I]], 0
-// CHECK-NEXT:    [[FROMBOOL_I_I:%.*]] = zext i1 [[TOBOOL_I_I_I_I]] to i8
-// CHECK-NEXT:    [[VECINS_I_I_I_I]] = insertelement <4 x i8> [[VECINS_I_I6_I_I]], i8 [[FROMBOOL_I_I]], i32 [[CONV_I_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX_I_I_I_I]], align 1, !tbaa [[TBAA10]], !alias.scope [[META26]]
+// CHECK-NEXT:    [[CMP3_I_I:%.*]] = icmp ne i8 [[TMP2]], 0
+// CHECK-NEXT:    [[FROMBOOL_I_I:%.*]] = zext i1 [[CMP3_I_I]] to i8
+// CHECK-NEXT:    store i8 [[FROMBOOL_I_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I_I]], align 1, !tbaa [[TBAA29:![0-9]+]], !alias.scope [[META26]]
 // CHECK-NEXT:    [[INC_I_I]] = add nuw nsw i64 [[I_0_I_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I_I]], !llvm.loop [[LOOP29:![0-9]+]]
-// CHECK:       _ZN4sycl3_V1plERKNS0_3vecIbLi4EEES4_.exit:
-// CHECK-NEXT:    store <4 x i8> [[VECINS_I_I6_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META26]]
+// CHECK-NEXT:    br label [[FOR_COND_I_I]], !llvm.loop [[LOOP31:![0-9]+]]
+// CHECK:       _ZN4sycl3_V1plIbEENSt9enable_ifIXntsr6detailE9is_byte_vIT_EENS0_3vecIbLi4EEEE4typeERKS5_S9_.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<bool, 4> a, vec<bool, 4> b) { return a + b; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecINS0_6detail9half_impl4halfELi3EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.4") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META31:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.8") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META33:![0-9]+]] !sycl_used_aspects [[META34:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x half>, ptr [[A]], align 8, !noalias [[META32]]
-// CHECK-NEXT:    [[LOADVEC42_I:%.*]] = load <4 x half>, ptr [[B]], align 8, !noalias [[META32]]
-// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x half> [[LOADVEC4_I]], [[LOADVEC42_I]]
-// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x half> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA10]], !alias.scope [[META32]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META35:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x half>, ptr [[A]], align 8, !noalias [[META35]]
+// CHECK-NEXT:    [[LOADVEC4_I6_I:%.*]] = load <4 x half>, ptr [[B]], align 8, !noalias [[META35]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x half> [[LOADVEC4_I_I]], [[LOADVEC4_I6_I]]
+// CHECK-NEXT:    [[EXTRACTVEC_I8_I:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x half> [[EXTRACTVEC_I8_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA10]], !alias.scope [[META35]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<half, 3> a, vec<half, 3> b) { return a + b; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable sret(%"class.sycl::_V1::vec.5") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] !srcloc [[META35:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.10") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.10") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.10") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] !srcloc [[META38:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[REF_TMP1_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    [[REF_TMP3_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META36:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[REF_TMP3_I]])
-// CHECK-NEXT:    [[REF_TMP1_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP1_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[REF_TMP3_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP3_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    [[B_ASCAST:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META39:![0-9]+]])
 // CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[AGG_RESULT_PROMOTED_I:%.*]] = load <3 x i16>, ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META36]]
-// CHECK-NEXT:    [[LOADVEC4_I_I_I:%.*]] = load <4 x i16>, ptr [[A]], align 8, !noalias [[META39:![0-9]+]]
-// CHECK-NEXT:    [[EXTRACTVEC_I_I_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[LOADVEC4_I_I9_I:%.*]] = load <4 x i16>, ptr [[B]], align 8, !noalias [[META44:![0-9]+]]
-// CHECK-NEXT:    [[EXTRACTVEC_I_I10_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I9_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
 // CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
 // CHECK:       for.cond.i:
-// CHECK-NEXT:    [[VECINS_I_I12_I:%.*]] = phi <3 x i16> [ [[AGG_RESULT_PROMOTED_I]], [[ENTRY:%.*]] ], [ [[VECINS_I_I_I:%.*]], [[FOR_BODY_I:%.*]] ]
-// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY_I]] ]
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3
-// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1PLERKNS0_3VECINS0_3EXT6ONEAPI8BFLOAT16ELI3EEES7__EXIT:%.*]]
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1PLINS0_3EXT6ONEAPI8BFLOAT16EEENST9ENABLE_IFIXNTSR6DETAILE9IS_BYTE_VIT_EENS0_3VECIS4_LI3EEEE4TYPEERKS8_SC__EXIT:%.*]]
 // CHECK:       for.body.i:
-// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META49:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META50:![0-9]+]])
-// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <3 x i16> [[EXTRACTVEC_I_I_I]], i32 [[CONV_I]]
-// CHECK-NEXT:    store i16 [[VECEXT_I_I_I]], ptr [[REF_TMP1_I]], align 2, !alias.scope [[META51:![0-9]+]], !noalias [[META36]]
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META56:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META57:![0-9]+]])
-// CHECK-NEXT:    [[VECEXT_I_I11_I:%.*]] = extractelement <3 x i16> [[EXTRACTVEC_I_I10_I]], i32 [[CONV_I]]
-// CHECK-NEXT:    store i16 [[VECEXT_I_I11_I]], ptr [[REF_TMP3_I]], align 2, !alias.scope [[META58:![0-9]+]], !noalias [[META36]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META36]]
-// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[REF_TMP1_ASCAST_I]]) #[[ATTR9:[0-9]+]], !noalias [[META63:![0-9]+]]
-// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[REF_TMP3_ASCAST_I]]) #[[ATTR9]], !noalias [[META63]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I10_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[B_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8:[0-9]+]], !noalias [[META42:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I10_I]]) #[[ATTR8]], !noalias [[META42]]
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = fadd float [[CALL_I_I_I_I]], [[CALL_I_I2_I_I]]
-// CHECK-NEXT:    store float [[ADD_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA66:![0-9]+]], !noalias [[META63]]
-// CHECK-NEXT:    [[CALL_I_I3_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR9]], !noalias [[META63]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META36]]
-// CHECK-NEXT:    [[VECINS_I_I_I]] = insertelement <3 x i16> [[VECINS_I_I12_I]], i16 [[CALL_I_I3_I_I]], i32 [[CONV_I]]
+// CHECK-NEXT:    store float [[ADD_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45:![0-9]+]], !noalias [[META42]]
+// CHECK-NEXT:    [[CALL_I_I3_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I12_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CALL_I_I3_I_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I12_I]], align 2, !tbaa [[TBAA47:![0-9]+]], !alias.scope [[META39]]
 // CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP68:![0-9]+]]
-// CHECK:       _ZN4sycl3_V1plERKNS0_3vecINS0_3ext6oneapi8bfloat16ELi3EEES7_.exit:
-// CHECK-NEXT:    store <3 x i16> [[VECINS_I_I12_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META36]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[REF_TMP3_I]])
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP49:![0-9]+]]
+// CHECK:       _ZN4sycl3_V1plINS0_3ext6oneapi8bfloat16EEENSt9enable_ifIXntsr6detailE9is_byte_vIT_EENS0_3vecIS4_Li3EEEE4typeERKS8_SC_.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<ext::oneapi::bfloat16, 3> a,
@@ -161,46 +141,43 @@ SYCL_EXTERNAL auto TestAdd(vec<ext::oneapi::bfloat16, 3> a,
 /***************** Binary Logical Ops *******************/
 
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecIiLi16EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.6") align 64 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 64 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 64 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META69:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.12") align 64 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 64 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 64 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META50:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META70:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[A]], align 64, !tbaa [[TBAA10]], !noalias [[META70]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr [[B]], align 64, !tbaa [[TBAA10]], !noalias [[META70]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META51:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[A]], align 64, !tbaa [[TBAA10]], !noalias [[META51]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr [[B]], align 64, !tbaa [[TBAA10]], !noalias [[META51]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt <16 x i32> [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i32>
-// CHECK-NEXT:    store <16 x i32> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 64, !tbaa [[TBAA10]], !alias.scope [[META70]]
+// CHECK-NEXT:    store <16 x i32> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 64, !tbaa [[TBAA10]], !alias.scope [[META51]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<int, 16> a, vec<int, 16> b) {
   return a > b;
 }
 
-// CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecISt4byteLi3EEES3_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.7") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META73:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-LABEL: define dso_local spir_func noundef <3 x i8> @_Z15TestGreaterThanN4sycl3_V13vecISt4byteLi3EEES3_(
+// CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.14") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.14") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] !srcloc [[META54:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META74:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x i8>, ptr [[A]], align 4, !noalias [[META74]]
-// CHECK-NEXT:    [[LOADVEC42_I:%.*]] = load <4 x i8>, ptr [[B]], align 4, !noalias [[META74]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt <4 x i8> [[LOADVEC4_I]], [[LOADVEC42_I]]
-// CHECK-NEXT:    [[CMP_I:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <3 x i1> [[CMP_I]] to <3 x i8>
-// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <3 x i8> [[SEXT_I]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x i8> [[EXTRACTVEC_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !tbaa [[TBAA10]], !alias.scope [[META74]]
-// CHECK-NEXT:    ret void
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i8>, ptr [[A]], align 1
+// CHECK-NEXT:    [[LOADVEC4_I_I2:%.*]] = load <4 x i8>, ptr [[B]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt <4 x i8> [[LOADVEC4_I_I]], [[LOADVEC4_I_I2]]
+// CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[SEXT:%.*]] = sext <3 x i1> [[CMP]] to <3 x i8>
+// CHECK-NEXT:    ret <3 x i8> [[SEXT]]
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<std::byte, 3> a, vec<std::byte, 3> b) {
   return a > b;
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecIbLi2EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.9") align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.10") align 2 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.10") align 2 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META77:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.16") align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.18") align 2 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.18") align 2 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META55:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META78:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[A]], align 2, !tbaa [[TBAA10]], !noalias [[META78]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[B]], align 2, !tbaa [[TBAA10]], !noalias [[META78]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META56:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[A]], align 2, !tbaa [[TBAA10]], !noalias [[META56]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[B]], align 2, !tbaa [[TBAA10]], !noalias [[META56]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt <2 x i8> [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i8>
-// CHECK-NEXT:    store <2 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 2, !tbaa [[TBAA10]], !alias.scope [[META78]]
+// CHECK-NEXT:    store <2 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 2, !tbaa [[TBAA10]], !alias.scope [[META56]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<bool, 2> a, vec<bool, 2> b) {
@@ -208,32 +185,44 @@ SYCL_EXTERNAL auto TestGreaterThan(vec<bool, 2> a, vec<bool, 2> b) {
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecINS0_6detail9half_impl4halfELi8EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.11") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META81:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.20") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.22") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.22") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META59:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META82:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META82]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B]], align 16, !tbaa [[TBAA10]], !noalias [[META82]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META60]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B]], align 16, !tbaa [[TBAA10]], !noalias [[META60]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = fcmp ogt <8 x half> [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK-NEXT:    store <8 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META82]]
+// CHECK-NEXT:    store <8 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META60]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<half, 8> a, vec<half, 8> b) {
   return a > b;
 }
 
-// FIXME: We incorrectly interpret BF16 as INT16 to peform logical operation.
-// For example, vec<BF16, 2>{-0.5, 3.333} < vec<BF16, 2>{6.0, 6.666} results
-// into {-1, -1} on host but {0, -1} on device.
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.13") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.14") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.14") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META85:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.24") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.26") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.26") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META63:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META86:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 8, !tbaa [[TBAA10]], !noalias [[META86]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[B]], align 8, !tbaa [[TBAA10]], !noalias [[META86]]
-// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ugt <4 x i16> [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK-NEXT:    store <4 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA10]], !alias.scope [[META86]]
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    [[B_ASCAST:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META64:![0-9]+]])
+// CHECK-NEXT:    store i64 0, ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META64]]
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 4
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1GTINS0_3EXT6ONEAPI8BFLOAT16EEENST9ENABLE_IFIXNTSR6DETAILE9IS_BYTE_VIT_EENS0_3VECISLI4EEEE4TYPEERKNS7_IS4_LI4EEESD__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I13_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[B_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8]], !noalias [[META64]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I13_I]]) #[[ATTR8]], !noalias [[META64]]
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = fcmp ogt float [[CALL_I_I_I_I]], [[CALL_I_I2_I_I]]
+// CHECK-NEXT:    [[CONV5_I:%.*]] = sext i1 [[CMP_I_I]] to i16
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I15_I:%.*]] = getelementptr inbounds [4 x i16], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CONV5_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I15_I]], align 2, !tbaa [[TBAA47]], !alias.scope [[META64]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP67:![0-9]+]]
+// CHECK:       _ZN4sycl3_V1gtINS0_3ext6oneapi8bfloat16EEENSt9enable_ifIXntsr6detailE9is_byte_vIT_EENS0_3vecIsLi4EEEE4typeERKNS7_IS4_Li4EEESD_.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<ext::oneapi::bfloat16, 4> a,
@@ -244,144 +233,128 @@ SYCL_EXTERNAL auto TestGreaterThan(vec<ext::oneapi::bfloat16, 4> a,
 /********************** Unary Ops **********************/
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecIiLi3EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.15") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.15") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META89:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.27") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.27") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META68:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META90:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x i32>, ptr [[A]], align 16, !noalias [[META90]]
-// CHECK-NEXT:    [[EXTRACTVEC_I:%.*]] = shufflevector <4 x i32> [[LOADVEC4_I]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq <3 x i32> [[EXTRACTVEC_I]], zeroinitializer
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META69:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i32>, ptr [[A]], align 16, !noalias [[META69]]
+// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <4 x i32> [[LOADVEC4_I_I]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq <3 x i32> [[EXTRACTVEC_I_I]], zeroinitializer
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <3 x i1> [[CMP_I]] to <3 x i32>
-// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <3 x i32> [[SEXT_I]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META90]]
+// CHECK-NEXT:    [[EXTRACTVEC_I2_I:%.*]] = shufflevector <3 x i32> [[SEXT_I]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC_I2_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META69]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<int, 3> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecIiLi4EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.16") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.16") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META93:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.29") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.29") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META72:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META94:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META94]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META73:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META73]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[TMP0]]
-// CHECK-NEXT:    store <4 x i32> [[SUB_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META94]]
+// CHECK-NEXT:    store <4 x i32> [[SUB_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META73]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestMinus(vec<int, 4> a) { return -a; }
 
 // Negation is not valid for std::byte. Therefore, using bitwise negation.
 // CHECK-LABEL: define dso_local spir_func void @_Z19TestBitwiseNegationN4sycl3_V13vecISt4byteLi16EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.17") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.17") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META97:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.30") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.30") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META76:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META98:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META98]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META77:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META77]]
 // CHECK-NEXT:    [[NOT_I:%.*]] = xor <16 x i8> [[TMP0]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK-NEXT:    store <16 x i8> [[NOT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META98]]
+// CHECK-NEXT:    store <16 x i8> [[NOT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META77]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestBitwiseNegation(vec<std::byte, 16> a) { return ~a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecIbLi4EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.18") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META101:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.32") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META80:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META102:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META102]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META81:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META81]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i8>
-// CHECK-NEXT:    store <4 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META105:![0-9]+]]
+// CHECK-NEXT:    store <4 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !tbaa [[TBAA10]], !alias.scope [[META81]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<bool, 4> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecINS0_6detail9half_impl4halfELi2EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.19") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.20") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META108:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.34") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.36") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META84:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META109:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META109]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META85:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META85]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = fcmp oeq <2 x half> [[TMP0]], zeroinitializer
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i16>
-// CHECK-NEXT:    store <2 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META112:![0-9]+]]
+// CHECK-NEXT:    store <2 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !tbaa [[TBAA10]], !alias.scope [[META85]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<half, 2> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecINS0_6detail9half_impl4halfELi8EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.12") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META115:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.22") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.22") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META88:![0-9]+]] !sycl_used_aspects [[META34]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META116:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META116]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META89:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META89]]
 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP0]]
-// CHECK-NEXT:    store <8 x half> [[FNEG_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META116]]
+// CHECK-NEXT:    store <8 x half> [[FNEG_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META89]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestMinus(vec<half, 8> a) { return -a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.21") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META119:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.38") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.10") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META92:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[REF_TMP1_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[REF_TMP2_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META120:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[REF_TMP2_I]])
-// CHECK-NEXT:    [[REF_TMP1_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP1_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[REF_TMP2_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP2_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[LOADVEC4_I_I_I:%.*]] = load <4 x i16>, ptr [[A]], align 8, !noalias [[META123:![0-9]+]]
-// CHECK-NEXT:    [[EXTRACTVEC_I_I_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META93:![0-9]+]])
+// CHECK-NEXT:    store i64 0, ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META93]]
 // CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
 // CHECK:       for.cond.i:
-// CHECK-NEXT:    [[RET_SROA_0_0_I:%.*]] = phi <3 x i16> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[VECINS_I_I_I:%.*]], [[FOR_BODY_I:%.*]] ]
-// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY_I]] ]
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3
-// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1NTERKNS0_3VECINS0_3EXT6ONEAPI8BFLOAT16ELI3EEE_EXIT:%.*]]
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1NTINS0_3EXT6ONEAPI8BFLOAT16EEENST9ENABLE_IFIXNTSR6DETAILE9IS_BYTE_VIT_EENS0_3VECISLI3EEEE4TYPEERKNS7_IS4_LI3EEE_EXIT:%.*]]
 // CHECK:       for.body.i:
-// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
-// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <3 x i16> [[EXTRACTVEC_I_I_I]], i32 [[CONV_I]]
-// CHECK-NEXT:    store i16 [[VECEXT_I_I_I]], ptr [[REF_TMP2_I]], align 2, !tbaa [[TBAA128:![0-9]+]], !alias.scope [[META130:![0-9]+]], !noalias [[META120]]
-// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[REF_TMP2_ASCAST_I]]) #[[ATTR9]], !noalias [[META120]]
-// CHECK-NEXT:    [[CMP_I_I:%.*]] = fcmp oeq float [[CALL_I_I_I]], 0.000000e+00
-// CHECK-NEXT:    [[CONV4_I:%.*]] = uitofp i1 [[CMP_I_I]] to float
-// CHECK-NEXT:    store float [[CONV4_I]], ptr [[REF_TMP1_I]], align 4, !tbaa [[TBAA66]], !noalias [[META120]]
-// CHECK-NEXT:    [[CALL_I_I9_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP1_ASCAST_I]]) #[[ATTR9]], !noalias [[META120]]
-// CHECK-NEXT:    [[VECINS_I_I_I]] = insertelement <3 x i16> [[RET_SROA_0_0_I]], i16 [[CALL_I_I9_I]], i32 [[CONV_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8]], !noalias [[META93]]
+// CHECK-NEXT:    [[TOBOOL_I:%.*]] = fcmp oeq float [[CALL_I_I_I]], 0.000000e+00
+// CHECK-NEXT:    [[CONV3_I:%.*]] = sext i1 [[TOBOOL_I]] to i16
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I10_I:%.*]] = getelementptr inbounds [4 x i16], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CONV3_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I10_I]], align 2, !tbaa [[TBAA47]], !alias.scope [[META93]]
 // CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP133:![0-9]+]]
-// CHECK:       _ZN4sycl3_V1ntERKNS0_3vecINS0_3ext6oneapi8bfloat16ELi3EEE.exit:
-// CHECK-NEXT:    store <3 x i16> [[RET_SROA_0_0_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META134:![0-9]+]]
-// CHECK-NEXT:    [[AGG_RESULT_SROA_IDX_I:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[AGG_RESULT]], i64 6
-// CHECK-NEXT:    store i16 0, ptr addrspace(4) [[AGG_RESULT_SROA_IDX_I]], align 2, !alias.scope [[META134]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[REF_TMP2_I]])
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP96:![0-9]+]]
+// CHECK:       _ZN4sycl3_V1ntINS0_3ext6oneapi8bfloat16EEENSt9enable_ifIXntsr6detailE9is_byte_vIT_EENS0_3vecIsLi3EEEE4typeERKNS7_IS4_Li3EEE.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<ext::oneapi::bfloat16, 3> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.22") align 32 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.22") align 32 [[A:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] !srcloc [[META137:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.39") align 32 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.39") align 32 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META97:![0-9]+]] !sycl_fixed_targets [[META6]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[V_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META138:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[V_I]])
-// CHECK-NEXT:    [[V_ASCAST_I:%.*]] = addrspacecast ptr [[V_I]] to ptr addrspace(4)
-// CHECK-NEXT:    tail call void @llvm.memset.p4.i64(ptr addrspace(4) noundef align 32 dereferenceable(32) [[AGG_RESULT]], i8 0, i64 32, i1 false), !alias.scope [[META138]]
-// CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[REF_TMP_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META98:![0-9]+]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I]])
+// CHECK-NEXT:    [[REF_TMP_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP_I]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.memset.p4.i64(ptr addrspace(4) noundef align 32 dereferenceable(32) [[AGG_RESULT]], i8 0, i64 32, i1 false), !alias.scope [[META98]]
 // CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
 // CHECK:       for.cond.i:
-// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_COND_I]] ]
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 16
-// CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i16>, ptr [[A]], i64 0, i64 [[I_0_I]]
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = load i16, ptr [[TMP0]], align 2, !noalias [[META138]]
-// CHECK-NEXT:    store i16 [[VECEXT_I]], ptr [[V_I]], align 2, !tbaa [[TBAA141:![0-9]+]], !alias.scope [[META143:![0-9]+]], !noalias [[META138]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META138]]
-// CHECK-NEXT:    [[CALL_I_I:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[V_ASCAST_I]]) #[[ATTR9]], !noalias [[META146:![0-9]+]]
-// CHECK-NEXT:    [[FNEG_I_I:%.*]] = fneg float [[CALL_I_I]]
-// CHECK-NEXT:    store float [[FNEG_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA66]], !noalias [[META146]]
-// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR9]], !noalias [[META146]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META138]]
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <16 x i16>, ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
-// CHECK-NEXT:    store i16 [[CALL_I_I_I_I]], ptr addrspace(4) [[TMP1]], align 2, !alias.scope [[META138]]
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1NGINS0_3EXT6ONEAPI8BFLOAT16EEENST9ENABLE_IFIXNTSR6DETAILE9IS_BYTE_VIT_EENS0_3VECIS4_LI16EEEE4TYPEERKS8__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [16 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8]], !noalias [[META98]]
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg float [[CALL_I_I_I]]
+// CHECK-NEXT:    store float [[FNEG_I]], ptr [[REF_TMP_I]], align 4, !tbaa [[TBAA45]], !noalias [[META98]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I9_I:%.*]] = getelementptr inbounds [16 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I10_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I]]) #[[ATTR8]], !noalias [[META98]]
+// CHECK-NEXT:    store i16 [[CALL_I_I10_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I9_I]], align 2, !tbaa [[TBAA101:![0-9]+]], !alias.scope [[META98]]
 // CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP149:![0-9]+]]
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP103:![0-9]+]]
+// CHECK:       _ZN4sycl3_V1ngINS0_3ext6oneapi8bfloat16EEENSt9enable_ifIXntsr6detailE9is_byte_vIT_EENS0_3vecIS4_Li16EEEE4typeERKS8_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I]])
+// CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestMinus(vec<ext::oneapi::bfloat16, 16> a) { return -a; }