[clang][x86] Add constexpr support for some basic SSE1 intrinsics #111001
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

This is an initial patch to enable constexpr support on the more basic SSE1 intrinsics - such as initialization, arithmetic, logic and fixed shuffles. The plan is to incrementally extend this for SSE2/AVX etc. - initially for the equivalent basic intrinsics, but we can add support for some of the ia32 builtins as well as the need arises.

Full diff: https://github.com/llvm/llvm-project/pull/111001.diff

2 Files Affected:
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 9958e91bfceaa9..e755556f9cd319 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -48,6 +48,14 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
__min_vector_width__(128)))
#endif
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
+#endif
+
#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __zext128(x) \
@@ -75,9 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
/// of the lower 32 bits of both operands. The upper 96 bits are copied from
/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_add_ss(__m128 __a, __m128 __b) {
__a[0] += __b[0];
return __a;
}
@@ -95,9 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the sums of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_add_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a + (__v4sf)__b);
}
@@ -117,9 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
/// difference of the lower 32 bits of both operands. The upper 96 bits are
/// copied from the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_sub_ss(__m128 __a, __m128 __b) {
__a[0] -= __b[0];
return __a;
}
@@ -138,9 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the differences between
/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_sub_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a - (__v4sf)__b);
}
@@ -160,9 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
/// bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_ss(__m128 __a, __m128 __b) {
__a[0] *= __b[0];
return __a;
}
@@ -180,9 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the products of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a * (__v4sf)__b);
}
@@ -202,9 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
/// lower 32 bits of both operands. The upper 96 bits are copied from the
/// upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_div_ss(__m128 __a, __m128 __b) {
__a[0] /= __b[0];
return __a;
}
@@ -221,9 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing the divisor.
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_div_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a / (__v4sf)__b);
}
@@ -437,9 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b)
/// A 128-bit vector containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_and_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_and_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a & (__v4su)__b);
}
@@ -459,9 +458,8 @@ _mm_and_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// one's complement of the first operand and the values in the second
/// operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_andnot_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_andnot_ps(__m128 __a, __m128 __b) {
return (__m128)(~(__v4su)__a & (__v4su)__b);
}
@@ -477,9 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_or_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_or_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a | (__v4su)__b);
}
@@ -496,9 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
/// of the values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_xor_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_xor_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a ^ (__v4su)__b);
}
@@ -1738,9 +1734,8 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
/// used in the extraction.
/// \returns A 32-bit float containing the extracted value.
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm_cvtss_f32(__m128 __a)
-{
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cvtss_f32(__m128 __a) {
return __a[0];
}
@@ -1931,9 +1926,8 @@ _mm_undefined_ps(void)
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
/// lower 32 bits contain the value provided in the source operand. The
/// upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ss(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ss(float __w) {
return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
@@ -1949,9 +1943,8 @@ _mm_set_ss(float __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set1_ps(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set1_ps(float __w) {
return __extension__ (__m128){ __w, __w, __w, __w };
}
@@ -1968,9 +1961,8 @@ _mm_set1_ps(float __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps1(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ps1(float __w) {
return _mm_set1_ps(__w);
}
@@ -1995,9 +1987,8 @@ _mm_set_ps1(float __w)
/// A single-precision floating-point value used to initialize bits [31:0]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps(float __z, float __y, float __x, float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ps(float __z, float __y, float __x, float __w) {
return __extension__ (__m128){ __w, __x, __y, __z };
}
@@ -2023,9 +2014,8 @@ _mm_set_ps(float __z, float __y, float __x, float __w)
/// A single-precision floating-point value used to initialize bits [127:96]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setr_ps(float __z, float __y, float __x, float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_setr_ps(float __z, float __y, float __x, float __w) {
return __extension__ (__m128){ __z, __y, __x, __w };
}
@@ -2038,9 +2028,8 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
/// all elements set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setzero_ps(void)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_setzero_ps(void) {
return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
@@ -2786,9 +2775,8 @@ void _mm_setcsr(unsigned int __i);
/// Bits [95:64] are written to bits [63:32] of the destination. \n
/// Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpackhi_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpackhi_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
@@ -2808,9 +2796,8 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b)
/// Bits [31:0] are written to bits [63:32] of the destination. \n
/// Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpacklo_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpacklo_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
@@ -2830,9 +2817,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
/// written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_move_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_move_ss(__m128 __a, __m128 __b) {
__a[0] = __b[0];
return __a;
}
@@ -2852,9 +2838,8 @@ _mm_move_ss(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
/// written to the lower 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movehl_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movehl_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
@@ -2873,9 +2858,8 @@ _mm_movehl_ps(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
/// written to the upper 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movelh_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movelh_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 74f62e93c800b6..932d6f36b09b66 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -869,3 +869,86 @@ __m128 test_mm_xor_ps(__m128 A, __m128 B) {
// CHECK: xor <4 x i32>
return _mm_xor_ps(A, B);
}
+
+// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+
+void test_constexpr() {
+ constexpr __m128 k1 {+1.0f,+0.0f,+2.0f,+4.0f};
+ constexpr __m128 k2 {+8.0f,+4.0f,+2.0f,+1.0f};
+ constexpr __m128 k3 {-4.0f,-5.0f,+6.0f,+7.0f};
+ constexpr __m128 k4 {+0.0f,-0.0f,-0.0f,+0.0f};
+
+ constexpr __m128 v_mm_set_ss = _mm_set_ss(1.0f);
+ static_assert(v_mm_set_ss[0] == +1.0f && v_mm_set_ss[1] == +0.0f && v_mm_set_ss[2] == +0.0f && v_mm_set_ss[3] == +0.0f);
+
+ constexpr __m128 v_mm_set1_ps = _mm_set1_ps(2.0f);
+ static_assert(v_mm_set1_ps[0] == +2.0f && v_mm_set1_ps[1] == +2.0f && v_mm_set1_ps[2] == +2.0f && v_mm_set1_ps[3] == +2.0f);
+
+ constexpr __m128 v_mm_set_ps1 = _mm_set_ps1(-2.0f);
+ static_assert(v_mm_set_ps1[0] == -2.0f && v_mm_set_ps1[1] == -2.0f && v_mm_set_ps1[2] == -2.0f && v_mm_set_ps1[3] == -2.0f);
+
+ constexpr __m128 v_mm_set_ps = _mm_set_ps(+0.0f, +1.0f, +2.0f, +3.0f);
+ static_assert(v_mm_set_ps[0] == +3.0f && v_mm_set_ps[1] == +2.0f && v_mm_set_ps[2] == +1.0f && v_mm_set_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_setr_ps = _mm_setr_ps(+0.0f, +1.0f, +2.0f, +3.0f);
+ static_assert(v_mm_setr_ps[0] == +0.0f && v_mm_setr_ps[1] == +1.0f && v_mm_setr_ps[2] == +2.0f && v_mm_setr_ps[3] == +3.0f);
+
+ constexpr __m128 v_mm_setzero_ps = _mm_setzero_ps();
+ static_assert(v_mm_setzero_ps[0] == +0.0f && v_mm_setzero_ps[1] == +0.0f && v_mm_setzero_ps[2] == +0.0f && v_mm_setzero_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_add_ss = _mm_add_ss(k1, k2);
+ static_assert(v_mm_add_ss[0] == +9.0f && v_mm_add_ss[1] == +0.0f && v_mm_add_ss[2] == +2.0f && v_mm_add_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_add_ps = _mm_add_ps(k1, k2);
+ static_assert(v_mm_add_ps[0] == +9.0f && v_mm_add_ps[1] == +4.0f && v_mm_add_ps[2] == +4.0f && v_mm_add_ps[3] == +5.0f);
+
+ constexpr __m128 v_mm_sub_ss = _mm_sub_ss(k1, k2);
+ static_assert(v_mm_sub_ss[0] == -7.0f && v_mm_sub_ss[1] == +0.0f && v_mm_sub_ss[2] == +2.0f && v_mm_sub_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_sub_ps = _mm_sub_ps(k1, k2);
+ static_assert(v_mm_sub_ps[0] == -7.0f && v_mm_sub_ps[1] == -4.0f && v_mm_sub_ps[2] == +0.0f && v_mm_sub_ps[3] == +3.0f);
+
+ constexpr __m128 v_mm_mul_ss = _mm_mul_ss(k1, k2);
+ static_assert(v_mm_mul_ss[0] == +8.0f && v_mm_mul_ss[1] == +0.0f && v_mm_mul_ss[2] == +2.0f && v_mm_mul_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_mul_ps = _mm_mul_ps(k1, k2);
+ static_assert(v_mm_mul_ps[0] == +8.0f && v_mm_mul_ps[1] == +0.0f && v_mm_mul_ps[2] == +4.0f && v_mm_mul_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_div_ss = _mm_div_ss(k1, k2);
+ static_assert(v_mm_div_ss[0] == +0.125f && v_mm_div_ss[1] == +0.0f && v_mm_div_ss[2] == +2.0f && v_mm_div_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_div_ps = _mm_div_ps(k1, k2);
+ static_assert(v_mm_div_ps[0] == +0.125f && v_mm_div_ps[1] == +0.0f && v_mm_div_ps[2] == +1.0f && v_mm_div_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_and_ps = _mm_and_ps(k3, k4);
+ static_assert(v_mm_and_ps[0] == +0.0f && v_mm_and_ps[1] == +0.0f && v_mm_and_ps[2] == +0.0f && v_mm_and_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_andnot_ps = _mm_andnot_ps(k3, k4);
+ static_assert(v_mm_andnot_ps[0] == +0.0f && v_mm_andnot_ps[1] == +0.0f && v_mm_andnot_ps[2] == +0.0f && v_mm_andnot_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_or_ps = _mm_or_ps(k3, k4);
+ static_assert(v_mm_or_ps[0] == -4.0f && v_mm_or_ps[1] == -5.0f && v_mm_or_ps[2] == -6.0f && v_mm_or_ps[3] == +7.0f);
+
+ constexpr __m128 v_mm_xor_ps = _mm_xor_ps(k3, k4);
+ static_assert(v_mm_xor_ps[0] == -4.0f && v_mm_xor_ps[1] == +5.0f && v_mm_xor_ps[2] == -6.0f && v_mm_xor_ps[3] == +7.0f);
+
+ constexpr __m128 v_mm_unpackhi_ps = _mm_unpackhi_ps(k1, k2);
+ static_assert(v_mm_unpackhi_ps[0] == +2.0f && v_mm_unpackhi_ps[1] == +2.0f && v_mm_unpackhi_ps[2] == +4.0f && v_mm_unpackhi_ps[3] == +1.0f);
+
+ constexpr __m128 v_mm_unpacklo_ps = _mm_unpacklo_ps(k1, k2);
+ static_assert(v_mm_unpacklo_ps[0] == +1.0f && v_mm_unpacklo_ps[1] == +8.0f && v_mm_unpacklo_ps[2] == +0.0f && v_mm_unpacklo_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_move_ss = _mm_move_ss(k1, k2);
+ static_assert(v_mm_move_ss[0] == +8.0f && v_mm_move_ss[1] == +0.0f && v_mm_move_ss[2] == +2.0f && v_mm_move_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_movehl_ps = _mm_movehl_ps(k1, k2);
+ static_assert(v_mm_movehl_ps[0] == +2.0f && v_mm_movehl_ps[1] == +1.0f && v_mm_movehl_ps[2] == +2.0f && v_mm_movehl_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_movelh_ps = _mm_movelh_ps(k1, k2);
+ static_assert(v_mm_movelh_ps[0] == +1.0f && v_mm_movelh_ps[1] == +0.0f && v_mm_movelh_ps[2] == +8.0f && v_mm_movelh_ps[3] == +4.0f);
+
+ static_assert(_mm_cvtss_f32(k2) == +8.0f);
+}
+
+#endif
\ No newline at end of file
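As a quick illustration of what this enables on the consumer side, here is a minimal sketch (hypothetical user code, not part of the patch) of compile-time evaluation with these intrinsics in C++11 or later:

// Hypothetical consumer code: with this patch, the basic SSE1 intrinsics
// below can be evaluated entirely at compile time.
#include <xmmintrin.h>

constexpr __m128 half = _mm_set1_ps(0.5f);                  // {0.5, 0.5, 0.5, 0.5}
constexpr __m128 sum  = _mm_add_ps(half, _mm_set_ss(1.0f)); // {1.5, 0.5, 0.5, 0.5}
static_assert(_mm_cvtss_f32(sum) == 1.5f, "folded at compile time");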
You can test this locally with the following command:

git-clang-format --diff 02dd6b1014c708591fa0e8d46efb328c513a86e7 b57d4529615383767bce3714f8924355ff6924ac --extensions c,h -- clang/lib/Headers/xmmintrin.h clang/test/CodeGen/X86/sse-builtins.c

View the diff from clang-format here.

diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 2aa688adef..eb5033eec2 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -83,8 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
/// of the lower 32 bits of both operands. The upper 96 bits are copied from
/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_add_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ss(__m128 __a,
+ __m128 __b) {
__a[0] += __b[0];
return __a;
}
@@ -102,8 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the sums of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_add_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a + (__v4sf)__b);
}
@@ -123,8 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b) {
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
/// difference of the lower 32 bits of both operands. The upper 96 bits are
/// copied from the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_sub_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ss(__m128 __a,
+ __m128 __b) {
__a[0] -= __b[0];
return __a;
}
@@ -143,8 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the differences between
/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_sub_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a - (__v4sf)__b);
}
@@ -164,8 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b) {
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
/// bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_mul_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ss(__m128 __a,
+ __m128 __b) {
__a[0] *= __b[0];
return __a;
}
@@ -183,8 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the products of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_mul_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a * (__v4sf)__b);
}
@@ -204,8 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b) {
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
/// lower 32 bits of both operands. The upper 96 bits are copied from the
/// upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_div_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ss(__m128 __a,
+ __m128 __b) {
__a[0] /= __b[0];
return __a;
}
@@ -222,8 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing the divisor.
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_div_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a / (__v4sf)__b);
}
@@ -437,8 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b)
/// A 128-bit vector containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_and_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4su)__a & (__v4su)__b);
}
@@ -475,8 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_or_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4su)__a | (__v4su)__b);
}
@@ -493,8 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
/// of the values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_xor_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4su)__a ^ (__v4su)__b);
}
@@ -1734,8 +1734,7 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
/// used in the extraction.
/// \returns A 32-bit float containing the extracted value.
-static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_cvtss_f32(__m128 __a) {
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_f32(__m128 __a) {
return __a[0];
}
@@ -1926,8 +1925,7 @@ _mm_undefined_ps(void)
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
/// lower 32 bits contain the value provided in the source operand. The
/// upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set_ss(float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ss(float __w) {
return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
@@ -1943,8 +1941,7 @@ _mm_set_ss(float __w) {
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set1_ps(float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_ps(float __w) {
return __extension__ (__m128){ __w, __w, __w, __w };
}
@@ -1961,9 +1958,8 @@ _mm_set1_ps(float __w) {
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set_ps1(float __w) {
- return _mm_set1_ps(__w);
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps1(float __w) {
+ return _mm_set1_ps(__w);
}
/// Constructs a 128-bit floating-point vector of [4 x float]
@@ -1987,8 +1983,10 @@ _mm_set_ps1(float __w) {
/// A single-precision floating-point value used to initialize bits [31:0]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set_ps(float __z, float __y, float __x, float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps(float __z,
+ float __y,
+ float __x,
+ float __w) {
return __extension__ (__m128){ __w, __x, __y, __z };
}
@@ -2014,8 +2012,10 @@ _mm_set_ps(float __z, float __y, float __x, float __w) {
/// A single-precision floating-point value used to initialize bits [127:96]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_setr_ps(float __z, float __y, float __x, float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_ps(float __z,
+ float __y,
+ float __x,
+ float __w) {
return __extension__ (__m128){ __z, __y, __x, __w };
}
@@ -2028,8 +2028,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w) {
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
/// all elements set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_setzero_ps(void) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void) {
return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
@@ -2817,8 +2816,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b) {
/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
/// written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_move_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_ss(__m128 __a,
+ __m128 __b) {
__a[0] = __b[0];
return __a;
}
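A side note on the bitwise-logic tests above: they rely on IEEE-754 bit patterns, where -0.0f sets only the sign bit. A minimal sketch (illustrative, assuming the constexpr support added by this patch) of why the XOR expectations hold:

// Illustrative only: XOR with a mask whose lanes are -0.0f toggles just the
// sign bit of those lanes, leaving magnitudes intact. This mirrors the k3/k4
// values used in test_constexpr().
constexpr __m128 v    = _mm_setr_ps(-4.0f, -5.0f, +6.0f, +7.0f);
constexpr __m128 mask = _mm_setr_ps(+0.0f, -0.0f, -0.0f, +0.0f);
constexpr __m128 r    = _mm_xor_ps(v, mask);                // {-4, +5, -6, +7}
static_assert(_mm_cvtss_f32(r) == -4.0f, "lane 0: XOR with +0.0f is a no-op");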
Force-pushed from d3e47ff to 34f0bfa
LGTM.
This is an initial patch to enable constexpr support on the more basic SSE1 intrinsics - such as initialization, arithmetic, logic and fixed shuffles. The plan is to incrementally extend this for SSE2/AVX etc. - initially for the equivalent basic intrinsics, but we can add support for some of the ia32 builtins as well as the need arises.
Force-pushed from 34f0bfa to b57d452
…mm_cvtsi64_ss SSE1 intrinsics Followup to #111001
…mm_cvtsi64_ss SSE1 intrinsics Followup to llvm#111001