[clang][x86] Add constexpr support for some basic SSE1 intrinsics #111001
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

This is an initial patch to enable constexpr support on the more basic SSE1 intrinsics - such as initialization, arithmetic, logic and fixed shuffles. The plan is to incrementally extend this for SSE2/AVX etc. - initially for the equivalent basic intrinsics, but we can add support for some of the ia32 builtins as well as the need arises.

Full diff: https://github.com/llvm/llvm-project/pull/111001.diff

2 Files Affected:
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 9958e91bfceaa9..e755556f9cd319 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -48,6 +48,14 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
__min_vector_width__(128)))
#endif
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
+#endif
+
#define __trunc64(x) \
(__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
#define __zext128(x) \
@@ -75,9 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
/// of the lower 32 bits of both operands. The upper 96 bits are copied from
/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_add_ss(__m128 __a, __m128 __b) {
__a[0] += __b[0];
return __a;
}
@@ -95,9 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the sums of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_add_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a + (__v4sf)__b);
}
@@ -117,9 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
/// difference of the lower 32 bits of both operands. The upper 96 bits are
/// copied from the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_sub_ss(__m128 __a, __m128 __b) {
__a[0] -= __b[0];
return __a;
}
@@ -138,9 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the differences between
/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_sub_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a - (__v4sf)__b);
}
@@ -160,9 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
/// bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_ss(__m128 __a, __m128 __b) {
__a[0] *= __b[0];
return __a;
}
@@ -180,9 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the products of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_mul_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a * (__v4sf)__b);
}
@@ -202,9 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
/// lower 32 bits of both operands. The upper 96 bits are copied from the
/// upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_div_ss(__m128 __a, __m128 __b) {
__a[0] /= __b[0];
return __a;
}
@@ -221,9 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing the divisor.
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_div_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4sf)__a / (__v4sf)__b);
}
@@ -437,9 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b)
/// A 128-bit vector containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_and_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_and_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a & (__v4su)__b);
}
@@ -459,9 +458,8 @@ _mm_and_ps(__m128 __a, __m128 __b)
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// one's complement of the first operand and the values in the second
/// operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_andnot_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_andnot_ps(__m128 __a, __m128 __b) {
return (__m128)(~(__v4su)__a & (__v4su)__b);
}
@@ -477,9 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_or_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_or_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a | (__v4su)__b);
}
@@ -496,9 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b)
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
/// of the values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_xor_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_xor_ps(__m128 __a, __m128 __b) {
return (__m128)((__v4su)__a ^ (__v4su)__b);
}
@@ -1738,9 +1734,8 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
/// used in the extraction.
/// \returns A 32-bit float containing the extracted value.
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm_cvtss_f32(__m128 __a)
-{
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cvtss_f32(__m128 __a) {
return __a[0];
}
@@ -1931,9 +1926,8 @@ _mm_undefined_ps(void)
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
/// lower 32 bits contain the value provided in the source operand. The
/// upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ss(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ss(float __w) {
return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
@@ -1949,9 +1943,8 @@ _mm_set_ss(float __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set1_ps(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set1_ps(float __w) {
return __extension__ (__m128){ __w, __w, __w, __w };
}
@@ -1968,9 +1961,8 @@ _mm_set1_ps(float __w)
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps1(float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ps1(float __w) {
return _mm_set1_ps(__w);
}
@@ -1995,9 +1987,8 @@ _mm_set_ps1(float __w)
/// A single-precision floating-point value used to initialize bits [31:0]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps(float __z, float __y, float __x, float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_set_ps(float __z, float __y, float __x, float __w) {
return __extension__ (__m128){ __w, __x, __y, __z };
}
@@ -2023,9 +2014,8 @@ _mm_set_ps(float __z, float __y, float __x, float __w)
/// A single-precision floating-point value used to initialize bits [127:96]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setr_ps(float __z, float __y, float __x, float __w)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_setr_ps(float __z, float __y, float __x, float __w) {
return __extension__ (__m128){ __z, __y, __x, __w };
}
@@ -2038,9 +2028,8 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
/// all elements set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setzero_ps(void)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_setzero_ps(void) {
return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
@@ -2786,9 +2775,8 @@ void _mm_setcsr(unsigned int __i);
/// Bits [95:64] are written to bits [63:32] of the destination. \n
/// Bits [127:96] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpackhi_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpackhi_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
@@ -2808,9 +2796,8 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b)
/// Bits [31:0] are written to bits [63:32] of the destination. \n
/// Bits [63:32] are written to bits [127:96] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpacklo_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_unpacklo_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
@@ -2830,9 +2817,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
/// written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_move_ss(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_move_ss(__m128 __a, __m128 __b) {
__a[0] = __b[0];
return __a;
}
@@ -2852,9 +2838,8 @@ _mm_move_ss(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
/// written to the lower 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movehl_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movehl_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
@@ -2873,9 +2858,8 @@ _mm_movehl_ps(__m128 __a, __m128 __b)
/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
/// written to the upper 64 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movelh_ps(__m128 __a, __m128 __b)
-{
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movelh_ps(__m128 __a, __m128 __b) {
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 74f62e93c800b6..932d6f36b09b66 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -869,3 +869,86 @@ __m128 test_mm_xor_ps(__m128 A, __m128 B) {
// CHECK: xor <4 x i32>
return _mm_xor_ps(A, B);
}
+
+// Test constexpr handling.
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+
+void test_constexpr() {
+ constexpr __m128 k1 {+1.0f,+0.0f,+2.0f,+4.0f};
+ constexpr __m128 k2 {+8.0f,+4.0f,+2.0f,+1.0f};
+ constexpr __m128 k3 {-4.0f,-5.0f,+6.0f,+7.0f};
+ constexpr __m128 k4 {+0.0f,-0.0f,-0.0f,+0.0f};
+
+ constexpr __m128 v_mm_set_ss = _mm_set_ss(1.0f);
+ static_assert(v_mm_set_ss[0] == +1.0f && v_mm_set_ss[1] == +0.0f && v_mm_set_ss[2] == +0.0f && v_mm_set_ss[3] == +0.0f);
+
+ constexpr __m128 v_mm_set1_ps = _mm_set1_ps(2.0f);
+ static_assert(v_mm_set1_ps[0] == +2.0f && v_mm_set1_ps[1] == +2.0f && v_mm_set1_ps[2] == +2.0f && v_mm_set1_ps[3] == +2.0f);
+
+ constexpr __m128 v_mm_set_ps1 = _mm_set_ps1(-2.0f);
+ static_assert(v_mm_set_ps1[0] == -2.0f && v_mm_set_ps1[1] == -2.0f && v_mm_set_ps1[2] == -2.0f && v_mm_set_ps1[3] == -2.0f);
+
+ constexpr __m128 v_mm_set_ps = _mm_set_ps(+0.0f, +1.0f, +2.0f, +3.0f);
+ static_assert(v_mm_set_ps[0] == +3.0f && v_mm_set_ps[1] == +2.0f && v_mm_set_ps[2] == +1.0f && v_mm_set_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_setr_ps = _mm_setr_ps(+0.0f, +1.0f, +2.0f, +3.0f);
+ static_assert(v_mm_setr_ps[0] == +0.0f && v_mm_setr_ps[1] == +1.0f && v_mm_setr_ps[2] == +2.0f && v_mm_setr_ps[3] == +3.0f);
+
+ constexpr __m128 v_mm_setzero_ps = _mm_setzero_ps();
+ static_assert(v_mm_setzero_ps[0] == +0.0f && v_mm_setzero_ps[1] == +0.0f && v_mm_setzero_ps[2] == +0.0f && v_mm_setzero_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_add_ss = _mm_add_ss(k1, k2);
+ static_assert(v_mm_add_ss[0] == +9.0f && v_mm_add_ss[1] == +0.0f && v_mm_add_ss[2] == +2.0f && v_mm_add_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_add_ps = _mm_add_ps(k1, k2);
+ static_assert(v_mm_add_ps[0] == +9.0f && v_mm_add_ps[1] == +4.0f && v_mm_add_ps[2] == +4.0f && v_mm_add_ps[3] == +5.0f);
+
+ constexpr __m128 v_mm_sub_ss = _mm_sub_ss(k1, k2);
+ static_assert(v_mm_sub_ss[0] == -7.0f && v_mm_sub_ss[1] == +0.0f && v_mm_sub_ss[2] == +2.0f && v_mm_sub_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_sub_ps = _mm_sub_ps(k1, k2);
+ static_assert(v_mm_sub_ps[0] == -7.0f && v_mm_sub_ps[1] == -4.0f && v_mm_sub_ps[2] == +0.0f && v_mm_sub_ps[3] == +3.0f);
+
+ constexpr __m128 v_mm_mul_ss = _mm_mul_ss(k1, k2);
+ static_assert(v_mm_mul_ss[0] == +8.0f && v_mm_mul_ss[1] == +0.0f && v_mm_mul_ss[2] == +2.0f && v_mm_mul_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_mul_ps = _mm_mul_ps(k1, k2);
+ static_assert(v_mm_mul_ps[0] == +8.0f && v_mm_mul_ps[1] == +0.0f && v_mm_mul_ps[2] == +4.0f && v_mm_mul_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_div_ss = _mm_div_ss(k1, k2);
+ static_assert(v_mm_div_ss[0] == +0.125f && v_mm_div_ss[1] == +0.0f && v_mm_div_ss[2] == +2.0f && v_mm_div_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_div_ps = _mm_div_ps(k1, k2);
+ static_assert(v_mm_div_ps[0] == +0.125f && v_mm_div_ps[1] == +0.0f && v_mm_div_ps[2] == +1.0f && v_mm_div_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_and_ps = _mm_and_ps(k3, k4);
+ static_assert(v_mm_and_ps[0] == +0.0f && v_mm_and_ps[1] == +0.0f && v_mm_and_ps[2] == +0.0f && v_mm_and_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_andnot_ps = _mm_andnot_ps(k3, k4);
+ static_assert(v_mm_andnot_ps[0] == +0.0f && v_mm_andnot_ps[1] == +0.0f && v_mm_andnot_ps[2] == +0.0f && v_mm_andnot_ps[3] == +0.0f);
+
+ constexpr __m128 v_mm_or_ps = _mm_or_ps(k3, k4);
+ static_assert(v_mm_or_ps[0] == -4.0f && v_mm_or_ps[1] == -5.0f && v_mm_or_ps[2] == -6.0f && v_mm_or_ps[3] == +7.0f);
+
+ constexpr __m128 v_mm_xor_ps = _mm_xor_ps(k3, k4);
+ static_assert(v_mm_xor_ps[0] == -4.0f && v_mm_xor_ps[1] == +5.0f && v_mm_xor_ps[2] == -6.0f && v_mm_xor_ps[3] == +7.0f);
+
+ constexpr __m128 v_mm_unpackhi_ps = _mm_unpackhi_ps(k1, k2);
+ static_assert(v_mm_unpackhi_ps[0] == +2.0f && v_mm_unpackhi_ps[1] == +2.0f && v_mm_unpackhi_ps[2] == +4.0f && v_mm_unpackhi_ps[3] == +1.0f);
+
+ constexpr __m128 v_mm_unpacklo_ps = _mm_unpacklo_ps(k1, k2);
+ static_assert(v_mm_unpacklo_ps[0] == +1.0f && v_mm_unpacklo_ps[1] == +8.0f && v_mm_unpacklo_ps[2] == +0.0f && v_mm_unpacklo_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_move_ss = _mm_move_ss(k1, k2);
+ static_assert(v_mm_move_ss[0] == +8.0f && v_mm_move_ss[1] == +0.0f && v_mm_move_ss[2] == +2.0f && v_mm_move_ss[3] == +4.0f);
+
+ constexpr __m128 v_mm_movehl_ps = _mm_movehl_ps(k1, k2);
+ static_assert(v_mm_movehl_ps[0] == +2.0f && v_mm_movehl_ps[1] == +1.0f && v_mm_movehl_ps[2] == +2.0f && v_mm_movehl_ps[3] == +4.0f);
+
+ constexpr __m128 v_mm_movelh_ps = _mm_movelh_ps(k1, k2);
+ static_assert(v_mm_movelh_ps[0] == +1.0f && v_mm_movelh_ps[1] == +0.0f && v_mm_movelh_ps[2] == +8.0f && v_mm_movelh_ps[3] == +4.0f);
+
+ static_assert(_mm_cvtss_f32(k2) == +8.0f);
+}
+
+#endif
\ No newline at end of file
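As a quick illustration of what this enables on the consumer side, here is a minimal sketch (hypothetical user code, not part of the patch) of compile-time evaluation with these intrinsics in C++11 or later:

// Hypothetical consumer code: with this patch, the basic SSE1 intrinsics
// below can be evaluated entirely at compile time.
#include <xmmintrin.h>

constexpr __m128 half = _mm_set1_ps(0.5f);                  // {0.5, 0.5, 0.5, 0.5}
constexpr __m128 sum  = _mm_add_ps(half, _mm_set_ss(1.0f)); // {1.5, 0.5, 0.5, 0.5}
static_assert(_mm_cvtss_f32(sum) == 1.5f, "folded at compile time");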
You can test this locally with the following command:

git-clang-format --diff 02dd6b1014c708591fa0e8d46efb328c513a86e7 b57d4529615383767bce3714f8924355ff6924ac --extensions c,h -- clang/lib/Headers/xmmintrin.h clang/test/CodeGen/X86/sse-builtins.c

View the diff from clang-format here.

diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 2aa688adef..eb5033eec2 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -83,8 +83,8 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16)));
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
/// of the lower 32 bits of both operands. The upper 96 bits are copied from
/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_add_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ss(__m128 __a,
+ __m128 __b) {
__a[0] += __b[0];
return __a;
}
@@ -102,8 +102,8 @@ _mm_add_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the sums of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_add_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a + (__v4sf)__b);
}
@@ -123,8 +123,8 @@ _mm_add_ps(__m128 __a, __m128 __b) {
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
/// difference of the lower 32 bits of both operands. The upper 96 bits are
/// copied from the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_sub_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ss(__m128 __a,
+ __m128 __b) {
__a[0] -= __b[0];
return __a;
}
@@ -143,8 +143,8 @@ _mm_sub_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the differences between
/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_sub_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a - (__v4sf)__b);
}
@@ -164,8 +164,8 @@ _mm_sub_ps(__m128 __a, __m128 __b) {
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
/// bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_mul_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ss(__m128 __a,
+ __m128 __b) {
__a[0] *= __b[0];
return __a;
}
@@ -183,8 +183,8 @@ _mm_mul_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the products of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_mul_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a * (__v4sf)__b);
}
@@ -204,8 +204,8 @@ _mm_mul_ps(__m128 __a, __m128 __b) {
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
/// lower 32 bits of both operands. The upper 96 bits are copied from the
/// upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_div_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ss(__m128 __a,
+ __m128 __b) {
__a[0] /= __b[0];
return __a;
}
@@ -222,8 +222,8 @@ _mm_div_ss(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing the divisor.
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_div_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4sf)__a / (__v4sf)__b);
}
@@ -437,8 +437,8 @@ _mm_max_ps(__m128 __a, __m128 __b)
/// A 128-bit vector containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_and_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4su)__a & (__v4su)__b);
}
@@ -475,8 +475,8 @@ _mm_andnot_ps(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_or_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4su)__a | (__v4su)__b);
}
@@ -493,8 +493,8 @@ _mm_or_ps(__m128 __a, __m128 __b) {
/// A 128-bit vector of [4 x float] containing one of the source operands.
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
/// of the values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_xor_ps(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_ps(__m128 __a,
+ __m128 __b) {
return (__m128)((__v4su)__a ^ (__v4su)__b);
}
@@ -1734,8 +1734,7 @@ _mm_cvt_pi2ps(__m128 __a, __m64 __b)
/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
/// used in the extraction.
/// \returns A 32-bit float containing the extracted value.
-static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_cvtss_f32(__m128 __a) {
+static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtss_f32(__m128 __a) {
return __a[0];
}
@@ -1926,8 +1925,7 @@ _mm_undefined_ps(void)
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
/// lower 32 bits contain the value provided in the source operand. The
/// upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set_ss(float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ss(float __w) {
return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
}
@@ -1943,8 +1941,7 @@ _mm_set_ss(float __w) {
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set1_ps(float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_ps(float __w) {
return __extension__ (__m128){ __w, __w, __w, __w };
}
@@ -1961,9 +1958,8 @@ _mm_set1_ps(float __w) {
/// A single-precision floating-point value used to initialize each vector
/// element of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set_ps1(float __w) {
- return _mm_set1_ps(__w);
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps1(float __w) {
+ return _mm_set1_ps(__w);
}
/// Constructs a 128-bit floating-point vector of [4 x float]
@@ -1987,8 +1983,10 @@ _mm_set_ps1(float __w) {
/// A single-precision floating-point value used to initialize bits [31:0]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_set_ps(float __z, float __y, float __x, float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_ps(float __z,
+ float __y,
+ float __x,
+ float __w) {
return __extension__ (__m128){ __w, __x, __y, __z };
}
@@ -2014,8 +2012,10 @@ _mm_set_ps(float __z, float __y, float __x, float __w) {
/// A single-precision floating-point value used to initialize bits [127:96]
/// of the result.
/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_setr_ps(float __z, float __y, float __x, float __w) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_ps(float __z,
+ float __y,
+ float __x,
+ float __w) {
return __extension__ (__m128){ __z, __y, __x, __w };
}
@@ -2028,8 +2028,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w) {
///
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
/// all elements set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_setzero_ps(void) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_ps(void) {
return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
@@ -2817,8 +2816,8 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b) {
/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
/// written to the lower 32 bits of the result.
/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_move_ss(__m128 __a, __m128 __b) {
+static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_move_ss(__m128 __a,
+ __m128 __b) {
__a[0] = __b[0];
return __a;
}
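A side note on the bitwise-logic tests above: they rely on IEEE-754 bit patterns, where -0.0f sets only the sign bit. A minimal sketch (illustrative, assuming the constexpr support added by this patch) of why the XOR expectations hold:

// Illustrative only: XOR with a mask whose lanes are -0.0f toggles just the
// sign bit of those lanes, leaving magnitudes intact. This mirrors the k3/k4
// values used in test_constexpr().
constexpr __m128 v    = _mm_setr_ps(-4.0f, -5.0f, +6.0f, +7.0f);
constexpr __m128 mask = _mm_setr_ps(+0.0f, -0.0f, -0.0f, +0.0f);
constexpr __m128 r    = _mm_xor_ps(v, mask);                // {-4, +5, -6, +7}
static_assert(_mm_cvtss_f32(r) == -4.0f, "lane 0: XOR with +0.0f is a no-op");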
Force-pushed from d3e47ff to 34f0bfa
LGTM.
This is an initial patch to enable constexpr support on the more basic SSE1 intrinsics - such as initialization, arithmetic, logic and fixed shuffles. The plan is to incrementally extend this for SSE2/AVX etc. - initially for the equivalent basic intrinsics, but we can add support for some of the ia32 builtins as well as the need arises.
Force-pushed from 34f0bfa to b57d452
…mm_cvtsi64_ss SSE1 intrinsics Followup to #111001
…mm_cvtsi64_ss SSE1 intrinsics Followup to llvm#111001