Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: clang build fixes and workarounds #16

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
24 changes: 17 additions & 7 deletions experimental/bits/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double;
using __m512i [[__gnu__::__vector_size__(64)]] = long long;
#endif

#if __clang__
template<typename T> auto __builtin_ia32_ps256_ps (T x) { return __builtin_shufflevector(x, _mm_setzero_ps() , 0, 1, 2, 3, 4, 4, 4, 4); }
template<typename T> auto __builtin_ia32_ps512_ps (T x) { return __builtin_shufflevector(x, _mm_setzero_ps() , 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4); }
template<typename T> auto __builtin_ia32_ps512_256ps(T x) { return __builtin_shufflevector(x, _mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8); }
#endif

// __next_power_of_2{{{
/**
* \internal
Expand Down Expand Up @@ -178,7 +184,7 @@ using __value_type_or_identity_t
// }}}
// __is_vectorizable {{{
template <typename _Tp>
struct __is_vectorizable : public std::is_arithmetic<_Tp>
struct __is_vectorizable : public std::is_arithmetic<std::remove_reference_t<_Tp>>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this needed? _Tp should never be a reference. And references are not vectorizable. (Pointers might be - needs a proposal)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clang by some reason returns long long&& out of operator[] in my example:
https://godbolt.org/z/WJN7_M

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then whatever calls __is_vectorizable<decltype(x[0])> is incorrect.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thus a number of remove_references will be required to workaround that thing across other places.

{
};
template <> struct __is_vectorizable<bool> : public false_type
Expand Down Expand Up @@ -381,7 +387,12 @@ __is_neon_abi()

// }}}
// __make_dependent_t {{{
template <typename, typename _Up> using __make_dependent_t = _Up;
template <typename, typename _Up> struct __make_dependent
{
using type = _Up;
};
template <typename _Tp, typename _Up>
using __make_dependent_t = typename __make_dependent<_Tp, _Up>::type;

// }}}
// ^^^ ---- type traits ---- ^^^
Expand Down Expand Up @@ -1039,7 +1050,7 @@ template <size_t _Np, bool _Sanitized> struct _BitMask
"not implemented for bitmasks larger than one ullong");
if constexpr (_NewSize == 1) // must sanitize because the return _Tp is bool
return _SanitizedBitMask<1>{
{static_cast<bool>(_M_bits[0] & (_Tp(1) << _DropLsb))}};
(static_cast<bool>(_M_bits[0] & (_Tp(1) << _DropLsb)))};
else
return _BitMask<_NewSize,
((_NewSize + _DropLsb == sizeof(_Tp) * CHAR_BIT
Expand Down Expand Up @@ -1285,7 +1296,7 @@ struct __vector_type_n<_Tp, _Np,
static constexpr size_t _Bytes = _Np * sizeof(_Tp) < __min_vector_size<_Tp>
? __min_vector_size<_Tp>
: __next_power_of_2(_Np * sizeof(_Tp));
using type [[__gnu__::__vector_size__(_Bytes)]] = _Tp;
using type [[__gnu__::__vector_size__(_Bytes)]] = std::remove_reference_t<_Tp>;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again, _Tp should never be a reference

};

template <typename _Tp, size_t _Bytes, size_t = _Bytes % sizeof(_Tp)>
Expand Down Expand Up @@ -2068,7 +2079,7 @@ struct __intrinsic_type<
static constexpr std::size_t _VBytes
= _Bytes <= 16 ? 16 : _Bytes <= 32 ? 32 : 64;
using type [[__gnu__::__vector_size__(_VBytes)]]
= std::conditional_t<std::is_integral_v<_Tp>, long long int, _Tp>;
= std::conditional_t<std::is_integral_v<std::remove_reference_t<_Tp>>, long long int, std::remove_reference_t<_Tp>>;
};
#endif // _GLIBCXX_SIMD_HAVE_SSE

Expand Down Expand Up @@ -3559,8 +3570,7 @@ split(const simd_mask<typename _V::simd_type::value_type, _Ap>& __x)

// }}}
// split<_Sizes...>(simd) {{{
template <size_t... _Sizes, typename _Tp, typename _Ap,
typename = enable_if_t<((_Sizes + ...) == simd<_Tp, _Ap>::size())>>
template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removing the SFINAE condition breaks the spec

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SFINAE is there in declaration. Here it is definition. clang says it is redefinition of default argument

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Declaration is at line 3314

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK thanks. I'll take a look

_GLIBCXX_SIMD_ALWAYS_INLINE
std::tuple<simd<_Tp, simd_abi::deduce_t<_Tp, _Sizes>>...>
split(const simd<_Tp, _Ap>& __x)
Expand Down
8 changes: 4 additions & 4 deletions experimental/bits/simd_builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@
_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _S_allbits{{{
template <typename _V>
static inline constexpr _V _S_allbits
static inline _GLIBCXX_CONSTEXPR_SIMD _V _S_allbits
= reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());

// }}}
// _S_signmask, _S_absmask{{{
template <typename _V, typename = _VectorTraits<_V>>
static inline constexpr _V _S_signmask = __xor(_V() + 1, _V() - 1);
static inline _GLIBCXX_CONSTEXPR_SIMD _V _S_signmask = __xor(_V() + 1, _V() - 1);
template <typename _V, typename = _VectorTraits<_V>>
static inline constexpr _V _S_absmask
static inline _GLIBCXX_CONSTEXPR_SIMD _V _S_absmask
= __andnot(_S_signmask<_V>, _S_allbits<_V>);

//}}}
Expand Down Expand Up @@ -624,7 +624,7 @@ __convert_all(_From __v)
return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
};
[[maybe_unused]] const auto __vi = __to_intrin(__v);
auto&& __make_array = [](std::initializer_list<auto> __xs) {
auto&& __make_array = [](auto __xs) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can't work. An initializer list argument cannot be deduced as one. But I have a fix coming up for this. Erich mentioned it yesterday.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we talked with Erich yesterday about that.

return __call_with_subscripts(
__xs.begin(), std::make_index_sequence<_Np>(),
[](auto... __ys) { return _R{__vector_bitcast<_ToT>(__ys)...}; });
Expand Down
6 changes: 6 additions & 0 deletions experimental/bits/simd_detail.h
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,12 @@
#define _GLIBCXX_SIMD_FIX_P2TS_ISSUE66 1
// }}}

#if __clang__
#define _GLIBCXX_CONSTEXPR_SIMD
#else
#define _GLIBCXX_CONSTEXPR_SIMD constexpr
#endif
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My naming convention has been to prefix everything with _GLIBCXX_SIMD_. I understand naming the macro "constexpr simd" is trying to explain the purpose of the macro. SIMD_CONSTEXPR_SIMD is just confusing again, though.
<bits/c++config> has _GLIBCXX_USE_CONSTEXPR which seems like a good fit here. I.e. name it _GLIBCXX_SIMD_USE_CONSTEXPR.
And define the __clang__ branch to const.


#endif // __cplusplus >= 201703L
#endif // _GLIBCXX_EXPERIMENTAL_SIMD_DETAIL_H_

Expand Down
10 changes: 5 additions & 5 deletions experimental/bits/simd_x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -4225,7 +4225,7 @@ struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
__m128i __a = {};
__builtin_memcpy(&__a, __mem, 16);
const auto __b = _mm512_cvtepi8_epi32(__a);
__builtin_memcpy(&__a, __mem + 16, size<_Tp> - 16);
__builtin_memcpy(&__a, static_cast<const char*>(__mem) + 16, size<_Tp> - 16);
const auto __c = _mm512_cvtepi8_epi32(__a);
return _mm512_test_epi32_mask(__b, __b)
| (_mm512_test_epi32_mask(__c, __c) << 16);
Expand All @@ -4235,21 +4235,21 @@ struct _MaskImplX86 : _MaskImplX86Mixin, _MaskImplBuiltin<_Abi>
__m128i __a = {};
__builtin_memcpy(&__a, __mem, 16);
const auto __b = _mm512_cvtepi8_epi32(__a);
__builtin_memcpy(&__a, __mem + 16, 16);
__builtin_memcpy(&__a, static_cast<const char*>(__mem) + 16, 16);
const auto __c = _mm512_cvtepi8_epi32(__a);
if constexpr (size<_Tp> <= 48)
{
__builtin_memcpy(&__a, __mem + 32, size<_Tp> - 32);
__builtin_memcpy(&__a, static_cast<const char*>(__mem) + 32, size<_Tp> - 32);
const auto __d = _mm512_cvtepi8_epi32(__a);
return _mm512_test_epi32_mask(__b, __b)
| (_mm512_test_epi32_mask(__c, __c) << 16)
| (_ULLong(_mm512_test_epi32_mask(__d, __d)) << 32);
}
else
{
__builtin_memcpy(&__a, __mem + 16, 32);
__builtin_memcpy(&__a, static_cast<const char*>(__mem) + 16, 16);
const auto __d = _mm512_cvtepi8_epi32(__a);
__builtin_memcpy(&__a, __mem + 32, size<_Tp> - 48);
__builtin_memcpy(&__a, static_cast<const char*>(__mem) + 32, size<_Tp> - 48);
const auto __e = _mm512_cvtepi8_epi32(__a);
return _mm512_test_epi32_mask(__b, __b)
| (_mm512_test_epi32_mask(__c, __c) << 16)
Expand Down
2 changes: 2 additions & 0 deletions experimental/simd
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@
#pragma GCC diagnostic push
// Many [[gnu::vector_size(N)]] types might lead to a -Wpsabi warning which is
// irrelevant as those functions never appear on ABI borders
#if !__clang__
#pragma GCC diagnostic ignored "-Wpsabi"
#endif

// If __OPTIMIZE__ is not defined some intrinsics are defined as macros, making
// use of C casts internally. This requires us to disable the warning as it
Expand Down