diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index 6751ef20c902d..b2fcd558328e7 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -734,8 +734,8 @@ std::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sin(T x) __NOEXC { // svgenfloat sincos (svgenfloat x, genfloatptr cosval) template -std::enable_if_t< - detail::is_svgenfloat::value && detail::is_genfloatptr::value, T> +std::enable_if_t<__FAST_MATH_GENFLOAT(T) && detail::is_genfloatptr::value, + T> sincos(T x, T2 cosval) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_sincos(x, cosval); @@ -2500,6 +2500,23 @@ std::enable_if_t::value, T> cos(T x) __NOEXC { return native::cos(x); } +// svgenfloatf sincos (svgenfloatf x, genfloatptr cosval) +// This is a performance optimization to ensure that sincos isn't slower than a +// pair of sin/cos executed separately. Theoretically, calling non-native sincos +// might be faster than calling native::sin plus native::cos separately and we'd +// need some kind of cost model to make the right decision (and move this +// entirely to the JIT/AOT compilers). However, in practice, this simpler +// solution seems to work just fine and matches how sin/cos above are optimized +// for the fast math path. +template +std::enable_if_t< + detail::is_svgenfloatf::value && detail::is_genfloatptr::value, T> +sincos(T x, T2 cosval) __NOEXC { + detail::check_vector_size(); + *cosval = native::cos(x); + return native::sin(x); +} + // svgenfloatf exp (svgenfloatf x) template std::enable_if_t::value, T> exp(T x) __NOEXC {