diff --git a/alc/alu.cpp b/alc/alu.cpp
index 63400e12da..bbb5f419a3 100644
--- a/alc/alu.cpp
+++ b/alc/alu.cpp
@@ -203,6 +203,14 @@ inline ResamplerFunc SelectResampler(Resampler resampler, uint increment)
         if((CPUCapFlags&CPU_CAP_NEON))
             return Resample_<CubicTag,NEONTag>;
 #endif
+#ifdef HAVE_SSE4_1
+        if((CPUCapFlags&CPU_CAP_SSE4_1))
+            return Resample_<CubicTag,SSE4Tag>;
+#endif
+#ifdef HAVE_SSE2
+        if((CPUCapFlags&CPU_CAP_SSE2))
+            return Resample_<CubicTag,SSE2Tag>;
+#endif
 #ifdef HAVE_SSE
         if((CPUCapFlags&CPU_CAP_SSE))
             return Resample_<CubicTag,SSETag>;
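Note on the dispatch above: SelectResampler tests the CPU capability flags in
decreasing order of preference, so the new SSE4.1 cubic resampler is picked
over the SSE2 one, which in turn beats plain SSE. A minimal sketch of the
pattern, with illustrative names (CpuCaps, ResampleC, ...) rather than the
library's actual identifiers:

    #include <cstdio>

    using ResampleFn = void(*)();

    void ResampleC()     { std::puts("C fallback"); }
    void ResampleSse2()  { std::puts("SSE2"); }
    void ResampleSse41() { std::puts("SSE4.1"); }

    enum CpuCaps : unsigned { kHasSse2 = 1u<<0, kHasSse41 = 1u<<1 };

    ResampleFn SelectCubicResampler(unsigned caps)
    {
        /* The more specialized check comes first; an SSE4.1-capable CPU
         * also reports SSE2. */
        if(caps & kHasSse41) return ResampleSse41;
        if(caps & kHasSse2) return ResampleSse2;
        return ResampleC; /* always-available fallback */
    }

    int main()
    { SelectCubicResampler(kHasSse2|kHasSse41)(); } /* prints "SSE4.1" */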
diff --git a/core/mixer/mixer_neon.cpp b/core/mixer/mixer_neon.cpp
index 57b7b638e4..4cb5cebddd 100644
--- a/core/mixer/mixer_neon.cpp
+++ b/core/mixer/mixer_neon.cpp
@@ -29,6 +29,8 @@ struct FastBSincTag;
 #pragma GCC target("fpu=neon")
 #endif
 
+using uint = unsigned int;
+
 namespace {
 
 constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
@@ -39,6 +41,19 @@ constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
 constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
 constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
 
+force_inline
+void vtranspose4(float32x4_t &x0, float32x4_t &x1, float32x4_t &x2, float32x4_t &x3) noexcept
+{
+    float32x4x2_t t0_{vzipq_f32(x0, x2)};
+    float32x4x2_t t1_{vzipq_f32(x1, x3)};
+    float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
+    float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
+    x0 = u0_.val[0];
+    x1 = u0_.val[1];
+    x2 = u1_.val[0];
+    x3 = u1_.val[1];
+}
+
 inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
 {
     float32x4_t ret{vmovq_n_f32(l0)};
@@ -150,42 +165,42 @@ void Resample_<LerpTag,NEONTag>(const InterpState*, const float *src, uint frac,
 {
     ASSUME(frac < MixerFracOne);
 
-    const int32x4_t increment4 = vdupq_n_s32(static_cast<int>(increment*4));
+    const uint32x4_t increment4 = vdupq_n_u32(increment*4u);
     const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
-    const int32x4_t fracMask4 = vdupq_n_s32(MixerFracMask);
+    const uint32x4_t fracMask4 = vdupq_n_u32(MixerFracMask);
 
     alignas(16) std::array<uint,4> pos_, frac_;
     InitPosArrays(frac, increment, al::span{frac_}, al::span{pos_});
-    int32x4_t frac4 = vld1q_s32(reinterpret_cast<int*>(frac_.data()));
-    int32x4_t pos4 = vld1q_s32(reinterpret_cast<int*>(pos_.data()));
+    uint32x4_t frac4 = vld1q_u32(frac_.data());
+    uint32x4_t pos4 = vld1q_u32(pos_.data());
 
     auto dst_iter = dst.begin();
     for(size_t todo{dst.size()>>2};todo;--todo)
     {
-        const int pos0{vgetq_lane_s32(pos4, 0)};
-        const int pos1{vgetq_lane_s32(pos4, 1)};
-        const int pos2{vgetq_lane_s32(pos4, 2)};
-        const int pos3{vgetq_lane_s32(pos4, 3)};
+        const uint pos0{vgetq_lane_u32(pos4, 0)};
+        const uint pos1{vgetq_lane_u32(pos4, 1)};
+        const uint pos2{vgetq_lane_u32(pos4, 2)};
+        const uint pos3{vgetq_lane_u32(pos4, 3)};
         const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
-        const float32x4_t val2{set_f4(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};
+        const float32x4_t val2{set_f4(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz], src[pos3+1_uz])};
 
         /* val1 + (val2-val1)*mu */
         const float32x4_t r0{vsubq_f32(val2, val1)};
-        const float32x4_t mu{vmulq_f32(vcvtq_f32_s32(frac4), fracOne4)};
+        const float32x4_t mu{vmulq_f32(vcvtq_f32_u32(frac4), fracOne4)};
         const float32x4_t out{vmlaq_f32(val1, mu, r0)};
 
         vst1q_f32(dst_iter, out);
         dst_iter += 4;
 
-        frac4 = vaddq_s32(frac4, increment4);
-        pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, MixerFracBits));
-        frac4 = vandq_s32(frac4, fracMask4);
+        frac4 = vaddq_u32(frac4, increment4);
+        pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
+        frac4 = vandq_u32(frac4, fracMask4);
     }
 
     if(size_t todo{dst.size()&3})
     {
-        src += static_cast<uint>(vgetq_lane_s32(pos4, 0));
-        frac = static_cast<uint>(vgetq_lane_s32(frac4, 0));
+        src += vgetq_lane_u32(pos4, 0);
+        frac = vgetq_lane_u32(frac4, 0);
 
         do {
             *(dst_iter++) = lerpf(src[0], src[1], static_cast<float>(frac) * (1.0f/MixerFracOne));
@@ -205,29 +220,86 @@ void Resample_<CubicTag,NEONTag>(const InterpState *state, const float *src, uint frac,
     const auto *filter = al::assume_aligned<16>(std::get<CubicState>(*state).filter);
 
+    const uint32x4_t increment4{vdupq_n_u32(increment*4u)};
+    const uint32x4_t fracMask4{vdupq_n_u32(MixerFracMask)};
+    const float32x4_t fracDiffOne4{vdupq_n_f32(1.0f/CubicPhaseDiffOne)};
+    const uint32x4_t fracDiffMask4{vdupq_n_u32(CubicPhaseDiffMask)};
+
+    alignas(16) std::array<uint,4> pos_, frac_;
+    InitPosArrays(frac, increment, al::span{frac_}, al::span{pos_});
+    uint32x4_t frac4{vld1q_u32(frac_.data())};
+    uint32x4_t pos4{vld1q_u32(pos_.data())};
+
     src -= 1;
-    std::generate(dst.begin(), dst.end(), [&src,&frac,increment,filter]() -> float
+    auto dst_iter = dst.begin();
+    for(size_t todo{dst.size()>>2};todo;--todo)
     {
-        const uint pi{frac >> CubicPhaseDiffBits};
-        const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
-        const float32x4_t pf4{vdupq_n_f32(pf)};
+        const uint pos0{vgetq_lane_u32(pos4, 0)};
+        const uint pos1{vgetq_lane_u32(pos4, 1)};
+        const uint pos2{vgetq_lane_u32(pos4, 2)};
+        const uint pos3{vgetq_lane_u32(pos4, 3)};
+        const float32x4_t val0{vld1q_f32(src+pos0)};
+        const float32x4_t val1{vld1q_f32(src+pos1)};
+        const float32x4_t val2{vld1q_f32(src+pos2)};
+        const float32x4_t val3{vld1q_f32(src+pos3)};
+
+        const uint32x4_t pi4{vshrq_n_u32(frac4, CubicPhaseDiffBits)};
+        const uint pi0{vgetq_lane_u32(pi4, 0)};
+        const uint pi1{vgetq_lane_u32(pi4, 1)};
+        const uint pi2{vgetq_lane_u32(pi4, 2)};
+        const uint pi3{vgetq_lane_u32(pi4, 3)};
+
+        const float32x4_t pf4{vmulq_f32(vcvtq_f32_u32(vandq_u32(frac4, fracDiffMask4)),
+            fracDiffOne4)};
+
+        float32x4_t r0{vmulq_f32(val0,
+            vmlaq_f32(vld1q_f32(filter[pi0].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 0),
+                vld1q_f32(filter[pi0].mDeltas.data())))};
+        float32x4_t r1{vmulq_f32(val1,
+            vmlaq_f32(vld1q_f32(filter[pi1].mCoeffs.data()), vdupq_lane_f32(vget_low_f32(pf4), 1),
+                vld1q_f32(filter[pi1].mDeltas.data())))};
+        float32x4_t r2{vmulq_f32(val2,
+            vmlaq_f32(vld1q_f32(filter[pi2].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 0),
+                vld1q_f32(filter[pi2].mDeltas.data())))};
+        float32x4_t r3{vmulq_f32(val3,
+            vmlaq_f32(vld1q_f32(filter[pi3].mCoeffs.data()), vdupq_lane_f32(vget_high_f32(pf4), 1),
+                vld1q_f32(filter[pi3].mDeltas.data())))};
+
+        vtranspose4(r0, r1, r2, r3);
+        r0 = vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));
+
+        vst1q_f32(dst_iter, r0);
+        dst_iter += 4;
 
-        /* Apply the phase interpolated filter. */
+        frac4 = vaddq_u32(frac4, increment4);
+        pos4 = vaddq_u32(pos4, vshrq_n_u32(frac4, MixerFracBits));
+        frac4 = vandq_u32(frac4, fracMask4);
+    }
 
-        /* f = fil + pf*phd */
-        const float32x4_t f4 = vmlaq_f32(vld1q_f32(filter[pi].mCoeffs.data()), pf4,
-            vld1q_f32(filter[pi].mDeltas.data()));
-        /* r = f*src */
-        float32x4_t r4{vmulq_f32(f4, vld1q_f32(src))};
+    if(const size_t todo{dst.size()&3})
+    {
+        src += vgetq_lane_u32(pos4, 0);
+        frac = vgetq_lane_u32(frac4, 0);
 
-        r4 = vaddq_f32(r4, vrev64q_f32(r4));
-        const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
+        std::generate(dst.end()-todo, dst.end(), [&src,&frac,increment,filter]() -> float
+        {
+            const uint pi{frac >> CubicPhaseDiffBits};
+            const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+            const float32x4_t pf4{vdupq_n_f32(pf)};
 
-        frac += increment;
-        src += frac>>MixerFracBits;
-        frac &= MixerFracMask;
-        return output;
-    });
+            const float32x4_t f4{vmlaq_f32(vld1q_f32(filter[pi].mCoeffs.data()), pf4,
+                vld1q_f32(filter[pi].mDeltas.data()))};
+            float32x4_t r4{vmulq_f32(f4, vld1q_f32(src))};
+
+            r4 = vaddq_f32(r4, vrev64q_f32(r4));
+            const float output{vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0)};
+
+            frac += increment;
+            src += frac>>MixerFracBits;
+            frac &= MixerFracMask;
+            return output;
+        });
+    }
 }
 
 template<>
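In the new NEON cubic loop, each of r0..r3 holds the four weighted filter
taps for one output sample. vtranspose4 then rearranges the registers so that
matching taps line up, letting three vector adds reduce all four dot products
at once. The helper builds a 4x4 transpose out of two rounds of vzipq_f32;
the scalar model below reproduces that lane shuffling (Vec4 and zip are
stand-ins for the NEON vector type and intrinsic):

    #include <array>
    #include <cstdio>

    using Vec4 = std::array<float,4>;

    /* Like vzipq_f32: interleave the lanes of a and b into two vectors. */
    std::array<Vec4,2> zip(const Vec4 &a, const Vec4 &b)
    { return {Vec4{a[0],b[0],a[1],b[1]}, Vec4{a[2],b[2],a[3],b[3]}}; }

    void transpose4(Vec4 &x0, Vec4 &x1, Vec4 &x2, Vec4 &x3)
    {
        const auto t0 = zip(x0, x2); /* pair rows 0 and 2 */
        const auto t1 = zip(x1, x3); /* pair rows 1 and 3 */
        const auto u0 = zip(t0[0], t1[0]);
        const auto u1 = zip(t0[1], t1[1]);
        x0 = u0[0]; x1 = u0[1]; x2 = u1[0]; x3 = u1[1];
    }

    int main()
    {
        Vec4 r0{0,1,2,3}, r1{4,5,6,7}, r2{8,9,10,11}, r3{12,13,14,15};
        transpose4(r0, r1, r2, r3);
        /* prints "0 4 8 12": the first column became the first row */
        std::printf("%g %g %g %g\n", r0[0], r0[1], r0[2], r0[3]);
    }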
diff --git a/core/mixer/mixer_sse2.cpp b/core/mixer/mixer_sse2.cpp
index 325877b2b0..7e60a6d799 100644
--- a/core/mixer/mixer_sse2.cpp
+++ b/core/mixer/mixer_sse2.cpp
@@ -28,17 +28,32 @@
 #include "alnumeric.h"
 #include "alspan.h"
+#include "core/cubic_defs.h"
 #include "defs.h"
 #include "opthelpers.h"
 
 struct SSE2Tag;
 struct LerpTag;
+struct CubicTag;
 
 #if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE2__)
 #pragma GCC target("sse2")
 #endif
 
+using uint = unsigned int;
+
+namespace {
+
+constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
+constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
+constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
+
+force_inline __m128 vmadd(const __m128 x, const __m128 y, const __m128 z) noexcept
+{ return _mm_add_ps(x, _mm_mul_ps(y, z)); }
+
+} // namespace
+
 template<>
 void Resample_<LerpTag,SSE2Tag>(const InterpState*, const float *src, uint frac,
     const uint increment, const al::span<float> dst)
@@ -59,12 +74,12 @@ void Resample_<LerpTag,SSE2Tag>(const InterpState*, const float *src, uint frac,
     auto dst_iter = dst.begin();
     for(size_t todo{dst.size()>>2};todo;--todo)
     {
-        const int pos0{_mm_cvtsi128_si32(pos4)};
-        const int pos1{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 4))};
-        const int pos2{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 8))};
-        const int pos3{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 12))};
-        const __m128 val1{_mm_setr_ps(src[pos0 ], src[pos1 ], src[pos2 ], src[pos3 ])};
-        const __m128 val2{_mm_setr_ps(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};
+        const auto pos0 = static_cast<uint>(_mm_cvtsi128_si32(pos4));
+        const auto pos1 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 4)));
+        const auto pos2 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 8)));
+        const auto pos3 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 12)));
+        const __m128 val1{_mm_setr_ps(src[pos0], src[pos1], src[pos2], src[pos3])};
+        const __m128 val2{_mm_setr_ps(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz], src[pos3+1_uz])};
 
         /* val1 + (val2-val1)*mu */
         const __m128 r0{_mm_sub_ps(val2, val1)};
@@ -93,3 +108,100 @@ void Resample_<LerpTag,SSE2Tag>(const InterpState*, const float *src, uint frac,
         } while(--todo);
     }
 }
+
+template<>
+void Resample_<CubicTag,SSE2Tag>(const InterpState *state, const float *src, uint frac,
+    const uint increment, const al::span<float> dst)
+{
+    ASSUME(frac < MixerFracOne);
+
+    const auto *filter = al::assume_aligned<16>(std::get<CubicState>(*state).filter);
+
+    const __m128i increment4{_mm_set1_epi32(static_cast<int>(increment*4))};
+    const __m128i fracMask4{_mm_set1_epi32(MixerFracMask)};
+    const __m128 fracDiffOne4{_mm_set1_ps(1.0f/CubicPhaseDiffOne)};
+    const __m128i fracDiffMask4{_mm_set1_epi32(CubicPhaseDiffMask)};
+
+    alignas(16) std::array<uint,4> pos_, frac_;
+    InitPosArrays(frac, increment, al::span{frac_}, al::span{pos_});
+    __m128i frac4{_mm_setr_epi32(static_cast<int>(frac_[0]), static_cast<int>(frac_[1]),
+        static_cast<int>(frac_[2]), static_cast<int>(frac_[3]))};
+    __m128i pos4{_mm_setr_epi32(static_cast<int>(pos_[0]), static_cast<int>(pos_[1]),
+        static_cast<int>(pos_[2]), static_cast<int>(pos_[3]))};
+
+    src -= 1;
+    auto dst_iter = dst.begin();
+    for(size_t todo{dst.size()>>2};todo;--todo)
+    {
+        const auto pos0 = static_cast<uint>(_mm_cvtsi128_si32(pos4));
+        const auto pos1 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 4)));
+        const auto pos2 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 8)));
+        const auto pos3 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pos4, 12)));
+        const __m128 val0{_mm_loadu_ps(src+pos0)};
+        const __m128 val1{_mm_loadu_ps(src+pos1)};
+        const __m128 val2{_mm_loadu_ps(src+pos2)};
+        const __m128 val3{_mm_loadu_ps(src+pos3)};
+
+        const __m128i pi4{_mm_srli_epi32(frac4, CubicPhaseDiffBits)};
+        const auto pi0 = static_cast<uint>(_mm_cvtsi128_si32(pi4));
+        const auto pi1 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pi4, 4)));
+        const auto pi2 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pi4, 8)));
+        const auto pi3 = static_cast<uint>(_mm_cvtsi128_si32(_mm_srli_si128(pi4, 12)));
+
+        const __m128 pf4{_mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(frac4, fracDiffMask4)),
+            fracDiffOne4)};
+
+        __m128 r0{_mm_mul_ps(val0,
+            vmadd(_mm_load_ps(filter[pi0].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(0, 0, 0, 0)),
+                _mm_load_ps(filter[pi0].mDeltas.data())))};
+        __m128 r1{_mm_mul_ps(val1,
+            vmadd(_mm_load_ps(filter[pi1].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(1, 1, 1, 1)),
+                _mm_load_ps(filter[pi1].mDeltas.data())))};
+        __m128 r2{_mm_mul_ps(val2,
+            vmadd(_mm_load_ps(filter[pi2].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(2, 2, 2, 2)),
+                _mm_load_ps(filter[pi2].mDeltas.data())))};
+        __m128 r3{_mm_mul_ps(val3,
+            vmadd(_mm_load_ps(filter[pi3].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(3, 3, 3, 3)),
+                _mm_load_ps(filter[pi3].mDeltas.data())))};
+
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        r0 = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+
+        _mm_store_ps(dst_iter, r0);
+        dst_iter += 4;
+
+        frac4 = _mm_add_epi32(frac4, increment4);
+        pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, MixerFracBits));
+        frac4 = _mm_and_si128(frac4, fracMask4);
+    }
+
+    if(const size_t todo{dst.size()&3})
+    {
+        src += static_cast<uint>(_mm_cvtsi128_si32(pos4));
+        frac = static_cast<uint>(_mm_cvtsi128_si32(frac4));
+
+        std::generate(dst.end()-todo, dst.end(), [&src,&frac,increment,filter]() -> float
+        {
+            const uint pi{frac >> CubicPhaseDiffBits};
+            const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+            const __m128 pf4{_mm_set1_ps(pf)};
+
+            const __m128 f4 = vmadd(_mm_load_ps(filter[pi].mCoeffs.data()), pf4,
+                _mm_load_ps(filter[pi].mDeltas.data()));
+            __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(src))};
+
+            r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+            r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+            const float output{_mm_cvtss_f32(r4)};
+
+            frac += increment;
+            src += frac>>MixerFracBits;
+            frac &= MixerFracMask;
+            return output;
+        });
+    }
+}
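SSE2 has no fused multiply-add, so the vmadd helper above expands to a
separate _mm_mul_ps and _mm_add_ps. The loop bookkeeping matches the NEON
version: each iteration adds 4*increment to the per-lane fixed-point
fractions, carries any whole-sample overflow into the positions, and masks
the fractions back into range. A scalar sketch of one lane of that scheme,
with illustrative values for FracBits and the increment:

    #include <cstdio>

    constexpr unsigned FracBits = 12;
    constexpr unsigned FracOne = 1u << FracBits;
    constexpr unsigned FracMask = FracOne - 1u;

    int main()
    {
        unsigned pos = 0, frac = 0;
        const unsigned increment = 3u*FracOne/2u; /* step 1.5 source samples */
        for(int i = 0;i < 4;++i)
        {
            std::printf("output %d reads src[%u], frac %u/%u\n", i, pos, frac, FracOne);
            frac += increment;
            pos += frac >> FracBits; /* carry whole samples into the position */
            frac &= FracMask;        /* keep the fractional remainder */
        }
    }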
diff --git a/core/mixer/mixer_sse41.cpp b/core/mixer/mixer_sse41.cpp
index c777401b36..6e5d4d1042 100644
--- a/core/mixer/mixer_sse41.cpp
+++ b/core/mixer/mixer_sse41.cpp
@@ -29,17 +29,32 @@
 #include "alnumeric.h"
 #include "alspan.h"
+#include "core/cubic_defs.h"
 #include "defs.h"
 #include "opthelpers.h"
 
 struct SSE4Tag;
 struct LerpTag;
+struct CubicTag;
 
 #if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE4_1__)
 #pragma GCC target("sse4.1")
 #endif
 
+using uint = unsigned int;
+
+namespace {
+
+constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
+constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
+constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
+
+force_inline __m128 vmadd(const __m128 x, const __m128 y, const __m128 z) noexcept
+{ return _mm_add_ps(x, _mm_mul_ps(y, z)); }
+
+} // namespace
+
 template<>
 void Resample_<LerpTag,SSE4Tag>(const InterpState*, const float *src, uint frac,
     const uint increment, const al::span<float> dst)
@@ -60,12 +75,12 @@ void Resample_<LerpTag,SSE4Tag>(const InterpState*, const float *src, uint frac,
     auto dst_iter = dst.begin();
     for(size_t todo{dst.size()>>2};todo;--todo)
    {
-        const int pos0{_mm_extract_epi32(pos4, 0)};
-        const int pos1{_mm_extract_epi32(pos4, 1)};
-        const int pos2{_mm_extract_epi32(pos4, 2)};
-        const int pos3{_mm_extract_epi32(pos4, 3)};
-        const __m128 val1{_mm_setr_ps(src[pos0 ], src[pos1 ], src[pos2 ], src[pos3 ])};
-        const __m128 val2{_mm_setr_ps(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};
+        const auto pos0 = static_cast<uint>(_mm_extract_epi32(pos4, 0));
+        const auto pos1 = static_cast<uint>(_mm_extract_epi32(pos4, 1));
+        const auto pos2 = static_cast<uint>(_mm_extract_epi32(pos4, 2));
+        const auto pos3 = static_cast<uint>(_mm_extract_epi32(pos4, 3));
+        const __m128 val1{_mm_setr_ps(src[pos0], src[pos1], src[pos2], src[pos3])};
+        const __m128 val2{_mm_setr_ps(src[pos0+1_uz], src[pos1+1_uz], src[pos2+1_uz], src[pos3+1_uz])};
 
         /* val1 + (val2-val1)*mu */
         const __m128 r0{_mm_sub_ps(val2, val1)};
@@ -98,3 +113,100 @@ void Resample_<LerpTag,SSE4Tag>(const InterpState*, const float *src, uint frac,
         } while(--todo);
     }
 }
+
+template<>
+void Resample_<CubicTag,SSE4Tag>(const InterpState *state, const float *src, uint frac,
+    const uint increment, const al::span<float> dst)
+{
+    ASSUME(frac < MixerFracOne);
+
+    const auto *filter = al::assume_aligned<16>(std::get<CubicState>(*state).filter);
+
+    const __m128i increment4{_mm_set1_epi32(static_cast<int>(increment*4))};
+    const __m128i fracMask4{_mm_set1_epi32(MixerFracMask)};
+    const __m128 fracDiffOne4{_mm_set1_ps(1.0f/CubicPhaseDiffOne)};
+    const __m128i fracDiffMask4{_mm_set1_epi32(CubicPhaseDiffMask)};
+
+    alignas(16) std::array<uint,4> pos_, frac_;
+    InitPosArrays(frac, increment, al::span{frac_}, al::span{pos_});
+    __m128i frac4{_mm_setr_epi32(static_cast<int>(frac_[0]), static_cast<int>(frac_[1]),
+        static_cast<int>(frac_[2]), static_cast<int>(frac_[3]))};
+    __m128i pos4{_mm_setr_epi32(static_cast<int>(pos_[0]), static_cast<int>(pos_[1]),
+        static_cast<int>(pos_[2]), static_cast<int>(pos_[3]))};
+
+    src -= 1;
+    auto dst_iter = dst.begin();
+    for(size_t todo{dst.size()>>2};todo;--todo)
+    {
+        const auto pos0 = static_cast<uint>(_mm_extract_epi32(pos4, 0));
+        const auto pos1 = static_cast<uint>(_mm_extract_epi32(pos4, 1));
+        const auto pos2 = static_cast<uint>(_mm_extract_epi32(pos4, 2));
+        const auto pos3 = static_cast<uint>(_mm_extract_epi32(pos4, 3));
+        const __m128 val0{_mm_loadu_ps(src+pos0)};
+        const __m128 val1{_mm_loadu_ps(src+pos1)};
+        const __m128 val2{_mm_loadu_ps(src+pos2)};
+        const __m128 val3{_mm_loadu_ps(src+pos3)};
+
+        const __m128i pi4{_mm_srli_epi32(frac4, CubicPhaseDiffBits)};
+        const auto pi0 = static_cast<uint>(_mm_extract_epi32(pi4, 0));
+        const auto pi1 = static_cast<uint>(_mm_extract_epi32(pi4, 1));
+        const auto pi2 = static_cast<uint>(_mm_extract_epi32(pi4, 2));
+        const auto pi3 = static_cast<uint>(_mm_extract_epi32(pi4, 3));
+
+        const __m128 pf4{_mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(frac4, fracDiffMask4)),
+            fracDiffOne4)};
+
+        __m128 r0{_mm_mul_ps(val0,
+            vmadd(_mm_load_ps(filter[pi0].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(0, 0, 0, 0)),
+                _mm_load_ps(filter[pi0].mDeltas.data())))};
+        __m128 r1{_mm_mul_ps(val1,
+            vmadd(_mm_load_ps(filter[pi1].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(1, 1, 1, 1)),
+                _mm_load_ps(filter[pi1].mDeltas.data())))};
+        __m128 r2{_mm_mul_ps(val2,
+            vmadd(_mm_load_ps(filter[pi2].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(2, 2, 2, 2)),
+                _mm_load_ps(filter[pi2].mDeltas.data())))};
+        __m128 r3{_mm_mul_ps(val3,
+            vmadd(_mm_load_ps(filter[pi3].mCoeffs.data()),
+                _mm_shuffle_ps(pf4, pf4, _MM_SHUFFLE(3, 3, 3, 3)),
+                _mm_load_ps(filter[pi3].mDeltas.data())))};
+
+        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+        r0 = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
+
+        _mm_store_ps(dst_iter, r0);
+        dst_iter += 4;
+
+        frac4 = _mm_add_epi32(frac4, increment4);
+        pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, MixerFracBits));
+        frac4 = _mm_and_si128(frac4, fracMask4);
+    }
+
+    if(const size_t todo{dst.size()&3})
+    {
+        src += static_cast<uint>(_mm_cvtsi128_si32(pos4));
+        frac = static_cast<uint>(_mm_cvtsi128_si32(frac4));
+
+        std::generate(dst.end()-todo, dst.end(), [&src,&frac,increment,filter]() -> float
+        {
+            const uint pi{frac >> CubicPhaseDiffBits};
+            const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+            const __m128 pf4{_mm_set1_ps(pf)};
+
+            const __m128 f4 = vmadd(_mm_load_ps(filter[pi].mCoeffs.data()), pf4,
+                _mm_load_ps(filter[pi].mDeltas.data()));
+            __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(src))};
+
+            r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+            r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+            const float output{_mm_cvtss_f32(r4)};
+
+            frac += increment;
+            src += frac>>MixerFracBits;
+            frac &= MixerFracMask;
+            return output;
+        });
+    }
+}
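The SSE4.1 version is otherwise identical to the SSE2 one; the main
difference is lane extraction, where _mm_extract_epi32 reads any lane in a
single instruction instead of SSE2's byte-shift plus low-lane move. A minimal
comparison (build with -msse4.1 or equivalent):

    #include <smmintrin.h> /* SSE4.1 */
    #include <cstdio>

    int main()
    {
        const __m128i v = _mm_setr_epi32(10, 20, 30, 40);
        /* SSE2: shift lane 2 down to lane 0 (8 bytes), then move it out. */
        const int a = _mm_cvtsi128_si32(_mm_srli_si128(v, 8));
        /* SSE4.1: read lane 2 directly. */
        const int b = _mm_extract_epi32(v, 2);
        std::printf("%d %d\n", a, b); /* prints "30 30" */
    }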