From e77c668946ed3d6662907895788ce13404021ea0 Mon Sep 17 00:00:00 2001 From: Clayton Smith Date: Thu, 9 Nov 2023 11:02:05 -0500 Subject: [PATCH] Use pointers to pass in s32fc arguments This avoids undefined behaviour arising from incompatibility between complex numbers in C and C++. Signed-off-by: Clayton Smith --- .github/workflows/run-tests.yml | 5 +- docs/kernels.dox | 3 + kernels/volk/volk_32fc_s32fc_multiply2_32fc.h | 413 +++++++++ kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 266 +----- .../volk_32fc_s32fc_rotator2puppet_32fc.h | 173 ++++ .../volk/volk_32fc_s32fc_rotatorpuppet_32fc.h | 169 ---- .../volk/volk_32fc_s32fc_x2_rotator2_32fc.h | 782 ++++++++++++++++++ .../volk/volk_32fc_s32fc_x2_rotator_32fc.h | 623 +------------- ...fc_x2_s32fc_multiply_conjugate_add2_32fc.h | 345 ++++++++ ...2fc_x2_s32fc_multiply_conjugate_add_32fc.h | 204 +---- lib/kernel_tests.h | 8 +- lib/qa_utils.cc | 6 +- lib/qa_utils.h | 6 +- tmpl/volk.tmpl.h | 8 +- 14 files changed, 1791 insertions(+), 1220 deletions(-) create mode 100644 kernels/volk/volk_32fc_s32fc_multiply2_32fc.h create mode 100644 kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h delete mode 100644 kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h create mode 100644 kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h create mode 100644 kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index f0581eeaf..0b79cb82b 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -114,7 +114,6 @@ jobs: submodules: 'recursive' - uses: uraimo/run-on-arch-action@v2.5.0 name: Build in non-x86 container - continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x"]'), matrix.arch) }} id: build with: arch: ${{ matrix.arch }} @@ -153,7 +152,9 @@ jobs: cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. echo "Build with $(nproc) thread(s)" make -j$(nproc) - ./cpu_features/list_cpu_features + if [ -f ./cpu_features/list_cpu_features ]; then + ./cpu_features/list_cpu_features + fi ./apps/volk-config-info --alignment ./apps/volk-config-info --avail-machines ./apps/volk-config-info --all-machines diff --git a/docs/kernels.dox b/docs/kernels.dox index e6930ad2f..f67e44440 100644 --- a/docs/kernels.dox +++ b/docs/kernels.dox @@ -50,7 +50,9 @@ \li \subpage volk_32f_cos_32f \li \subpage volk_32fc_s32f_atan2_32f \li \subpage volk_32fc_s32fc_multiply_32fc +\li \subpage volk_32fc_s32fc_multiply2_32fc \li \subpage volk_32fc_s32fc_x2_rotator_32fc +\li \subpage volk_32fc_s32fc_x2_rotator2_32fc \li \subpage volk_32fc_s32f_deinterleave_real_16i \li \subpage volk_32fc_s32f_magnitude_16i \li \subpage volk_32fc_s32f_power_32fc @@ -63,6 +65,7 @@ \li \subpage volk_32fc_x2_multiply_32fc \li \subpage volk_32fc_x2_multiply_conjugate_32fc \li \subpage volk_32fc_x2_s32fc_multiply_conjugate_add_32fc +\li \subpage volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc \li \subpage volk_32fc_x2_s32f_square_dist_scalar_mult_32f \li \subpage volk_32fc_x2_square_dist_32f \li \subpage volk_32f_exp_32f diff --git a/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h new file mode 100644 index 000000000..c1704041f --- /dev/null +++ b/kernels/volk/volk_32fc_s32fc_multiply2_32fc.h @@ -0,0 +1,413 @@ +/* -*- c++ -*- */ +/* + * Copyright 2012, 2014 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! 
+ * \page volk_32fc_s32fc_multiply2_32fc + * + * \b Overview + * + * Multiplies the input complex vector by a complex scalar and returns + * the results. + * + * Dispatcher Prototype + * \code + * void volk_32fc_s32fc_multiply2_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const + * lv_32fc_t* scalar, unsigned int num_points); \endcode + * + * \b Inputs + * \li aVector: The input vector to be multiplied. + * \li scalar: The complex scalar to multiply against aVector. + * \li num_points: The number of complex values in aVector. + * + * \b Outputs + * \li cVector: The vector where the results will be stored. + * + * \b Example + * Generate points around the unit circle and shift the phase pi/3 rad. + * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t scalar = lv_cmake((float)std::cos(M_PI/3.f), (float)std::sin(M_PI/3.f)); + * + * float delta = 2.f*M_PI / (float)N; + * for(unsigned int ii = 0; ii < N/2; ++ii){ + * // Generate points around the unit circle + * float real = std::cos(delta * (float)ii); + * float imag = std::sin(delta * (float)ii); + * in[ii] = lv_cmake(real, imag); + * in[ii+N/2] = lv_cmake(-real, -imag); + * } + * + * volk_32fc_s32fc_multiply2_32fc(out, in, &scalar, N); + * + * printf(" mag phase | mag phase\n"); + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("%+1.2f %+1.2f | %+1.2f %+1.2f\n", + * std::abs(in[ii]), std::arg(in[ii]), + * std::abs(out[ii]), std::arg(out[ii])); + * } + * + * volk_free(in); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H +#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_u_H + +#include +#include +#include +#include + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_u_avx_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = _mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = x; + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_fmaddsub_ps( + tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX && LV_HAVE_FMA */ + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = 
_mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(*scalar)); + yh = _mm_set_ps1(lv_cimag(*scalar)); + + for (; number < halfPoints; number++) { + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c, z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = (*a) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32fc_s32fc_multiply2_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8) { + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + *cPtr++ = (*aPtr++) * (*scalar); + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * (*scalar); +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_32fc_x2_multiply2_32fc_u_H */ +#ifndef INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_multiply2_32fc_a_H + +#include +#include +#include +#include + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_a_avx_fma(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = _mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = x; + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // 
tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_fmaddsub_ps( + tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_store_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX && LV_HAVE_FMA */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + __m256 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm256_set1_ps(lv_creal(*scalar)); + yh = _mm256_set1_ps(lv_cimag(*scalar)); + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm256_store_ps((float*)c, z); // Store the results back into the C container + + a += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(*scalar)); + yh = _mm_set_ps1(lv_cimag(*scalar)); + + for (; number < halfPoints; number++) { + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1, + tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c, z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = (*a) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_NEON +#include + +static inline void volk_32fc_s32fc_multiply2_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t a_val, scalar_val; + float32x4x2_t tmp_imag; + + scalar_val.val[0] = vld1q_dup_f32((const float*)scalar); + scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1); + for (number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)aPtr); + tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); + tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); + + tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]); + tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], 
a_val.val[1], scalar_val.val[1]); + + vst2q_f32((float*)cPtr, tmp_imag); + aPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = *aPtr++ * (*scalar); + } +} +#endif /* LV_HAVE_NEON */ + +#endif /* INCLUDED_volk_32fc_x2_multiply2_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h index 1593b7cbe..cd5d1e97e 100644 --- a/kernels/volk/volk_32fc_s32fc_multiply_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_multiply_32fc.h @@ -10,6 +10,12 @@ /*! * \page volk_32fc_s32fc_multiply_32fc * + * \b Deprecation + * + * This kernel is deprecated, because passing in `lv_32fc_t` by value results in + * Undefined Behaviour, causing a segmentation fault on some architectures. + * Use `volk_32fc_s32fc_multiply2_32fc` instead. + * * \b Overview * * Multiplies the input complex vector by a complex scalar and returns @@ -66,137 +72,39 @@ #include #include #include +#include #include #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = x; - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_fmaddsub_ps( - tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_storeu_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_u_avx_fma(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ #ifdef LV_HAVE_AVX -#include static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_storeu_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_u_avx(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include static inline void 
volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for (; number < halfPoints; number++) { - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c, z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = (*a) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_u_sse3(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ @@ -207,26 +115,7 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8) { - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; + volk_32fc_s32fc_multiply2_32fc_generic(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -241,172 +130,47 @@ static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, #include #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = x; - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_fmaddsub_ps( - tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_store_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_a_avx_fma(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA */ #ifdef LV_HAVE_AVX -#include static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - __m256 x, 
yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm256_set1_ps(lv_creal(scalar)); - yh = _mm256_set1_ps(lv_cimag(scalar)); - - for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm256_store_ps((float*)c, z); // Store the results back into the C container - - a += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_a_avx(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for (; number < halfPoints; number++) { - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1, - tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c, z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = (*a) * scalar; - } + volk_32fc_s32fc_multiply2_32fc_a_sse3(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_NEON -#include static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points) { - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - unsigned int quarter_points = num_points / 4; - - float32x4x2_t a_val, scalar_val; - float32x4x2_t tmp_imag; - - scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); - scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); - for (number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)aPtr); - tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]); - tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]); - - tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]); - tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]); - - vst2q_f32((float*)cPtr, tmp_imag); - aPtr += 4; - cPtr += 4; - } - - for (number = quarter_points * 4; number < num_points; number++) { - *cPtr++ = *aPtr++ * scalar; - } + volk_32fc_s32fc_multiply2_32fc_neon(cVector, aVector, &scalar, num_points); } #endif /* LV_HAVE_NEON */ diff --git a/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h new file mode 100644 index 000000000..3ce071ca4 --- /dev/null +++ b/kernels/volk/volk_32fc_s32fc_rotator2puppet_32fc.h @@ -0,0 +1,173 @@ +/* -*- c++ -*- */ +/* + * Copyright 2012, 
2013, 2014 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + + +#ifndef INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H + + +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_generic(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_generic( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_NEON +#include +#include + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_neon(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_neon( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_NEON */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_a_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_u_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_a_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_a_avx( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX +#include + +static inline void volk_32fc_s32fc_rotator2puppet_32fc_u_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_u_avx( + outVector, inVector, &phase_inc_n, 
phase, num_points); +} + +#endif /* LV_HAVE_AVX */ + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_a_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ + + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void +volk_32fc_s32fc_rotator2puppet_32fc_u_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + unsigned int num_points) +{ + lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + const lv_32fc_t phase_inc_n = + *phase_inc / hypotf(lv_creal(*phase_inc), lv_cimag(*phase_inc)); + volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma( + outVector, inVector, &phase_inc_n, phase, num_points); +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ + +#endif /* INCLUDED_volk_32fc_s32fc_rotator2puppet_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h b/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h deleted file mode 100644 index e328a3112..000000000 --- a/kernels/volk/volk_32fc_s32fc_rotatorpuppet_32fc.h +++ /dev/null @@ -1,169 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. - * - * This file is part of VOLK - * - * SPDX-License-Identifier: LGPL-3.0-or-later - */ - - -#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H -#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H - - -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_generic( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_GENERIC */ - - -#ifdef LV_HAVE_NEON -#include -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_neon( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_NEON */ - - -#ifdef LV_HAVE_SSE4_1 -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_SSE4_1 -#include -static 
inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_SSE4_1 */ - - -#ifdef LV_HAVE_AVX -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_avx( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX */ - - -#ifdef LV_HAVE_AVX -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_avx( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX */ - -#if LV_HAVE_AVX && LV_HAVE_FMA -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ - - -#if LV_HAVE_AVX && LV_HAVE_FMA -#include - -static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector, - const lv_32fc_t* inVector, - const lv_32fc_t phase_inc, - unsigned int num_points) -{ - lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) }; - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - const lv_32fc_t phase_inc_n = - phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc)); - volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma( - outVector, inVector, phase_inc_n, phase, num_points); -} - -#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ - -#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */ diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h new file mode 100644 index 000000000..bee1f0682 --- /dev/null +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator2_32fc.h @@ -0,0 +1,782 @@ +/* -*- c++ -*- */ +/* + * Copyright 2012, 2013, 2014 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32fc_s32fc_x2_rotator2_32fc + * + * \b Overview + * + * Rotate input vector at fixed rate per sample from initial phase + * offset. 
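+ * The phase argument is both an input and an output: rotation starts from
+ * *phase and the final accumulated phase is written back through the pointer,
+ * with its magnitude renormalized periodically so rounding error cannot build up.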
+ * + * Dispatcher Prototype + * \code + * void volk_32fc_s32fc_x2_rotator2_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector, + * const lv_32fc_t* phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode + * + * \b Inputs + * \li inVector: Vector to be rotated. + * \li phase_inc: rotational velocity (scalar, input). + * \li phase: phase offset (scalar, input & output). + * \li num_points: The number of values in inVector to be rotated and stored into + * outVector. + * + * \b Outputs + * \li outVector: The vector where the results will be stored. + * + * \b Example + * Generate a tone at f=0.3 (normalized frequency) and use the rotator with + * f=0.1 to shift the tone to f=0.4. Change this example to start with a DC + * tone (initialize in with lv_cmake(1, 0)) to observe rotator signal generation. + * \code + * int N = 10; + * unsigned int alignment = volk_get_alignment(); + * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); + * + * for(unsigned int ii = 0; ii < N; ++ii){ + * // Generate a tone at f=0.3 + * float real = std::cos(0.3f * (float)ii); + * float imag = std::sin(0.3f * (float)ii); + * in[ii] = lv_cmake(real, imag); + * } + * // The oscillator rotates at f=0.1 + * float frequency = 0.1f; + * lv_32fc_t phase_increment = lv_cmake(std::cos(frequency), std::sin(frequency)); + * lv_32fc_t phase= lv_cmake(1.f, 0.0f); // start at 1 (0 rad phase) + * + * // rotate so the output is a tone at f=0.4 + * volk_32fc_s32fc_x2_rotator2_32fc(out, in, &phase_increment, &phase, N); + * + * // print results for inspection + * for(unsigned int ii = 0; ii < N; ++ii){ + * printf("out[%u] = %+1.2f %+1.2fj\n", + * ii, lv_creal(out[ii]), lv_cimag(out[ii])); + * } + * + * volk_free(in); + * volk_free(out); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H +#define INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H + + +#include +#include +#include +#include +#define ROTATOR_RELOAD 512 +#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2) +#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4) + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_generic(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + unsigned int i = 0; + int j = 0; + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { + for (j = 0; j < ROTATOR_RELOAD; ++j) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= *phase_inc; + } + + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + } + for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { + *outVector++ = *inVector++ * (*phase); + (*phase) *= *phase_inc; + } + if (i) { + // Make sure, we normalize phase on every call! 
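+        // Rounding error from the repeated complex multiplies makes |*phase|
+        // drift away from 1.0, so pull it back onto the unit circle here as well.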
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); + } +} + +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_NEON +#include +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_neon(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) + +{ + lv_32fc_t* outputVectorPtr = outVector; + const lv_32fc_t* inputVectorPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) }; + float32x4x2_t input_vec; + float32x4x2_t output_vec; + + unsigned int i = 0, j = 0; + // const unsigned int quarter_points = num_points / 4; + + for (i = 0; i < 4; ++i) { + phasePtr[i] *= incr; + incr *= (*phase_inc); + } + + // Notice that incr has be incremented in the previous loop + const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr }; + const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr); + float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; j++) { + input_vec = vld2q_f32((float*)inputVectorPtr); + // Prefetch next one, speeds things up + __VOLK_PREFETCH(inputVectorPtr + 4); + // Rotate + output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); + // Increase phase + phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); + // Store output + vst2q_f32((float*)outputVectorPtr, output_vec); + + outputVectorPtr += 4; + inputVectorPtr += 4; + } + // normalize phase so magnitude doesn't grow because of + // floating point rounding error + const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); + const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); + // Multiply complex with real + phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); + phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); + } + + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) { + input_vec = vld2q_f32((float*)inputVectorPtr); + // Prefetch next one, speeds things up + __VOLK_PREFETCH(inputVectorPtr + 4); + // Rotate + output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); + // Increase phase + phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); + // Store output + vst2q_f32((float*)outputVectorPtr, output_vec); + + outputVectorPtr += 4; + inputVectorPtr += 4; + } + // if(i) == true means we looped above + if (i) { + // normalize phase so magnitude doesn't grow because of + // floating point rounding error + const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); + const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); + // Multiply complex with real + phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); + phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); + } + // Store current phase + vst2q_f32((float*)phasePtr, phase_vec); + + // Deal with the rest + for (i = 0; i < num_points % 4; i++) { + *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0]; + phasePtr[0] *= (*phase_inc); + } + + // For continuous phase next time we need to call this function + (*phase) = phasePtr[0]; +} + +#endif /* LV_HAVE_NEON */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 2; ++i) { + 
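+        // Seed the two SSE lanes with phase and phase*phase_inc; after this loop
+        // incr holds phase_inc squared, the step applied per pair of samples.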
phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm_loadu_ps((float*)phase_Ptr); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_2; ++j) { + + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { + aVal = _mm_load_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_store_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + if (i) { + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + + _mm_storeu_ps((float*)phase_Ptr, phase_Val); + if (num_points & 1) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_SSE4_1 for aligned */ + + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 2; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); + printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); + printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ + __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm_loadu_ps((float*)phase_Ptr); + inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_2; ++j) { + + aVal = _mm_loadu_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, 
phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_storeu_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { + aVal = _mm_loadu_ps((float*)aPtr); + + yl = _mm_moveldup_ps(phase_Val); + yh = _mm_movehdup_ps(phase_Val); + ylp = _mm_moveldup_ps(inc_Val); + yhp = _mm_movehdup_ps(inc_Val); + + tmp1 = _mm_mul_ps(aVal, yl); + + tmp1p = _mm_mul_ps(phase_Val, ylp); + + aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm_mul_ps(aVal, yh); + tmp2p = _mm_mul_ps(phase_Val, yhp); + + z = _mm_addsub_ps(tmp1, tmp2); + phase_Val = _mm_addsub_ps(tmp1p, tmp2p); + + _mm_storeu_ps((float*)cPtr, z); + + aPtr += 2; + cPtr += 2; + } + if (i) { + tmp1 = _mm_mul_ps(phase_Val, phase_Val); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + phase_Val = _mm_div_ps(phase_Val, tmp2); + } + + _mm_storeu_ps((float*)phase_Ptr, phase_Val); + if (num_points & 1) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = lv_cmake(1.0f, 0.0f); + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, z; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + + const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_load_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + phase_Val = _mm256_normalize_ps(phase_Val); + } + + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_load_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + phase_Val = _mm256_normalize_ps(phase_Val); + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_generic( + cPtr, aPtr, phase_inc, phase, num_points % 4); +} + +#endif /* LV_HAVE_AVX for aligned */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = lv_cmake(1.0f, 0.0f); + lv_32fc_t 
phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, z; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + + const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_loadu_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + phase_Val = _mm256_normalize_ps(phase_Val); + } + + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_loadu_ps((float*)aPtr); + + z = _mm256_complexmul_ps(aVal, phase_Val); + phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + phase_Val = _mm256_normalize_ps(phase_Val); + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_generic( + cPtr, aPtr, phase_inc, phase, num_points % 4); +} + +#endif /* LV_HAVE_AVX */ + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm256_load_ps((float*)phase_Ptr); + inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_load_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = 
_mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_store_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + + _mm256_store_ps((float*)phase_Ptr, phase_Val); + for (i = 0; i < num_points % 4; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/ + +#if LV_HAVE_AVX && LV_HAVE_FMA +#include + +static inline void volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma(lv_32fc_t* outVector, + const lv_32fc_t* inVector, + const lv_32fc_t* phase_inc, + lv_32fc_t* phase, + unsigned int num_points) +{ + lv_32fc_t* cPtr = outVector; + const lv_32fc_t* aPtr = inVector; + lv_32fc_t incr = 1; + lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; + + unsigned int i, j = 0; + + for (i = 0; i < 4; ++i) { + phase_Ptr[i] *= incr; + incr *= (*phase_inc); + } + + __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; + + phase_Val = _mm256_loadu_ps((float*)phase_Ptr); + inc_Val = _mm256_set_ps(lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr), + lv_cimag(incr), + lv_creal(incr)); + + for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { + for (j = 0; j < ROTATOR_RELOAD_4; ++j) { + + aVal = _mm256_loadu_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { + aVal = _mm256_loadu_ps((float*)aPtr); + + yl = _mm256_moveldup_ps(phase_Val); + yh = _mm256_movehdup_ps(phase_Val); + ylp = _mm256_moveldup_ps(inc_Val); + yhp = _mm256_movehdup_ps(inc_Val); + + tmp1 = aVal; + tmp1p = phase_Val; + + aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); + phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); + tmp2 = _mm256_mul_ps(aVal, yh); + tmp2p = _mm256_mul_ps(phase_Val, yhp); + + z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); + phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); + + _mm256_storeu_ps((float*)cPtr, z); + + aPtr += 4; + cPtr += 4; + } + if (i) { + tmp1 = _mm256_mul_ps(phase_Val, phase_Val); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + phase_Val = _mm256_div_ps(phase_Val, tmp2); + } + + _mm256_storeu_ps((float*)phase_Ptr, phase_Val); + for (i = 0; i < num_points % 4; ++i) { + *cPtr++ = *aPtr++ * phase_Ptr[0]; + phase_Ptr[0] *= (*phase_inc); + } + + (*phase) = phase_Ptr[0]; +} + +#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ + +#endif /* INCLUDED_volk_32fc_s32fc_rotator2_32fc_a_H */ 
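As a caller-side sketch of the pointer-based interfaces added above (the helper name, scalar values, and buffer handling are illustrative assumptions, not taken from this patch; the call forms follow the dispatcher prototypes documented in the new headers), migration from the deprecated by-value kernels looks like this:

    #include <math.h>
    #include <volk/volk.h>

    /* Two independent illustrations: scale in[] by a complex constant, then
     * rotate in[] at a fixed rate. Both results land in out[]. */
    static void migration_sketch(lv_32fc_t* out, const lv_32fc_t* in, unsigned int N)
    {
        /* Deprecated form (scalar passed by value):
         *     volk_32fc_s32fc_multiply_32fc(out, in, scalar, N);
         * New form passes the scalar through a pointer, sidestepping the
         * C/C++ complex-argument ABI mismatch: */
        lv_32fc_t scalar = lv_cmake(0.5f, -0.5f);
        volk_32fc_s32fc_multiply2_32fc(out, in, &scalar, N);

        /* Deprecated form (phase increment passed by value):
         *     volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, N);
         * New form also takes the increment through a pointer; the running
         * phase was already a pointer and is updated in place as before: */
        lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f));
        lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
        volk_32fc_s32fc_x2_rotator2_32fc(out, in, &phase_inc, &phase, N);
    }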
diff --git a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h index aebf46d53..c0def35dd 100644 --- a/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h +++ b/kernels/volk/volk_32fc_s32fc_x2_rotator_32fc.h @@ -10,6 +10,12 @@ /*! * \page volk_32fc_s32fc_x2_rotator_32fc * + * \b Deprecation + * + * This kernel is deprecated, because passing in `lv_32fc_t` by value results in + * Undefined Behaviour, causing a segmentation fault on some architectures. + * Use `volk_32fc_s32fc_x2_rotator2_32fc` instead. + * * \b Overview * * Rotate input vector at fixed rate per sample from initial phase @@ -72,10 +78,8 @@ #include #include #include +#include #include -#define ROTATOR_RELOAD 512 -#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2) -#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4) #ifdef LV_HAVE_GENERIC @@ -86,32 +90,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector, lv_32fc_t* phase, unsigned int num_points) { - unsigned int i = 0; - int j = 0; - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { - for (j = 0; j < ROTATOR_RELOAD; ++j) { - *outVector++ = *inVector++ * (*phase); - (*phase) *= phase_inc; - } - - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - } - for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) { - *outVector++ = *inVector++ * (*phase); - (*phase) *= phase_inc; - } - if (i) { - // Make sure, we normalize phase on every call! - (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); - } + volk_32fc_s32fc_x2_rotator2_32fc_generic( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_NEON -#include -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -120,92 +106,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector, unsigned int num_points) { - lv_32fc_t* outputVectorPtr = outVector; - const lv_32fc_t* inputVectorPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) }; - float32x4x2_t input_vec; - float32x4x2_t output_vec; - - unsigned int i = 0, j = 0; - // const unsigned int quarter_points = num_points / 4; - - for (i = 0; i < 4; ++i) { - phasePtr[i] *= incr; - incr *= (phase_inc); - } - - // Notice that incr has be incremented in the previous loop - const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr }; - const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr); - float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; j++) { - input_vec = vld2q_f32((float*)inputVectorPtr); - // Prefetch next one, speeds things up - __VOLK_PREFETCH(inputVectorPtr + 4); - // Rotate - output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); - // Increase phase - phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); - // Store output - vst2q_f32((float*)outputVectorPtr, output_vec); - - outputVectorPtr += 4; - inputVectorPtr += 4; - } - // normalize phase so magnitude doesn't grow because of - // floating point rounding error - const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); - const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); - // Multiply complex with real - phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); - phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); - } - - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) { - input_vec = 
vld2q_f32((float*)inputVectorPtr); - // Prefetch next one, speeds things up - __VOLK_PREFETCH(inputVectorPtr + 4); - // Rotate - output_vec = _vmultiply_complexq_f32(input_vec, phase_vec); - // Increase phase - phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec); - // Store output - vst2q_f32((float*)outputVectorPtr, output_vec); - - outputVectorPtr += 4; - inputVectorPtr += 4; - } - // if(i) == true means we looped above - if (i) { - // normalize phase so magnitude doesn't grow because of - // floating point rounding error - const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec); - const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared); - // Multiply complex with real - phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag); - phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag); - } - // Store current phase - vst2q_f32((float*)phasePtr, phase_vec); - - // Deal with the rest - for (i = 0; i < num_points % 4; i++) { - *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0]; - phasePtr[0] *= (phase_inc); - } - - // For continuous phase next time we need to call this function - (*phase) = phasePtr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_neon( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_NEON */ #ifdef LV_HAVE_SSE4_1 -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -213,102 +121,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 2; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm_loadu_ps((float*)phase_Ptr); - inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_2; ++j) { - - aVal = _mm_load_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_store_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - tmp1 = _mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { - aVal = _mm_load_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_store_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - if (i) { - tmp1 = 
_mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - - _mm_storeu_ps((float*)phase_Ptr, phase_Val); - if (num_points & 1) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_a_sse4_1( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_SSE4_1 for aligned */ #ifdef LV_HAVE_SSE4_1 -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -316,106 +136,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 2; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0])); - printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1])); - printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/ - __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm_loadu_ps((float*)phase_Ptr); - inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_2; ++j) { - - aVal = _mm_loadu_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_storeu_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - tmp1 = _mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) { - aVal = _mm_loadu_ps((float*)aPtr); - - yl = _mm_moveldup_ps(phase_Val); - yh = _mm_movehdup_ps(phase_Val); - ylp = _mm_moveldup_ps(inc_Val); - yhp = _mm_movehdup_ps(inc_Val); - - tmp1 = _mm_mul_ps(aVal, yl); - - tmp1p = _mm_mul_ps(phase_Val, ylp); - - aVal = _mm_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm_mul_ps(aVal, yh); - tmp2p = _mm_mul_ps(phase_Val, yhp); - - z = _mm_addsub_ps(tmp1, tmp2); - phase_Val = _mm_addsub_ps(tmp1p, tmp2p); - - _mm_storeu_ps((float*)cPtr, z); - - aPtr += 2; - cPtr += 2; - } - if (i) { - tmp1 = _mm_mul_ps(phase_Val, phase_Val); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - phase_Val = _mm_div_ps(phase_Val, tmp2); - } - - _mm_storeu_ps((float*)phase_Ptr, phase_Val); - if (num_points & 1) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_u_sse4_1( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_AVX -#include -#include static inline void 
volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -423,73 +151,14 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector, lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = lv_cmake(1.0f, 0.0f); - lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, z; - - phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - - const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_load_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - phase_Val = _mm256_normalize_ps(phase_Val); - } - - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_load_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - phase_Val = _mm256_normalize_ps(phase_Val); - } - - _mm256_storeu_ps((float*)phase_Ptr, phase_Val); - (*phase) = phase_Ptr[0]; - volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); + volk_32fc_s32fc_x2_rotator2_32fc_a_avx( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX for aligned */ #ifdef LV_HAVE_AVX -#include -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -497,71 +166,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector, lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = lv_cmake(1.0f, 0.0f); - lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, z; - - phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - - const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_loadu_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - phase_Val = _mm256_normalize_ps(phase_Val); - } - - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_loadu_ps((float*)aPtr); - - z = _mm256_complexmul_ps(aVal, phase_Val); - phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - phase_Val = _mm256_normalize_ps(phase_Val); - } - - _mm256_storeu_ps((float*)phase_Ptr, phase_Val); - (*phase) = phase_Ptr[0]; - volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4); + 
volk_32fc_s32fc_x2_rotator2_32fc_u_avx( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX */ #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -569,108 +180,13 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVecto lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - __VOLK_ATTR_ALIGNED(32) - lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm256_load_ps((float*)phase_Ptr); - inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_load_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_load_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_store_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - - _mm256_store_ps((float*)phase_Ptr, phase_Val); - for (i = 0; i < num_points % 4; ++i) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_a_avx_fma( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/ #if LV_HAVE_AVX && LV_HAVE_FMA -#include static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector, const lv_32fc_t* inVector, @@ -678,101 +194,8 @@ static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVecto lv_32fc_t* phase, unsigned int num_points) { - lv_32fc_t* cPtr = outVector; - const lv_32fc_t* aPtr = inVector; - lv_32fc_t incr = 1; - lv_32fc_t phase_Ptr[4] = { 
(*phase), (*phase), (*phase), (*phase) }; - - unsigned int i, j = 0; - - for (i = 0; i < 4; ++i) { - phase_Ptr[i] *= incr; - incr *= (phase_inc); - } - - __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p; - - phase_Val = _mm256_loadu_ps((float*)phase_Ptr); - inc_Val = _mm256_set_ps(lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr), - lv_cimag(incr), - lv_creal(incr)); - - for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) { - for (j = 0; j < ROTATOR_RELOAD_4; ++j) { - - aVal = _mm256_loadu_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) { - aVal = _mm256_loadu_ps((float*)aPtr); - - yl = _mm256_moveldup_ps(phase_Val); - yh = _mm256_movehdup_ps(phase_Val); - ylp = _mm256_moveldup_ps(inc_Val); - yhp = _mm256_movehdup_ps(inc_Val); - - tmp1 = aVal; - tmp1p = phase_Val; - - aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1); - phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1); - tmp2 = _mm256_mul_ps(aVal, yh); - tmp2p = _mm256_mul_ps(phase_Val, yhp); - - z = _mm256_fmaddsub_ps(tmp1, yl, tmp2); - phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p); - - _mm256_storeu_ps((float*)cPtr, z); - - aPtr += 4; - cPtr += 4; - } - if (i) { - tmp1 = _mm256_mul_ps(phase_Val, phase_Val); - tmp2 = _mm256_hadd_ps(tmp1, tmp1); - tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm256_sqrt_ps(tmp1); - phase_Val = _mm256_div_ps(phase_Val, tmp2); - } - - _mm256_storeu_ps((float*)phase_Ptr, phase_Val); - for (i = 0; i < num_points % 4; ++i) { - *cPtr++ = *aPtr++ * phase_Ptr[0]; - phase_Ptr[0] *= (phase_inc); - } - - (*phase) = phase_Ptr[0]; + volk_32fc_s32fc_x2_rotator2_32fc_u_avx_fma( + outVector, inVector, &phase_inc, phase, num_points); } #endif /* LV_HAVE_AVX && LV_HAVE_FMA*/ diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h new file mode 100644 index 000000000..b35bed5eb --- /dev/null +++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc.h @@ -0,0 +1,345 @@ +/* -*- c++ -*- */ +/* + * Copyright 2019 Free Software Foundation, Inc. + * + * This file is part of VOLK + * + * SPDX-License-Identifier: LGPL-3.0-or-later + */ + +/*! + * \page volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc + * + * \b Overview + * + * Conjugate the input complex vector, multiply them by a complex scalar, + * add the another input complex vector and returns the results. 
+ * + * c[i] = a[i] + conj(b[i]) * (*scalar) + * + * Dispatcher Prototype + * \code + * void volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc(lv_32fc_t* cVector, const + * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t* scalar, unsigned int + * num_points); \endcode + * + * \b Inputs + * \li aVector: The input vector to be added. + * \li bVector: The input vector to be conjugate and multiplied. + * \li scalar: The complex scalar to multiply against conjugated bVector. + * \li num_points: The number of complex values in aVector and bVector to be conjugate, + * multiplied and stored into cVector. + * + * \b Outputs + * \li cVector: The vector where the results will be stored. + * + * \b Example + * Calculate coefficients. + * + * \code + * int n_filter = 2 * N + 1; + * unsigned int alignment = volk_get_alignment(); + * + * // state is a queue of input IQ data. + * lv_32fc_t* state = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment); + * // weight and next one. + * lv_32fc_t* weight = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment); + * lv_32fc_t* next = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment); + * ... + * // push back input IQ data into state. + * foo_push_back_queue(state, input); + * + * // get filtered output. + * lv_32fc_t output = lv_cmake(0.f,0.f); + * for (int i = 0; i < n_filter; i++) { + * output += state[i] * weight[i]; + * } + * + * // update weight using output. + * float real = lv_creal(output) * (1.0 - std::norm(output)) * MU; + * lv_32fc_t factor = lv_cmake(real, 0.f); + * volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc( + * next, weight, state, &factor, n_filter); + * lv_32fc_t *tmp = next; + * next = weight; + * weight = tmp; + * weight[N + 1] = lv_cmake(lv_creal(weight[N + 1]), 0.f); + * ... 
+ * volk_free(state); + * volk_free(weight); + * volk_free(next); + * \endcode + */ + +#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H +#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr = bVector; + lv_32fc_t* cPtr = cVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8) { + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + number -= 8; + } + + // clean up any remaining + while (number-- > 0) { + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + + __m256 x, y, s, z; + lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm256_loadu_ps((float*)v_scalar); + + for (; number < quarterPoints; number++) { + x = _mm256_loadu_ps((float*)b); + y = _mm256_loadu_ps((float*)a); + z = _mm256_complexconjugatemul_ps(s, x); + z = _mm256_add_ps(y, z); + _mm256_storeu_ps((float*)c, z); + + a += 4; + b += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) + lv_conj(*b++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, s, z; + lv_32fc_t v_scalar[2] = { *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm_loadu_ps((float*)v_scalar); + + for (; number < halfPoints; number++) { + x = _mm_loadu_ps((float*)b); + y = _mm_loadu_ps((float*)a); + z = _mm_complexconjugatemul_ps(s, x); + z = _mm_add_ps(y, z); + _mm_storeu_ps((float*)c, z); + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = *a + lv_conj(*b) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + + +#ifdef LV_HAVE_AVX +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + unsigned int i = 0; + const unsigned 
int quarterPoints = num_points / 4; + unsigned int isodd = num_points & 3; + + __m256 x, y, s, z; + lv_32fc_t v_scalar[4] = { *scalar, *scalar, *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm256_loadu_ps((float*)v_scalar); + + for (; number < quarterPoints; number++) { + x = _mm256_load_ps((float*)b); + y = _mm256_load_ps((float*)a); + z = _mm256_complexconjugatemul_ps(s, x); + z = _mm256_add_ps(y, z); + _mm256_store_ps((float*)c, z); + + a += 4; + b += 4; + c += 4; + } + + for (i = num_points - isodd; i < num_points; i++) { + *c++ = (*a++) + lv_conj(*b++) * (*scalar); + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_SSE3 +#include +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, s, z; + lv_32fc_t v_scalar[2] = { *scalar, *scalar }; + + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + lv_32fc_t* c = cVector; + + // Set up constant scalar vector + s = _mm_loadu_ps((float*)v_scalar); + + for (; number < halfPoints; number++) { + x = _mm_load_ps((float*)b); + y = _mm_load_ps((float*)a); + z = _mm_complexconjugatemul_ps(s, x); + z = _mm_add_ps(y, z); + _mm_store_ps((float*)c, z); + + a += 2; + b += 2; + c += 2; + } + + if ((num_points % 2) != 0) { + *c = *a + lv_conj(*b) * (*scalar); + } +} +#endif /* LV_HAVE_SSE */ + + +#ifdef LV_HAVE_NEON +#include + +static inline void +volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon(lv_32fc_t* cVector, + const lv_32fc_t* aVector, + const lv_32fc_t* bVector, + const lv_32fc_t* scalar, + unsigned int num_points) +{ + const lv_32fc_t* bPtr = bVector; + const lv_32fc_t* aPtr = aVector; + lv_32fc_t* cPtr = cVector; + unsigned int number = num_points; + unsigned int quarter_points = num_points / 4; + + float32x4x2_t a_val, b_val, c_val, scalar_val; + float32x4x2_t tmp_val; + + scalar_val.val[0] = vld1q_dup_f32((const float*)scalar); + scalar_val.val[1] = vld1q_dup_f32(((const float*)scalar) + 1); + + for (number = 0; number < quarter_points; ++number) { + a_val = vld2q_f32((float*)aPtr); + b_val = vld2q_f32((float*)bPtr); + b_val.val[1] = vnegq_f32(b_val.val[1]); + __VOLK_PREFETCH(aPtr + 8); + __VOLK_PREFETCH(bPtr + 8); + + tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]); + tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]); + + tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]); + tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]); + + c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]); + c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]); + + vst2q_f32((float*)cPtr, c_val); + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + for (number = quarter_points * 4; number < num_points; number++) { + *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * (*scalar); + } +} +#endif /* LV_HAVE_NEON */ + +#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_H */ diff --git a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h index 85cdaf16a..78694044e 100644 --- a/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h +++ b/kernels/volk/volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h @@ -10,6 +10,12 @@ /*! 
* \page volk_32fc_x2_s32fc_multiply_conjugate_add_32fc * + * \b Deprecation + * + * This kernel is deprecated, because passing in `lv_32fc_t` by value results in + * Undefined Behaviour, causing a segmentation fault on some architectures. + * Use `volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc` instead. + * * \b Overview * * Conjugate the input complex vector, multiply them by a complex scalar, @@ -76,6 +82,7 @@ #include #include #include +#include #include @@ -88,35 +95,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr = bVector; - lv_32fc_t* cPtr = cVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8) { - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) { - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_generic( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_AVX -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, @@ -125,43 +110,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - - __m256 x, y, s, z; - lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm256_loadu_ps((float*)v_scalar); - - for (; number < quarterPoints; number++) { - x = _mm256_loadu_ps((float*)b); - y = _mm256_loadu_ps((float*)a); - z = _mm256_complexconjugatemul_ps(s, x); - z = _mm256_add_ps(y, z); - _mm256_storeu_ps((float*)c, z); - - a += 4; - b += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) + lv_conj(*b++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_avx( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, @@ -170,41 +125,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, s, z; - lv_32fc_t v_scalar[2] = { scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm_loadu_ps((float*)v_scalar); - - for (; number < halfPoints; number++) { - x = _mm_loadu_ps((float*)b); - y = _mm_loadu_ps((float*)a); - z = _mm_complexconjugatemul_ps(s, x); - z = _mm_add_ps(y, z); - _mm_storeu_ps((float*)c, z); - - a += 2; - b += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = *a + lv_conj(*b) * scalar; - } + 
volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_u_sse3( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_AVX -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, @@ -213,43 +140,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - unsigned int i = 0; - const unsigned int quarterPoints = num_points / 4; - unsigned int isodd = num_points & 3; - - __m256 x, y, s, z; - lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm256_loadu_ps((float*)v_scalar); - - for (; number < quarterPoints; number++) { - x = _mm256_load_ps((float*)b); - y = _mm256_load_ps((float*)a); - z = _mm256_complexconjugatemul_ps(s, x); - z = _mm256_add_ps(y, z); - _mm256_store_ps((float*)c, z); - - a += 4; - b += 4; - c += 4; - } - - for (i = num_points - isodd; i < num_points; i++) { - *c++ = (*a++) + lv_conj(*b++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_avx( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_AVX */ #ifdef LV_HAVE_SSE3 -#include -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, @@ -258,40 +155,13 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, s, z; - lv_32fc_t v_scalar[2] = { scalar, scalar }; - - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - lv_32fc_t* c = cVector; - - // Set up constant scalar vector - s = _mm_loadu_ps((float*)v_scalar); - - for (; number < halfPoints; number++) { - x = _mm_load_ps((float*)b); - y = _mm_load_ps((float*)a); - z = _mm_complexconjugatemul_ps(s, x); - z = _mm_add_ps(y, z); - _mm_store_ps((float*)c, z); - - a += 2; - b += 2; - c += 2; - } - - if ((num_points % 2) != 0) { - *c = *a + lv_conj(*b) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_a_sse3( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_SSE */ #ifdef LV_HAVE_NEON -#include static inline void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, @@ -300,44 +170,8 @@ volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t scalar, unsigned int num_points) { - const lv_32fc_t* bPtr = bVector; - const lv_32fc_t* aPtr = aVector; - lv_32fc_t* cPtr = cVector; - unsigned int number = num_points; - unsigned int quarter_points = num_points / 4; - - float32x4x2_t a_val, b_val, c_val, scalar_val; - float32x4x2_t tmp_val; - - scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar); - scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1); - - for (number = 0; number < quarter_points; ++number) { - a_val = vld2q_f32((float*)aPtr); - b_val = vld2q_f32((float*)bPtr); - b_val.val[1] = vnegq_f32(b_val.val[1]); - __VOLK_PREFETCH(aPtr + 8); - __VOLK_PREFETCH(bPtr + 8); - - tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]); - tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]); - - tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]); - tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]); - - c_val.val[1] = vaddq_f32(a_val.val[1], 
tmp_val.val[1]); - c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]); - - vst2q_f32((float*)cPtr, c_val); - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - for (number = quarter_points * 4; number < num_points; number++) { - *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar; - } + volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc_neon( + cVector, aVector, bVector, &scalar, num_points); } #endif /* LV_HAVE_NEON */ diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h index dd430762c..0e7db1439 100644 --- a/lib/kernel_tests.h +++ b/lib/kernel_tests.h @@ -55,8 +55,8 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params)) QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params)) QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params)) - QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc, - volk_32fc_s32fc_x2_rotator_32fc, + QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotator2puppet_32fc, + volk_32fc_s32fc_x2_rotator2_32fc, test_params_rotator)) QA(VOLK_INIT_PUPP( volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0))) @@ -159,14 +159,14 @@ std::vector init_test_list(volk_test_params_t test_params) QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params)) QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params)) QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params)) - QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply2_32fc, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params)) QA(VOLK_INIT_TEST(volk_32f_s32f_add_32f, test_params)) QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params)) QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params)) QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params)) QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc)) - QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add2_32fc, test_params)) QA(VOLK_INIT_TEST(volk_32f_exp_32f, test_params)) QA(VOLK_INIT_PUPP( volk_32f_s32f_mod_rangepuppet_32f, volk_32f_s32f_s32f_mod_range_32f, test_params)) diff --git a/lib/qa_utils.cc b/lib/qa_utils.cc index 4be7b8ad9..a94d895c4 100644 --- a/lib/qa_utils.cc +++ b/lib/qa_utils.cc @@ -355,7 +355,7 @@ inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func, std::string arch) { while (iter--) - func(buffs[0], scalar, vlen, arch.c_str()); + func(buffs[0], &scalar, vlen, arch.c_str()); } inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, @@ -366,7 +366,7 @@ inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func, std::string arch) { while (iter--) - func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + func(buffs[0], buffs[1], &scalar, vlen, arch.c_str()); } inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, @@ -377,7 +377,7 @@ inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func, std::string arch) { while (iter--) - func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + func(buffs[0], buffs[1], buffs[2], &scalar, vlen, arch.c_str()); } template diff --git a/lib/qa_utils.h b/lib/qa_utils.h index 40f549cfd..a65677203 100644 --- a/lib/qa_utils.h +++ b/lib/qa_utils.h @@ -199,11 +199,11 @@ typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char* typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*); typedef void (*volk_fn_1arg_s32fc)( void*, - lv_32fc_t, + lv_32fc_t*, 
unsigned int, const char*); // one input vector, one scalar float input -typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t*, unsigned int, const char*); typedef void (*volk_fn_3arg_s32fc)( - void*, void*, void*, lv_32fc_t, unsigned int, const char*); + void*, void*, void*, lv_32fc_t*, unsigned int, const char*); #endif // VOLK_QA_UTILS_H diff --git a/tmpl/volk.tmpl.h b/tmpl/volk.tmpl.h index ad5595b24..b26c542cf 100644 --- a/tmpl/volk.tmpl.h +++ b/tmpl/volk.tmpl.h @@ -63,9 +63,11 @@ VOLK_API bool volk_is_aligned(const void *ptr); // Just drop the deprecated attribute in case we are on Windows. Clang and GCC support `__attribute__`. // We just assume the compiler and the system are tight together as far as Mako templates are concerned. <% -deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8', - 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i', - 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i') +deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8', + 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i', + 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i', + 'volk_32fc_s32fc_multiply_32fc', 'volk_32fc_s32fc_x2_rotator_32fc', + 'volk_32fc_x2_s32fc_multiply_conjugate_add_32fc') from platform import system if system() == 'Windows': deprecated_kernels = ()
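Migration note (editor's sketch, not part of the patch): with the template change above, the three by-value kernels now carry the deprecation attribute, and callers are expected to move to the pointer-taking "2" variants. The sketch below shows the change for the scalar-multiply kernel; the function name, buffer names, and scalar value are illustrative, and the new prototype is assumed to follow the same pattern as the protokernels added earlier in this patch.

    #include <volk/volk.h>

    static void scale_block(lv_32fc_t* out, const lv_32fc_t* in, unsigned int num_points)
    {
        const lv_32fc_t scalar = lv_cmake(0.5f, -0.5f);

        /* Deprecated: complex scalar passed by value (UB across the C/C++ boundary).
         *   volk_32fc_s32fc_multiply_32fc(out, in, scalar, num_points);
         */

        /* Replacement: the scalar travels by const pointer. */
        volk_32fc_s32fc_multiply2_32fc(out, in, &scalar, num_points);
    }
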