From d21287e48d69315ffe8b464e397d1c5c0915ae92 Mon Sep 17 00:00:00 2001
From: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
Date: Fri, 17 Jun 2022 12:45:28 +0200
Subject: [PATCH] Replace Vc by xsimd

Fixes: #520
---
 .github/workflows/ci.yaml              |  12 +-
 docs/pages/install.rst                 |   2 +-
 examples/alpaka/daxpy/CMakeLists.txt   |   1 -
 examples/alpaka/nbody/CMakeLists.txt   |   8 +-
 examples/alpaka/nbody/nbody.cpp        |  92 +++---
 examples/heatequation/CMakeLists.txt   |   8 +-
 examples/heatequation/heatequation.cpp |  37 +--
 examples/nbody/CMakeLists.txt          |   8 +-
 examples/nbody/nbody.cpp               | 370 +++++++++++++------------
 9 files changed, 292 insertions(+), 246 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 7c35902b1e..0a6e005c3c 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -13,7 +13,7 @@ env:
   THREADS: 4
   CONFIG: RelWithDebInfo
   ALPAKA_BRANCH: 0.9.0
-  VCPKG_INSTALL_LIST: "fmt vc tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container"
+  VCPKG_INSTALL: "vcpkg install fmt tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container; vcpkg install --head xsimd"
 
 jobs:
   clang-format:
@@ -40,7 +40,7 @@ jobs:
         sudo apt install clang-14 libomp-14-dev clang-tidy-14
     - name: vcpkg install dependencies
       run: |
-        vcpkg install $VCPKG_INSTALL_LIST
+        eval $VCPKG_INSTALL
     - name: install alpaka
       run: |
         git clone https://github.com/alpaka-group/alpaka.git
@@ -76,7 +76,7 @@ jobs:
         sudo apt install lcov
     - name: vcpkg install dependencies
       run: |
-        vcpkg install $VCPKG_INSTALL_LIST
+        eval $VCPKG_INSTALL
     - name: cmake
       run: |
         mkdir build
@@ -204,7 +204,7 @@ jobs:
         run: |
           # vcpkg fails to build with Intel compilers
           if [ ${{ matrix.install_oneapi }} ]; then unset CXX; fi
-          vcpkg install $VCPKG_INSTALL_LIST
+          eval $VCPKG_INSTALL
       - name: download CUDA
         if: matrix.cuda_url
         run: |
@@ -270,7 +270,7 @@ jobs:
     - uses: actions/checkout@v2
     - name: vcpkg install dependencies
       run: |
-        vcpkg install $VCPKG_INSTALL_LIST
+        eval $VCPKG_INSTALL
     - name: install alpaka
       run: |
         git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git
@@ -311,7 +311,7 @@ jobs:
           brew install libomp
       - name: vcpkg install dependencies
         run: |
-          vcpkg install $VCPKG_INSTALL_LIST
+          eval $VCPKG_INSTALL
       - name: install alpaka
         run: |
           git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git
diff --git a/docs/pages/install.rst b/docs/pages/install.rst
index 707dcd34db..bb63a918d0 100644
--- a/docs/pages/install.rst
+++ b/docs/pages/install.rst
@@ -23,7 +23,7 @@ Dependencies
 - Boost 1.70.0 or higher
 - libfmt 6.2.1 or higher (optional) for building the tests and examples and LLAMA supporting to dump mappings as SVG/HTML
 - `Alpaka <https://github.com/alpaka-group/alpaka>`_ (optional) for building some examples
-- `Vc <https://github.com/VcDevel/Vc>`_ (optional) for building some examples
+- `xsimd <https://github.com/xtensor-stack/xsimd>`_ (optional) for building some examples
 
 
 Build tests and examples
diff --git a/examples/alpaka/daxpy/CMakeLists.txt b/examples/alpaka/daxpy/CMakeLists.txt
index 22ff2e2991..582c7f7025 100644
--- a/examples/alpaka/daxpy/CMakeLists.txt
+++ b/examples/alpaka/daxpy/CMakeLists.txt
@@ -1,7 +1,6 @@
 cmake_minimum_required (VERSION 3.18.3)
 project(llama-alpaka-daxpy CXX)
 
-#find_package(Vc QUIET)
 find_package(OpenMP REQUIRED)
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
diff --git a/examples/alpaka/nbody/CMakeLists.txt b/examples/alpaka/nbody/CMakeLists.txt
index a4f547bdd3..ab2651e7b6 100644
--- a/examples/alpaka/nbody/CMakeLists.txt
+++ b/examples/alpaka/nbody/CMakeLists.txt
@@ -1,14 +1,18 @@
 cmake_minimum_required (VERSION 3.18.3)
 project(llama-alpaka-nbody CXX)
 
-find_package(Vc REQUIRED)
+find_package(xsimd QUIET)
+if (NOT xsimd_FOUND)
+	message(WARNING "xsimd not found: alpaka n-body example is disabled")
+	return()
+endif()
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
 find_package(alpaka 0.9.0 REQUIRED)
 alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/Stopwatch.hpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
-target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka Vc::Vc)
+target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka xsimd)
 
 if (MSVC)
 	target_compile_options(${PROJECT_NAME} PRIVATE /arch:AVX2 /fp:fast)
diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
index 80e0f88fd8..ee284c4f2e 100644
--- a/examples/alpaka/nbody/nbody.cpp
+++ b/examples/alpaka/nbody/nbody.cpp
@@ -10,6 +10,11 @@
 #include <string>
 #include <utility>
 
+#if __has_include(<xsimd/xsimd.hpp>)
+#    include <xsimd/xsimd.hpp>
+#    define HAVE_XSIMD
+#endif
+
 using FP = float;
 
 constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
@@ -19,13 +24,11 @@ constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate
 
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
 #    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-#        error Cannot enable CUDA together with other backends, because nvcc cannot parse the Vc header, sorry :/
+#        error Cannot enable CUDA together with other backends
 #    endif
-// nvcc fails to compile Vc headers even if nothing is used from there, so we need to conditionally include it
-#    include <Vc/Vc>
-constexpr auto DESIRED_ELEMENTS_PER_THREAD = Vc::float_v::size();
+constexpr auto DESIRED_ELEMENTS_PER_THREAD = xsimd::batch<float>::size;
 constexpr auto THREADS_PER_BLOCK = 1;
-constexpr auto AOSOA_LANES = Vc::float_v::size(); // vectors
+constexpr auto AOSOA_LANES = xsimd::batch<float>::size; // vectors
 #elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 constexpr auto DESIRED_ELEMENTS_PER_THREAD = 1;
 constexpr auto THREADS_PER_BLOCK = 256;
@@ -77,20 +80,20 @@ namespace stdext
     }
 } // namespace stdext
 
-// FIXME: this makes assumptions that there are always float_v::size() many values blocked in the LLAMA view
-template<typename Vec>
+// FIXME: this makes assumptions that there are always Simd::size many values blocked in the LLAMA view
+template<typename Simd>
 LLAMA_FN_HOST_ACC_INLINE auto load(const FP& src)
 {
-    if constexpr(std::is_same_v<Vec, FP>)
+    if constexpr(std::is_same_v<Simd, FP>)
         return src;
     else
-        return Vec(&src);
+        return Simd::load_unaligned(&src);
 }
 
-template<typename Vec>
+template<typename Simd>
 LLAMA_FN_HOST_ACC_INLINE auto broadcast(const FP& src)
 {
-    return Vec(src);
+    return Simd(src);
 }
 
 template<typename Vec>
@@ -99,19 +102,19 @@ LLAMA_FN_HOST_ACC_INLINE auto store(FP& dst, Vec v)
     if constexpr(std::is_same_v<Vec, FP>)
         dst = v;
     else
-        v.store(&dst);
+        v.store_unaligned(&dst);
 }
 
 template<int Elems>
-struct VecType
+struct SimdType
 {
     // TODO(bgruber): we need a vector type that also works on GPUs
-#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-    using type = Vc::SimdArray<FP, Elems>;
-#endif
+    using type = xsimd::make_sized_batch_t<FP, Elems>;
+    static_assert(!std::is_void_v<type>, "xsimd does not have a SIMD type for this element count");
 };
+
 template<>
-struct VecType<1>
+struct SimdType<1>
 {
     using type = FP;
 };
@@ -119,28 +122,26 @@ struct VecType<1>
 template<int Elems, typename ViewParticleI, typename ParticleRefJ>
 LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, ParticleRefJ pj)
 {
-    using Vec = typename VecType<Elems>::type;
+    using Simd = typename SimdType<Elems>::type;
 
     using std::sqrt;
     using stdext::rsqrt;
-#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-    using Vc::rsqrt;
-    using Vc::sqrt;
-#endif
-
-    const Vec xdistance = load<Vec>(pi(tag::Pos{}, tag::X{})) - broadcast<Vec>(pj(tag::Pos{}, tag::X{}));
-    const Vec ydistance = load<Vec>(pi(tag::Pos{}, tag::Y{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Y{}));
-    const Vec zdistance = load<Vec>(pi(tag::Pos{}, tag::Z{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Z{}));
-    const Vec xdistanceSqr = xdistance * xdistance;
-    const Vec ydistanceSqr = ydistance * ydistance;
-    const Vec zdistanceSqr = zdistance * zdistance;
-    const Vec distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
-    const Vec distSixth = distSqr * distSqr * distSqr;
-    const Vec invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
-    const Vec sts = broadcast<Vec>(pj(tag::Mass())) * invDistCube * TIMESTEP;
-    store<Vec>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::X{})));
-    store<Vec>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Y{})));
-    store<Vec>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Z{})));
+    using xsimd::rsqrt;
+    using xsimd::sqrt;
+
+    const Simd xdistance = load<Simd>(pi(tag::Pos{}, tag::X{})) - broadcast<Simd>(pj(tag::Pos{}, tag::X{}));
+    const Simd ydistance = load<Simd>(pi(tag::Pos{}, tag::Y{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Y{}));
+    const Simd zdistance = load<Simd>(pi(tag::Pos{}, tag::Z{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Z{}));
+    const Simd xdistanceSqr = xdistance * xdistance;
+    const Simd ydistanceSqr = ydistance * ydistance;
+    const Simd zdistanceSqr = zdistance * zdistance;
+    const Simd distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
+    const Simd distSixth = distSqr * distSqr * distSqr;
+    const Simd invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
+    const Simd sts = broadcast<Simd>(pj(tag::Mass())) * invDistCube * TIMESTEP;
+    store<Simd>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::X{})));
+    store<Simd>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Y{})));
+    store<Simd>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Z{})));
 }
 
 template<int ProblemSize, int Elems, int BlockSize, Mapping MappingSM>
@@ -218,16 +219,19 @@ struct MoveKernel
         const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
         const auto i = ti * Elems;
 
-        using Vec = typename VecType<Elems>::type;
-        store<Vec>(
+        using Simd = typename SimdType<Elems>::type;
+        store<Simd>(
             particles(i)(tag::Pos{}, tag::X{}),
-            load<Vec>(particles(i)(tag::Pos{}, tag::X{})) + load<Vec>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
-        store<Vec>(
+            load<Simd>(particles(i)(tag::Pos{}, tag::X{}))
+                + load<Simd>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
+        store<Simd>(
             particles(i)(tag::Pos{}, tag::Y{}),
-            load<Vec>(particles(i)(tag::Pos{}, tag::Y{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
-        store<Vec>(
+            load<Simd>(particles(i)(tag::Pos{}, tag::Y{}))
+                + load<Simd>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
+        store<Simd>(
             particles(i)(tag::Pos{}, tag::Z{}),
-            load<Vec>(particles(i)(tag::Pos{}, tag::Z{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
+            load<Simd>(particles(i)(tag::Pos{}, tag::Z{}))
+                + load<Simd>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
     }
 };
 
@@ -337,7 +341,7 @@ try
 
     std::ofstream plotFile{"nbody.sh"};
     plotFile.exceptions(std::ios::badbit | std::ios::failbit);
-    std::ofstream{"nbody.sh"} << R"(#!/usr/bin/gnuplot -p
+    plotFile << R"(#!/usr/bin/gnuplot -p
 set style data histograms
 set style fill solid
 set xtics rotate by 45 right
diff --git a/examples/heatequation/CMakeLists.txt b/examples/heatequation/CMakeLists.txt
index 453e44332a..815bcacc81 100644
--- a/examples/heatequation/CMakeLists.txt
+++ b/examples/heatequation/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required (VERSION 3.18.3)
 project(llama-heatequation CXX)
 
-find_package(Vc QUIET)
+find_package(xsimd QUIET)
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
@@ -9,8 +9,10 @@ endif()
 add_executable(${PROJECT_NAME} heatequation.cpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
-if (Vc_FOUND)
-	target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
+if (xsimd_FOUND)
+	target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
+else()
+	message(WARNING "xsimd not found: heatequation built without explicit SIMD support")
 endif()
 
 if (MSVC)
diff --git a/examples/heatequation/heatequation.cpp b/examples/heatequation/heatequation.cpp
index 352893f94c..7d776c78ad 100644
--- a/examples/heatequation/heatequation.cpp
+++ b/examples/heatequation/heatequation.cpp
@@ -21,6 +21,11 @@
 #include <llama/llama.hpp>
 #include <utility>
 
+#if __has_include(<xsimd/xsimd.hpp>)
+#    include <xsimd/xsimd.hpp>
+#    define HAVE_XSIMD
+#endif
+
 template<typename View>
 inline void kernel(uint32_t idx, const View& uCurr, View& uNext, double r)
 {
@@ -36,21 +41,21 @@ void update_scalar(const View& uCurr, View& uNext, uint32_t extent, double dx, d
             kernel(i, uCurr, uNext, r);
 }
 
-#if __has_include(<Vc/Vc>)
-#    include <Vc/Vc>
+#ifdef HAVE_XSIMD
 
 template<typename View>
 inline void kernel_vec(uint32_t baseIdx, const View& uCurr, View& uNext, double r)
 {
-    const auto next = Vc::double_v{&uCurr[baseIdx]} * (1.0 - 2.0 * r) + Vc::double_v{&uCurr[baseIdx - 1]} * r
-        + Vc::double_v{&uCurr[baseIdx + 1]} * r;
-    next.store(&uNext[baseIdx]);
+    using Simd = xsimd::batch<double>;
+    const auto next = Simd::load_unaligned(&uCurr[baseIdx]) * (1.0 - 2.0 * r)
+        + Simd::load_unaligned(&uCurr[baseIdx - 1]) * r + Simd::load_unaligned(&uCurr[baseIdx + 1]) * r;
+    next.store_unaligned(&uNext[baseIdx]);
 }
 
 template<typename View>
-void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+void update_SIMD(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
 {
-    constexpr auto L = Vc::double_v::size();
+    constexpr auto L = xsimd::batch<double>::size;
     const auto r = dt / (dx * dx);
 
     const auto blocks = (extent + L - 1) / L;
@@ -67,9 +72,9 @@ void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, doubl
 }
 
 template<typename View>
-void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+void update_SIMD_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
 {
-    constexpr auto L = Vc::double_v::size();
+    constexpr auto L = xsimd::batch<double>::size;
     const auto r = dt / (dx * dx);
 
     for(auto i = 1; i < L; i++)
@@ -84,9 +89,9 @@ void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx,
 }
 
 template<typename View>
-void update_Vc_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+void update_SIMD_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
 {
-    constexpr auto L = Vc::double_v::size();
+    constexpr auto L = xsimd::batch<double>::size;
     const auto r = dt / (dx * dx);
 
     kernel(1, uCurr, uNext, r);
@@ -168,11 +173,11 @@ try
             std::cout << "Incorrect! error = " << maxError << " (the grid resolution may be too low)\n";
     };
 
-    run("update_scalar         ", [](auto&... args) { update_scalar(args...); });
-#if __has_include(<Vc/Vc>)
-    run("update_Vc             ", [](auto&... args) { update_Vc(args...); });
-    run("update_Vc_peel        ", [](auto&... args) { update_Vc_peel(args...); });
-    run("update_Vc_peel_unal_st", [](auto&... args) { update_Vc_peel_unal_st(args...); });
+//    run("update_scalar           ", [](auto&... args) { update_scalar(args...); });
+#ifdef HAVE_XSIMD
+    run("update_SIMD             ", [](auto&... args) { update_SIMD(args...); });
+    run("update_SIMD_peel        ", [](auto&... args) { update_SIMD_peel(args...); });
+    run("update_SIMD_peel_unal_st", [](auto&... args) { update_SIMD_peel_unal_st(args...); });
 #endif
 
     return 0;
diff --git a/examples/nbody/CMakeLists.txt b/examples/nbody/CMakeLists.txt
index dbb8ba32f9..bdf32590bf 100644
--- a/examples/nbody/CMakeLists.txt
+++ b/examples/nbody/CMakeLists.txt
@@ -3,7 +3,7 @@ project(llama-nbody CXX)
 
 find_package(fmt CONFIG REQUIRED)
 find_package(OpenMP REQUIRED)
-find_package(Vc QUIET)
+find_package(xsimd QUIET)
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
@@ -11,8 +11,10 @@ endif()
 add_executable(${PROJECT_NAME} nbody.cpp ../common/Stopwatch.hpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt OpenMP::OpenMP_CXX)
-if (Vc_FOUND)
-	target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
+if (xsimd_FOUND)
+	target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
+else()
+	message(WARNING "xsimd not found: n-body built without explicit SIMD support")
 endif()
 
 if (MSVC)
diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp
index 9c97d03fe9..7235a1ea59 100644
--- a/examples/nbody/nbody.cpp
+++ b/examples/nbody/nbody.cpp
@@ -12,6 +12,11 @@
 #include <utility>
 #include <vector>
 
+#if __has_include(<xsimd/xsimd.hpp>)
+#    include <xsimd/xsimd.hpp>
+#    define HAVE_XSIMD
+#endif
+
 // needs -fno-math-errno, so std::sqrt() can be vectorized
 // for multithreading, specify thread affinity (GNU OpenMP):
 // e.g. for a 32 core CPU with SMT/hyperthreading: GOMP_CPU_AFFINITY='0-30:2,1-31:2' llama-nbody
@@ -219,13 +224,13 @@ namespace usellama
 
 namespace manualAoS
 {
-    struct Vec
+    struct Simd
     {
         FP x;
         FP y;
         FP z;
 
-        auto operator*=(FP s) -> Vec&
+        auto operator*=(FP s) -> Simd&
         {
             x *= s;
             y *= s;
@@ -233,7 +238,7 @@ namespace manualAoS
             return *this;
         }
 
-        auto operator*=(Vec v) -> Vec&
+        auto operator*=(Simd v) -> Simd&
         {
             x *= v.x;
             y *= v.y;
@@ -241,7 +246,7 @@ namespace manualAoS
             return *this;
         }
 
-        auto operator+=(Vec v) -> Vec&
+        auto operator+=(Simd v) -> Simd&
         {
             x += v.x;
             y += v.y;
@@ -249,7 +254,7 @@ namespace manualAoS
             return *this;
         }
 
-        auto operator-=(Vec v) -> Vec&
+        auto operator-=(Simd v) -> Simd&
         {
             x -= v.x;
             y -= v.y;
@@ -257,24 +262,24 @@ namespace manualAoS
             return *this;
         }
 
-        friend auto operator-(Vec a, Vec b) -> Vec
+        friend auto operator-(Simd a, Simd b) -> Simd
         {
             return a -= b;
         }
 
-        friend auto operator*(Vec a, FP s) -> Vec
+        friend auto operator*(Simd a, FP s) -> Simd
         {
             return a *= s;
         }
 
-        friend auto operator*(Vec a, Vec b) -> Vec
+        friend auto operator*(Simd a, Simd b) -> Simd
         {
             return a *= b;
         }
     };
 
-    using Pos = Vec;
-    using Vel = Vec;
+    using Pos = Simd;
+    using Vel = Simd;
 
     struct Particle
     {
@@ -646,7 +651,7 @@ namespace manualAoSoA
     }
 } // namespace manualAoSoA
 
-#ifdef __AVX2__
+#if defined(__AVX2__) && defined(__FMA__)
 #    include <immintrin.h>
 
 namespace manualAoSoA_manualAVX
@@ -862,74 +867,72 @@ namespace manualAoSoA_manualAVX
 } // namespace manualAoSoA_manualAVX
 #endif
 
-#if __has_include(<Vc/Vc>)
-#    include <Vc/Vc>
-
-namespace manualAoSoA_Vc
+#ifdef HAVE_XSIMD
+namespace manualAoSoA_SIMD
 {
-    template<typename Vec>
-    struct alignas(32) ParticleBlock
+    template<typename Simd>
+    struct alignas(64) ParticleBlock
     {
         struct
         {
-            Vec x;
-            Vec y;
-            Vec z;
+            Simd x;
+            Simd y;
+            Simd z;
         } pos, vel;
-        Vec mass;
+        Simd mass;
     };
 
 
-    template<typename Vec>
+    template<typename Simd>
     inline void pPInteraction(
-        Vec piposx,
-        Vec piposy,
-        Vec piposz,
-        Vec& pivelx,
-        Vec& pively,
-        Vec& pivelz,
-        Vec pjposx,
-        Vec pjposy,
-        Vec pjposz,
-        Vec pjmass)
+        Simd piposx,
+        Simd piposy,
+        Simd piposz,
+        Simd& pivelx,
+        Simd& pively,
+        Simd& pivelz,
+        Simd pjposx,
+        Simd pjposy,
+        Simd pjposz,
+        Simd pjmass)
     {
-        const Vec xdistance = piposx - pjposx;
-        const Vec ydistance = piposy - pjposy;
-        const Vec zdistance = piposz - pjposz;
-        const Vec xdistanceSqr = xdistance * xdistance;
-        const Vec ydistanceSqr = ydistance * ydistance;
-        const Vec zdistanceSqr = zdistance * zdistance;
-        const Vec distSqr = EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
-        const Vec distSixth = distSqr * distSqr * distSqr;
-        const Vec invDistCube = [&]
+        const Simd xdistance = piposx - pjposx;
+        const Simd ydistance = piposy - pjposy;
+        const Simd zdistance = piposz - pjposz;
+        const Simd xdistanceSqr = xdistance * xdistance;
+        const Simd ydistanceSqr = ydistance * ydistance;
+        const Simd zdistanceSqr = zdistance * zdistance;
+        const Simd distSqr = EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
+        const Simd distSixth = distSqr * distSqr * distSqr;
+        const Simd invDistCube = [&]
         {
             if constexpr(ALLOW_RSQRT)
             {
-                const Vec r = Vc::rsqrt(distSixth);
+                const Simd r = xsimd::rsqrt(distSixth);
                 if constexpr(NEWTON_RAPHSON_AFTER_RSQRT)
                 {
                     // from: http://stackoverflow.com/q/14752399/556899
-                    const Vec three = 3.0f;
-                    const Vec half = 0.5f;
-                    const Vec muls = distSixth * r * r;
+                    const Simd three = 3.0f;
+                    const Simd half = 0.5f;
+                    const Simd muls = distSixth * r * r;
                     return (half * r) * (three - muls);
                 }
                 else
                     return r;
             }
             else
-                return 1.0f / Vc::sqrt(distSixth);
+                return 1.0f / xsimd::sqrt(distSixth);
         }();
-        const Vec sts = pjmass * invDistCube * TIMESTEP;
+        const Simd sts = pjmass * invDistCube * TIMESTEP;
         pivelx = xdistanceSqr * sts + pivelx;
         pively = ydistanceSqr * sts + pively;
         pivelz = zdistanceSqr * sts + pivelz;
     }
 
-    template<typename Vec>
-    void update8(ParticleBlock<Vec>* particles, int threads)
+    template<typename Simd>
+    void update8(ParticleBlock<Simd>* particles, int threads)
     {
-        constexpr auto LANES = Vec::size();
+        constexpr auto LANES = Simd::size;
         constexpr auto BLOCKS = PROBLEM_SIZE / LANES;
 
 #    pragma omp parallel for schedule(static) num_threads(threads)
@@ -937,21 +940,21 @@ namespace manualAoSoA_Vc
         {
             auto& blockI = particles[bi];
             // std::for_each(ex, particles, particles + BLOCKS, [&](ParticleBlock& blockI) {
-            const Vec piposx = blockI.pos.x;
-            const Vec piposy = blockI.pos.y;
-            const Vec piposz = blockI.pos.z;
-            Vec pivelx = blockI.vel.x;
-            Vec pively = blockI.vel.y;
-            Vec pivelz = blockI.vel.z;
+            const Simd piposx = blockI.pos.x;
+            const Simd piposy = blockI.pos.y;
+            const Simd piposz = blockI.pos.z;
+            Simd pivelx = blockI.vel.x;
+            Simd pively = blockI.vel.y;
+            Simd pivelz = blockI.vel.z;
 
             for(std::size_t bj = 0; bj < BLOCKS; bj++)
                 for(std::size_t j = 0; j < LANES; j++)
                 {
                     const auto& blockJ = particles[bj];
-                    const Vec pjposx = blockJ.pos.x[j];
-                    const Vec pjposy = blockJ.pos.y[j];
-                    const Vec pjposz = blockJ.pos.z[j];
-                    const Vec pjmass = blockJ.mass[j];
+                    const Simd pjposx = blockJ.pos.x.get(j);
+                    const Simd pjposy = blockJ.pos.y.get(j);
+                    const Simd pjposz = blockJ.pos.z.get(j);
+                    const Simd pjmass = blockJ.mass.get(j);
 
                     pPInteraction(piposx, piposy, piposz, pivelx, pively, pivelz, pjposx, pjposy, pjposz, pjmass);
                 }
@@ -963,10 +966,10 @@ namespace manualAoSoA_Vc
         }
     }
 
-    template<typename Vec>
-    void update8Tiled(ParticleBlock<Vec>* particles, int threads)
+    template<typename Simd>
+    void update8Tiled(ParticleBlock<Simd>* particles, int threads)
     {
-        constexpr auto LANES = Vec::size();
+        constexpr auto LANES = Simd::size;
         constexpr auto BLOCKS = PROBLEM_SIZE / LANES;
 
         constexpr auto blocksPerTile = 128; // L1D_SIZE / sizeof(ParticleBlock);
@@ -976,21 +979,21 @@ namespace manualAoSoA_Vc
             for(std::size_t bi = 0; bi < blocksPerTile; bi++)
             {
                 auto& blockI = particles[bi];
-                const Vec piposx = blockI.pos.x;
-                const Vec piposy = blockI.pos.y;
-                const Vec piposz = blockI.pos.z;
-                Vec pivelx = blockI.vel.x;
-                Vec pively = blockI.vel.y;
-                Vec pivelz = blockI.vel.z;
+                const Simd piposx = blockI.pos.x;
+                const Simd piposy = blockI.pos.y;
+                const Simd piposz = blockI.pos.z;
+                Simd pivelx = blockI.vel.x;
+                Simd pively = blockI.vel.y;
+                Simd pivelz = blockI.vel.z;
                 for(std::size_t tj = 0; tj < BLOCKS / blocksPerTile; tj++)
                     for(std::size_t bj = 0; bj < blocksPerTile; bj++)
                         for(std::size_t j = 0; j < LANES; j++)
                         {
                             const auto& blockJ = particles[bj];
-                            const Vec pjposx = blockJ.pos.x[j];
-                            const Vec pjposy = blockJ.pos.y[j];
-                            const Vec pjposz = blockJ.pos.z[j];
-                            const Vec pjmass = blockJ.mass[j];
+                            const Simd pjposx = blockJ.pos.x.get(j);
+                            const Simd pjposy = blockJ.pos.y.get(j);
+                            const Simd pjposz = blockJ.pos.z.get(j);
+                            const Simd pjmass = blockJ.mass.get(j);
 
                             pPInteraction(
                                 piposx,
@@ -1011,10 +1014,23 @@ namespace manualAoSoA_Vc
             }
     }
 
-    template<typename Vec>
-    void update1(ParticleBlock<Vec>* particles, int threads)
+    // xsimd renamed hadd() to reduce_all() on master (after release 8.1), so we need to SFINAE for the name
+    template<typename Simd>
+    auto sum(Simd v) -> decltype(reduce_add(v))
+    {
+        return reduce_add(v);
+    }
+
+    template<typename Simd>
+    auto sum(Simd v) -> decltype(hadd(v))
+    {
+        return hadd(v);
+    }
+
+    template<typename Simd>
+    void update1(ParticleBlock<Simd>* particles, int threads)
     {
-        constexpr auto LANES = Vec::size();
+        constexpr auto LANES = Simd::size;
         constexpr auto BLOCKS = PROBLEM_SIZE / LANES;
 
 #    pragma omp parallel for schedule(static) num_threads(threads)
@@ -1024,12 +1040,12 @@ namespace manualAoSoA_Vc
             // std::for_each(ex, particles, particles + BLOCKS, [&](ParticleBlock& blockI) {
             for(std::size_t i = 0; i < LANES; i++)
             {
-                const Vec piposx = static_cast<FP>(blockI.pos.x[i]);
-                const Vec piposy = static_cast<FP>(blockI.pos.y[i]);
-                const Vec piposz = static_cast<FP>(blockI.pos.z[i]);
-                Vec pivelx = static_cast<FP>(blockI.vel.x[i]);
-                Vec pively = static_cast<FP>(blockI.vel.y[i]);
-                Vec pivelz = static_cast<FP>(blockI.vel.z[i]);
+                const Simd piposx = blockI.pos.x.get(i);
+                const Simd piposy = blockI.pos.y.get(i);
+                const Simd piposz = blockI.pos.z.get(i);
+                Simd pivelx = blockI.vel.x.get(i);
+                Simd pively = blockI.vel.y.get(i);
+                Simd pivelz = blockI.vel.z.get(i);
 
                 for(std::size_t bj = 0; bj < BLOCKS; bj++)
                 {
@@ -1047,18 +1063,18 @@ namespace manualAoSoA_Vc
                         blockJ.mass);
                 }
 
-                blockI.vel.x[i] = pivelx.sum();
-                blockI.vel.y[i] = pively.sum();
-                blockI.vel.z[i] = pivelz.sum();
+                reinterpret_cast<FP*>(&blockI.vel.x)[i] = sum(pivelx);
+                reinterpret_cast<FP*>(&blockI.vel.y)[i] = sum(pively);
+                reinterpret_cast<FP*>(&blockI.vel.z)[i] = sum(pivelz);
             }
             // });
         }
     }
 
-    template<typename Vec>
-    void move(ParticleBlock<Vec>* particles, int threads)
+    template<typename Simd>
+    void move(ParticleBlock<Simd>* particles, int threads)
     {
-        constexpr auto BLOCKS = PROBLEM_SIZE / Vec::size();
+        constexpr auto BLOCKS = PROBLEM_SIZE / Simd::size;
 
 #    pragma omp parallel for schedule(static) num_threads(threads)
         for(std::ptrdiff_t bi = 0; bi < BLOCKS; bi++)
@@ -1072,10 +1088,10 @@ namespace manualAoSoA_Vc
         }
     }
 
-    template<typename Vec>
+    template<typename Simd>
     auto main(std::ostream& plotFile, int threads, bool useUpdate1, bool tiled = false) -> int
     {
-        auto title = "AoSoA" + std::to_string(Vec::size()) + " Vc" + (useUpdate1 ? " w1r8" : " w8r1"); // NOLINT
+        auto title = "AoSoA" + std::to_string(Simd::size) + " SIMD" + (useUpdate1 ? " w1r8" : " w8r1"); // NOLINT
         if(tiled)
             title += " tiled";
         if(threads > 1)
@@ -1084,9 +1100,9 @@ namespace manualAoSoA_Vc
         std::cout << title << '\n';
         Stopwatch watch;
 
-        static_assert(PROBLEM_SIZE % Vec::size() == 0);
-        constexpr auto BLOCKS = PROBLEM_SIZE / Vec::size();
-        std::vector<ParticleBlock<Vec>> particles(BLOCKS);
+        static_assert(PROBLEM_SIZE % Simd::size == 0);
+        constexpr auto BLOCKS = PROBLEM_SIZE / Simd::size;
+        std::vector<ParticleBlock<Simd>> particles(BLOCKS);
         watch.printAndReset("alloc");
 
         std::default_random_engine engine;
@@ -1094,15 +1110,15 @@ namespace manualAoSoA_Vc
         for(std::size_t bi = 0; bi < BLOCKS; ++bi)
         {
             auto& block = particles[bi];
-            for(std::size_t i = 0; i < Vec::size(); ++i)
+            for(std::size_t i = 0; i < Simd::size; ++i)
             {
-                block.pos.x[i] = dist(engine);
-                block.pos.y[i] = dist(engine);
-                block.pos.z[i] = dist(engine);
-                block.vel.x[i] = dist(engine) / FP{10};
-                block.vel.y[i] = dist(engine) / FP{10};
-                block.vel.z[i] = dist(engine) / FP{10};
-                block.mass[i] = dist(engine) / FP{100};
+                reinterpret_cast<FP*>(&block.pos.x)[i] = dist(engine);
+                reinterpret_cast<FP*>(&block.pos.y)[i] = dist(engine);
+                reinterpret_cast<FP*>(&block.pos.z)[i] = dist(engine);
+                reinterpret_cast<FP*>(&block.vel.x)[i] = dist(engine) / FP{10};
+                reinterpret_cast<FP*>(&block.vel.y)[i] = dist(engine) / FP{10};
+                reinterpret_cast<FP*>(&block.vel.z)[i] = dist(engine) / FP{10};
+                reinterpret_cast<FP*>(&block.mass)[i] = dist(engine) / FP{100};
             }
         }
         watch.printAndReset("init");
@@ -1131,42 +1147,49 @@ namespace manualAoSoA_Vc
 
         return 0;
     }
-} // namespace manualAoSoA_Vc
+} // namespace manualAoSoA_SIMD
 
-namespace manualAoS_Vc
+namespace manualAoS_SIMD
 {
     using manualAoS::Particle;
-    using manualAoSoA_Vc::pPInteraction;
+    using manualAoSoA_SIMD::pPInteraction;
+
+    struct GenStrides
+    {
+        static constexpr auto get(int i, int) -> int
+        {
+            return i * static_cast<int>(sizeof(Particle) / sizeof(FP));
+        }
+    };
 
-    template<typename Vec>
-    const auto particleGatherScatterStrides = typename Vec::IndexType{Vc::IndexesFromZero} *
-        typename Vec::IndexType::value_type{sizeof(Particle) / sizeof(FP)};
+    template<typename Simd>
+    const auto particleStrides = static_cast<xsimd::batch<int, typename Simd::arch_type>>(
+        xsimd::make_batch_constant<xsimd::batch<int, typename Simd::arch_type>, GenStrides>());
 
-    template<typename Vec>
+    template<typename Simd>
     void update(Particle* particles, int threads)
     {
-        constexpr auto LANES = Vec::size();
-        const auto strides = particleGatherScatterStrides<Vec>;
+        constexpr auto LANES = Simd::size;
+        const auto strides = particleStrides<Simd>;
 
 #    pragma omp parallel for schedule(static) num_threads(threads)
         for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += LANES)
         {
-            // gather
             auto& pi = particles[i];
-            const Vec piposx = Vec(&pi.pos.x, strides);
-            const Vec piposy = Vec(&pi.pos.y, strides);
-            const Vec piposz = Vec(&pi.pos.z, strides);
-            Vec pivelx = Vec(&pi.vel.x, strides);
-            Vec pively = Vec(&pi.vel.y, strides);
-            Vec pivelz = Vec(&pi.vel.z, strides);
+            const Simd piposx = Simd::gather(&pi.pos.x, strides);
+            const Simd piposy = Simd::gather(&pi.pos.y, strides);
+            const Simd piposz = Simd::gather(&pi.pos.z, strides);
+            Simd pivelx = Simd::gather(&pi.vel.x, strides);
+            Simd pively = Simd::gather(&pi.vel.y, strides);
+            Simd pivelz = Simd::gather(&pi.vel.z, strides);
 
             for(std::size_t j = 0; j < PROBLEM_SIZE; j++)
             {
                 const auto& pj = particles[j];
-                const Vec pjposx = pj.pos.x;
-                const Vec pjposy = pj.pos.y;
-                const Vec pjposz = pj.pos.z;
-                const Vec pjmass = pj.mass;
+                const Simd pjposx(pj.pos.x);
+                const Simd pjposy(pj.pos.y);
+                const Simd pjposz(pj.pos.z);
+                const Simd pjmass(pj.mass);
 
                 pPInteraction(piposx, piposy, piposz, pivelx, pively, pivelz, pjposx, pjposy, pjposz, pjmass);
             }
@@ -1178,26 +1201,29 @@ namespace manualAoS_Vc
         }
     }
 
-    template<typename Vec>
+    template<typename Simd>
     void move(Particle* particles, int threads)
     {
-        constexpr auto LANES = Vec::size();
-        const auto strides = particleGatherScatterStrides<Vec>;
+        constexpr auto LANES = Simd::size;
+        const auto strides = particleStrides<Simd>;
 
 #    pragma omp parallel for schedule(static) num_threads(threads)
         for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += LANES)
         {
             auto& pi = particles[i];
-            (Vec(&pi.pos.x, strides) + Vec(&pi.vel.x, strides) * TIMESTEP).scatter(&pi.pos.x, strides);
-            (Vec(&pi.pos.y, strides) + Vec(&pi.vel.y, strides) * TIMESTEP).scatter(&pi.pos.y, strides);
-            (Vec(&pi.pos.z, strides) + Vec(&pi.vel.z, strides) * TIMESTEP).scatter(&pi.pos.z, strides);
+            (Simd::gather(&pi.pos.x, strides) + Simd::gather(&pi.vel.x, strides) * TIMESTEP)
+                .scatter(&pi.pos.x, strides);
+            (Simd::gather(&pi.pos.y, strides) + Simd::gather(&pi.vel.y, strides) * TIMESTEP)
+                .scatter(&pi.pos.y, strides);
+            (Simd::gather(&pi.pos.z, strides) + Simd::gather(&pi.vel.z, strides) * TIMESTEP)
+                .scatter(&pi.pos.z, strides);
         }
     }
 
-    template<typename Vec>
+    template<typename Simd>
     auto main(std::ostream& plotFile, int threads) -> int
     {
-        auto title = "AoS Vc"s;
+        auto title = "AoS SIMD"s;
         if(threads > 1)
             title += " " + std::to_string(threads) + "Thrds";
         std::cout << title << '\n';
@@ -1226,23 +1252,23 @@ namespace manualAoS_Vc
         {
             if constexpr(RUN_UPATE)
             {
-                update<Vec>(particles.data(), threads);
+                update<Simd>(particles.data(), threads);
                 sumUpdate += watch.printAndReset("update", '\t');
             }
-            move<Vec>(particles.data(), threads);
+            move<Simd>(particles.data(), threads);
             sumMove += watch.printAndReset("move");
         }
         plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
 
         return 0;
     }
-} // namespace manualAoS_Vc
+} // namespace manualAoS_SIMD
 
 namespace manualSoA_Vc
 {
-    using manualAoSoA_Vc::pPInteraction;
+    using manualAoSoA_SIMD::pPInteraction;
 
-    template<typename Vec>
+    template<typename Simd>
     void update(
         const FP* posx,
         const FP* posy,
@@ -1254,14 +1280,14 @@ namespace manualSoA_Vc
         int threads)
     {
 #    pragma omp parallel for schedule(static) num_threads(threads)
-        for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Vec::size())
+        for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Simd::size)
         {
-            const Vec piposx = Vec(posx + i);
-            const Vec piposy = Vec(posy + i);
-            const Vec piposz = Vec(posz + i);
-            Vec pivelx = Vec(velx + i);
-            Vec pively = Vec(vely + i);
-            Vec pivelz = Vec(velz + i);
+            const Simd piposx = Simd::load_aligned(posx + i);
+            const Simd piposy = Simd::load_aligned(posy + i);
+            const Simd piposz = Simd::load_aligned(posz + i);
+            Simd pivelx = Simd::load_aligned(velx + i);
+            Simd pively = Simd::load_aligned(vely + i);
+            Simd pivelz = Simd::load_aligned(velz + i);
             for(std::size_t j = 0; j < PROBLEM_SIZE; ++j)
                 pPInteraction(
                     piposx,
@@ -1270,32 +1296,32 @@ namespace manualSoA_Vc
                     pivelx,
                     pively,
                     pivelz,
-                    Vec(posx[j]),
-                    Vec(posy[j]),
-                    Vec(posz[j]),
-                    Vec(mass[j]));
-            pivelx.store(velx + i);
-            pively.store(vely + i);
-            pivelz.store(velz + i);
+                    Simd(posx[j]),
+                    Simd(posy[j]),
+                    Simd(posz[j]),
+                    Simd(mass[j]));
+            pivelx.store_aligned(velx + i);
+            pively.store_aligned(vely + i);
+            pivelz.store_aligned(velz + i);
         }
     }
 
-    template<typename Vec>
+    template<typename Simd>
     void move(FP* posx, FP* posy, FP* posz, const FP* velx, const FP* vely, const FP* velz, int threads)
     {
 #    pragma omp parallel for schedule(static) num_threads(threads)
-        for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Vec::size())
+        for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Simd::size)
         {
-            (Vec(posx + i) + Vec(velx + i) * TIMESTEP).store(posx + i);
-            (Vec(posy + i) + Vec(vely + i) * TIMESTEP).store(posy + i);
-            (Vec(posz + i) + Vec(velz + i) * TIMESTEP).store(posz + i);
+            (Simd::load_aligned(posx + i) + Simd::load_aligned(velx + i) * TIMESTEP).store_aligned(posx + i);
+            (Simd::load_aligned(posy + i) + Simd::load_aligned(vely + i) * TIMESTEP).store_aligned(posy + i);
+            (Simd::load_aligned(posz + i) + Simd::load_aligned(velz + i) * TIMESTEP).store_aligned(posz + i);
         }
     }
 
-    template<typename Vec>
+    template<typename Simd>
     auto main(std::ostream& plotFile, int threads) -> int
     {
-        auto title = "SoA Vc"s;
+        auto title = "SoA SIMD"s;
         if(threads > 1)
             title += " " + std::to_string(threads) + "Thrds";
         std::cout << title << '\n';
@@ -1331,7 +1357,7 @@ namespace manualSoA_Vc
         {
             if constexpr(RUN_UPATE)
             {
-                update<Vec>(
+                update<Simd>(
                     posx.data(),
                     posy.data(),
                     posz.data(),
@@ -1342,7 +1368,7 @@ namespace manualSoA_Vc
                     threads);
                 sumUpdate += watch.printAndReset("update", '\t');
             }
-            move<Vec>(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), threads);
+            move<Simd>(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), threads);
             sumMove += watch.printAndReset("move");
         }
         plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n';
@@ -1355,10 +1381,10 @@ namespace manualSoA_Vc
 auto main() -> int
 try
 {
-#if __has_include(<Vc/Vc>)
-    using vec = Vc::Vector<FP>;
-    // using vec = Vc::SimdArray<FP, 16>;
-    constexpr auto SIMDLanes = vec::size();
+#ifdef HAVE_XSIMD
+    using Simd = xsimd::batch<FP>;
+    // using Simd = xsimd::make_sized_batch_t<FP, 16>;
+    constexpr auto SIMDLanes = Simd::size;
 #else
     constexpr auto SIMDLanes = 1;
 #endif
@@ -1428,12 +1454,12 @@ set y2tics auto
             //    r += manualAoSoA::main<decltype(lanes)::value>(plotFile, tiled);
             r += manualAoSoA::main<decltype(lanes)::value>(plotFile, false);
         });
-#ifdef __AVX2__
+#if defined(__AVX2__) && defined(__FMA__)
     // for (auto useUpdate1 : {false, true})
     //    r += manualAoSoA_manualAVX::main(plotFile, useUpdate1);
     r += manualAoSoA_manualAVX::main(plotFile, false);
 #endif
-#if __has_include(<Vc/Vc>)
+#ifdef HAVE_XSIMD
     for(int threads = 1; threads <= std::thread::hardware_concurrency(); threads *= 2)
     {
         // for (auto useUpdate1 : {false, true})
@@ -1441,19 +1467,23 @@ set y2tics auto
         //    {
         //        if (useUpdate1 && tiled)
         //            continue;
-        //        r += manualAoSoA_Vc::main<vec>(plotFile, threads, useUpdate1, tiled);
+        //        r += manualAoSoA_SIMD::main<Simd>(plotFile, threads, useUpdate1, tiled);
         //    }
-        r += manualAoSoA_Vc::main<vec>(plotFile, threads, false, false);
+        r += manualAoSoA_SIMD::main<Simd>(plotFile, threads, false, false);
     }
     for(int threads = 1; threads <= std::thread::hardware_concurrency(); threads *= 2)
     {
-        // mp_for_each<mp_list_c<std::size_t, 1, 2, 4, 8, 16>>(
-        //    [&](auto lanes) { r += manualAoS_Vc::main<Vc::SimdArray<FP, decltype(lanes)::value>>(plotFile, threads);
-        //    });
-        r += manualAoS_Vc::main<vec>(plotFile, threads);
+        //        mp_for_each<mp_list_c<std::size_t, 1, 2, 4, 8, 16>>(
+        //            [&](auto lanes)
+        //            {
+        //                using Simd = xsimd::make_sized_batch_t<FP, decltype(lanes)::value>;
+        //                if constexpr(!std::is_void_v<Simd>)
+        //                    r += manualAoS_SIMD::main<Simd>(plotFile, threads);
+        //            });
+        r += manualAoS_SIMD::main<Simd>(plotFile, threads);
     }
     for(int threads = 1; threads <= std::thread::hardware_concurrency(); threads *= 2)
-        r += manualSoA_Vc::main<vec>(plotFile, threads);
+        r += manualSoA_Vc::main<Simd>(plotFile, threads);
 #endif
 
     plotFile << R"(EOD