alpaka-group · bernhardmgruber · Aug 22, 2022 · Jun 17, 2022
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -13,7 +13,7 @@ env:
   THREADS: 4
   CONFIG: RelWithDebInfo
   ALPAKA_BRANCH: 0.9.0
-  VCPKG_INSTALL_LIST: "fmt vc tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container"
+  VCPKG_INSTALL: "vcpkg install fmt tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container; vcpkg install --head xsimd"
 
 jobs:
   clang-format:
@@ -40,7 +40,7 @@ jobs:
         sudo apt install clang-14 libomp-14-dev clang-tidy-14
     - name: vcpkg install dependencies
       run: |
-        vcpkg install $VCPKG_INSTALL_LIST
+        eval $VCPKG_INSTALL
     - name: install alpaka
       run: |
         git clone https://github.com/alpaka-group/alpaka.git
@@ -76,7 +76,7 @@ jobs:
         sudo apt install lcov
     - name: vcpkg install dependencies
       run: |
-        vcpkg install $VCPKG_INSTALL_LIST
+        eval $VCPKG_INSTALL
     - name: cmake
       run: |
         mkdir build
@@ -204,7 +204,7 @@ jobs:
         run: |
           # vcpkg fails to build with Intel compilers
           if [ ${{ matrix.install_oneapi }} ]; then unset CXX; fi
-          vcpkg install $VCPKG_INSTALL_LIST
+          eval $VCPKG_INSTALL
       - name: download CUDA
         if: matrix.cuda_url
         run: |
@@ -270,7 +270,7 @@ jobs:
     - uses: actions/checkout@v2
     - name: vcpkg install dependencies
       run: |
-        vcpkg install $VCPKG_INSTALL_LIST
+        eval $VCPKG_INSTALL
     - name: install alpaka
       run: |
         git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git
@@ -311,7 +311,7 @@ jobs:
           brew install libomp
       - name: vcpkg install dependencies
         run: |
-          vcpkg install $VCPKG_INSTALL_LIST
+          eval $VCPKG_INSTALL
       - name: install alpaka
         run: |
           git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git

diff --git a/docs/pages/install.rst b/docs/pages/install.rst
@@ -23,7 +23,7 @@ Dependencies
 - Boost 1.70.0 or higher
 - libfmt 6.2.1 or higher (optional) for building the tests and examples and LLAMA supporting to dump mappings as SVG/HTML
 - `Alpaka <https://github.com/alpaka-group/alpaka>`_ (optional) for building some examples
-- `Vc <https://github.com/VcDevel/Vc>`_ (optional) for building some examples
+- `xsimd <https://github.com/xtensor-stack/xsimd>`_ (optional) for building some examples
 
 
 Build tests and examples

diff --git a/examples/alpaka/daxpy/CMakeLists.txt b/examples/alpaka/daxpy/CMakeLists.txt
@@ -1,7 +1,6 @@
 cmake_minimum_required (VERSION 3.18.3)
 project(llama-alpaka-daxpy CXX)
 
-#find_package(Vc QUIET)
 find_package(OpenMP REQUIRED)
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)

diff --git a/examples/alpaka/nbody/CMakeLists.txt b/examples/alpaka/nbody/CMakeLists.txt
@@ -1,14 +1,18 @@
 cmake_minimum_required (VERSION 3.18.3)
 project(llama-alpaka-nbody CXX)
 
-find_package(Vc REQUIRED)
+find_package(xsimd QUIET)
+if (NOT xsimd_FOUND)
+	message(WARNING "xsimd not found: alpaka n-body example is disabled")
+	return()
+endif()
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
 find_package(alpaka 0.9.0 REQUIRED)
 alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/Stopwatch.hpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
-target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka Vc::Vc)
+target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka xsimd)
 
 if (MSVC)
 	target_compile_options(${PROJECT_NAME} PRIVATE /arch:AVX2 /fp:fast)

diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
@@ -10,6 +10,11 @@
 #include <string>
 #include <utility>
 
+#if __has_include(<xsimd/xsimd.hpp>)
+#    include <xsimd/xsimd.hpp>
+#    define HAVE_XSIMD
+#endif
+
 using FP = float;
 
 constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
@@ -19,13 +24,11 @@ constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate
 
 #if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
 #    if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
-#        error Cannot enable CUDA together with other backends, because nvcc cannot parse the Vc header, sorry :/
+#        error Cannot enable CUDA together with other backends
 #    endif
-// nvcc fails to compile Vc headers even if nothing is used from there, so we need to conditionally include it
-#    include <Vc/Vc>
-constexpr auto DESIRED_ELEMENTS_PER_THREAD = Vc::float_v::size();
+constexpr auto DESIRED_ELEMENTS_PER_THREAD = xsimd::batch<float>::size;
 constexpr auto THREADS_PER_BLOCK = 1;
-constexpr auto AOSOA_LANES = Vc::float_v::size(); // vectors
+constexpr auto AOSOA_LANES = xsimd::batch<float>::size; // vectors
 #elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
 constexpr auto DESIRED_ELEMENTS_PER_THREAD = 1;
 constexpr auto THREADS_PER_BLOCK = 256;
@@ -77,20 +80,20 @@ namespace stdext
     }
 } // namespace stdext
 
-// FIXME: this makes assumptions that there are always float_v::size() many values blocked in the LLAMA view
-template<typename Vec>
+// FIXME: this makes assumptions that there are always Simd::size many values blocked in the LLAMA view
+template<typename Simd>
 LLAMA_FN_HOST_ACC_INLINE auto load(const FP& src)
 {
-    if constexpr(std::is_same_v<Vec, FP>)
+    if constexpr(std::is_same_v<Simd, FP>)
         return src;
     else
-        return Vec(&src);
+        return Simd::load_unaligned(&src);
 }
 
-template<typename Vec>
+template<typename Simd>
 LLAMA_FN_HOST_ACC_INLINE auto broadcast(const FP& src)
 {
-    return Vec(src);
+    return Simd(src);
 }
 
 template<typename Vec>
@@ -99,48 +102,46 @@ LLAMA_FN_HOST_ACC_INLINE auto store(FP& dst, Vec v)
     if constexpr(std::is_same_v<Vec, FP>)
         dst = v;
     else
-        v.store(&dst);
+        v.store_unaligned(&dst);
 }
 
 template<int Elems>
-struct VecType
+struct SimdType
 {
     // TODO(bgruber): we need a vector type that also works on GPUs
-#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-    using type = Vc::SimdArray<FP, Elems>;
-#endif
+    using type = xsimd::make_sized_batch_t<FP, Elems>;
+    static_assert(!std::is_void_v<type>, "xsimd does not have a SIMD type for this element count");
 };
+
 template<>
-struct VecType<1>
+struct SimdType<1>
 {
     using type = FP;
 };
 
 template<int Elems, typename ViewParticleI, typename ParticleRefJ>
 LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, ParticleRefJ pj)
 {
-    using Vec = typename VecType<Elems>::type;
+    using Simd = typename SimdType<Elems>::type;
 
     using std::sqrt;
     using stdext::rsqrt;
-#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
-    using Vc::rsqrt;
-    using Vc::sqrt;
-#endif
-
-    const Vec xdistance = load<Vec>(pi(tag::Pos{}, tag::X{})) - broadcast<Vec>(pj(tag::Pos{}, tag::X{}));
-    const Vec ydistance = load<Vec>(pi(tag::Pos{}, tag::Y{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Y{}));
-    const Vec zdistance = load<Vec>(pi(tag::Pos{}, tag::Z{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Z{}));
-    const Vec xdistanceSqr = xdistance * xdistance;
-    const Vec ydistanceSqr = ydistance * ydistance;
-    const Vec zdistanceSqr = zdistance * zdistance;
-    const Vec distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
-    const Vec distSixth = distSqr * distSqr * distSqr;
-    const Vec invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
-    const Vec sts = broadcast<Vec>(pj(tag::Mass())) * invDistCube * TIMESTEP;
-    store<Vec>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::X{})));
-    store<Vec>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Y{})));
-    store<Vec>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Z{})));
+    using xsimd::rsqrt;
+    using xsimd::sqrt;
+
+    const Simd xdistance = load<Simd>(pi(tag::Pos{}, tag::X{})) - broadcast<Simd>(pj(tag::Pos{}, tag::X{}));
+    const Simd ydistance = load<Simd>(pi(tag::Pos{}, tag::Y{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Y{}));
+    const Simd zdistance = load<Simd>(pi(tag::Pos{}, tag::Z{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Z{}));
+    const Simd xdistanceSqr = xdistance * xdistance;
+    const Simd ydistanceSqr = ydistance * ydistance;
+    const Simd zdistanceSqr = zdistance * zdistance;
+    const Simd distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
+    const Simd distSixth = distSqr * distSqr * distSqr;
+    const Simd invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
+    const Simd sts = broadcast<Simd>(pj(tag::Mass())) * invDistCube * TIMESTEP;
+    store<Simd>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::X{})));
+    store<Simd>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Y{})));
+    store<Simd>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Z{})));
 }
 
 template<int ProblemSize, int Elems, int BlockSize, Mapping MappingSM>
@@ -218,16 +219,19 @@ struct MoveKernel
         const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
         const auto i = ti * Elems;
 
-        using Vec = typename VecType<Elems>::type;
-        store<Vec>(
+        using Simd = typename SimdType<Elems>::type;
+        store<Simd>(
             particles(i)(tag::Pos{}, tag::X{}),
-            load<Vec>(particles(i)(tag::Pos{}, tag::X{})) + load<Vec>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
-        store<Vec>(
+            load<Simd>(particles(i)(tag::Pos{}, tag::X{}))
+                + load<Simd>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
+        store<Simd>(
             particles(i)(tag::Pos{}, tag::Y{}),
-            load<Vec>(particles(i)(tag::Pos{}, tag::Y{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
-        store<Vec>(
+            load<Simd>(particles(i)(tag::Pos{}, tag::Y{}))
+                + load<Simd>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
+        store<Simd>(
             particles(i)(tag::Pos{}, tag::Z{}),
-            load<Vec>(particles(i)(tag::Pos{}, tag::Z{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
+            load<Simd>(particles(i)(tag::Pos{}, tag::Z{}))
+                + load<Simd>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
     }
 };
 
@@ -337,7 +341,7 @@ try
 
     std::ofstream plotFile{"nbody.sh"};
     plotFile.exceptions(std::ios::badbit | std::ios::failbit);
-    std::ofstream{"nbody.sh"} << R"(#!/usr/bin/gnuplot -p
+    plotFile << R"(#!/usr/bin/gnuplot -p
 set style data histograms
 set style fill solid
 set xtics rotate by 45 right

diff --git a/examples/heatequation/CMakeLists.txt b/examples/heatequation/CMakeLists.txt
@@ -1,16 +1,18 @@
 cmake_minimum_required (VERSION 3.18.3)
 project(llama-heatequation CXX)
 
-find_package(Vc QUIET)
+find_package(xsimd QUIET)
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
 
 add_executable(${PROJECT_NAME} heatequation.cpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
-if (Vc_FOUND)
-	target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
+if (xsimd_FOUND)
+	target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
+else()
+	message(WARNING "xsimd not found: heatequation built without explicit SIMD support")
 endif()
 
 if (MSVC)

diff --git a/examples/heatequation/heatequation.cpp b/examples/heatequation/heatequation.cpp
@@ -21,6 +21,11 @@
 #include <llama/llama.hpp>
 #include <utility>
 
+#if __has_include(<xsimd/xsimd.hpp>)
+#    include <xsimd/xsimd.hpp>
+#    define HAVE_XSIMD
+#endif
+
 template<typename View>
 inline void kernel(uint32_t idx, const View& uCurr, View& uNext, double r)
 {
@@ -36,21 +41,21 @@ void update_scalar(const View& uCurr, View& uNext, uint32_t extent, double dx, d
             kernel(i, uCurr, uNext, r);
 }
 
-#if __has_include(<Vc/Vc>)
-#    include <Vc/Vc>
+#ifdef HAVE_XSIMD
 
 template<typename View>
 inline void kernel_vec(uint32_t baseIdx, const View& uCurr, View& uNext, double r)
 {
-    const auto next = Vc::double_v{&uCurr[baseIdx]} * (1.0 - 2.0 * r) + Vc::double_v{&uCurr[baseIdx - 1]} * r
-        + Vc::double_v{&uCurr[baseIdx + 1]} * r;
-    next.store(&uNext[baseIdx]);
+    using Simd = xsimd::batch<double>;
+    const auto next = Simd::load_unaligned(&uCurr[baseIdx]) * (1.0 - 2.0 * r)
+        + Simd::load_unaligned(&uCurr[baseIdx - 1]) * r + Simd::load_unaligned(&uCurr[baseIdx + 1]) * r;
+    next.store_unaligned(&uNext[baseIdx]);
 }
 
 template<typename View>
-void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+void update_SIMD(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
 {
-    constexpr auto L = Vc::double_v::size();
+    constexpr auto L = xsimd::batch<double>::size;
     const auto r = dt / (dx * dx);
 
     const auto blocks = (extent + L - 1) / L;
@@ -67,9 +72,9 @@ void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, doubl
 }
 
 template<typename View>
-void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+void update_SIMD_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
 {
-    constexpr auto L = Vc::double_v::size();
+    constexpr auto L = xsimd::batch<double>::size;
     const auto r = dt / (dx * dx);
 
     for(auto i = 1; i < L; i++)
@@ -84,9 +89,9 @@ void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx,
 }
 
 template<typename View>
-void update_Vc_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+void update_SIMD_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
 {
-    constexpr auto L = Vc::double_v::size();
+    constexpr auto L = xsimd::batch<double>::size;
     const auto r = dt / (dx * dx);
 
     kernel(1, uCurr, uNext, r);
@@ -168,11 +173,11 @@ try
             std::cout << "Incorrect! error = " << maxError << " (the grid resolution may be too low)\n";
     };
 
-    run("update_scalar         ", [](auto&... args) { update_scalar(args...); });
-#if __has_include(<Vc/Vc>)
-    run("update_Vc             ", [](auto&... args) { update_Vc(args...); });
-    run("update_Vc_peel        ", [](auto&... args) { update_Vc_peel(args...); });
-    run("update_Vc_peel_unal_st", [](auto&... args) { update_Vc_peel_unal_st(args...); });
+//    run("update_scalar           ", [](auto&... args) { update_scalar(args...); });
+#ifdef HAVE_XSIMD
+    run("update_SIMD             ", [](auto&... args) { update_SIMD(args...); });
+    run("update_SIMD_peel        ", [](auto&... args) { update_SIMD_peel(args...); });
+    run("update_SIMD_peel_unal_st", [](auto&... args) { update_SIMD_peel_unal_st(args...); });
 #endif
 
     return 0;

diff --git a/examples/nbody/CMakeLists.txt b/examples/nbody/CMakeLists.txt
@@ -3,16 +3,18 @@ project(llama-nbody CXX)
 
 find_package(fmt CONFIG REQUIRED)
 find_package(OpenMP REQUIRED)
-find_package(Vc QUIET)
+find_package(xsimd QUIET)
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
 
 add_executable(${PROJECT_NAME} nbody.cpp ../common/Stopwatch.hpp)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt OpenMP::OpenMP_CXX)
-if (Vc_FOUND)
-	target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
+if (xsimd_FOUND)
+	target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
+else()
+	message(WARNING "xsimd not found: n-body built without explicit SIMD support")
 endif()
 
 if (MSVC)