From d21287e48d69315ffe8b464e397d1c5c0915ae92 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 17 Jun 2022 12:45:28 +0200 Subject: [PATCH] Replace Vc by xsimd Fixes: #520 --- .github/workflows/ci.yaml | 12 +- docs/pages/install.rst | 2 +- examples/alpaka/daxpy/CMakeLists.txt | 1 - examples/alpaka/nbody/CMakeLists.txt | 8 +- examples/alpaka/nbody/nbody.cpp | 92 +++--- examples/heatequation/CMakeLists.txt | 8 +- examples/heatequation/heatequation.cpp | 37 +-- examples/nbody/CMakeLists.txt | 8 +- examples/nbody/nbody.cpp | 370 +++++++++++++------------ 9 files changed, 292 insertions(+), 246 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7c35902b1e..0a6e005c3c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -13,7 +13,7 @@ env: THREADS: 4 CONFIG: RelWithDebInfo ALPAKA_BRANCH: 0.9.0 - VCPKG_INSTALL_LIST: "fmt vc tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container" + VCPKG_INSTALL: "vcpkg install fmt tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container; vcpkg install --head xsimd" jobs: clang-format: @@ -40,7 +40,7 @@ jobs: sudo apt install clang-14 libomp-14-dev clang-tidy-14 - name: vcpkg install dependencies run: | - vcpkg install $VCPKG_INSTALL_LIST + eval $VCPKG_INSTALL - name: install alpaka run: | git clone https://github.com/alpaka-group/alpaka.git @@ -76,7 +76,7 @@ jobs: sudo apt install lcov - name: vcpkg install dependencies run: | - vcpkg install $VCPKG_INSTALL_LIST + eval $VCPKG_INSTALL - name: cmake run: | mkdir build @@ -204,7 +204,7 @@ jobs: run: | # vcpkg fails to build with Intel compilers if [ ${{ matrix.install_oneapi }} ]; then unset CXX; fi - vcpkg install $VCPKG_INSTALL_LIST + eval $VCPKG_INSTALL - name: download CUDA if: matrix.cuda_url run: | @@ -270,7 +270,7 @@ jobs: - uses: actions/checkout@v2 - name: vcpkg install dependencies run: | - vcpkg install $VCPKG_INSTALL_LIST + eval $VCPKG_INSTALL - name: install alpaka run: | git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git @@ -311,7 +311,7 @@ jobs: brew install libomp - name: vcpkg install dependencies run: | - vcpkg install $VCPKG_INSTALL_LIST + eval $VCPKG_INSTALL - name: install alpaka run: | git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git diff --git a/docs/pages/install.rst b/docs/pages/install.rst index 707dcd34db..bb63a918d0 100644 --- a/docs/pages/install.rst +++ b/docs/pages/install.rst @@ -23,7 +23,7 @@ Dependencies - Boost 1.70.0 or higher - libfmt 6.2.1 or higher (optional) for building the tests and examples and LLAMA supporting to dump mappings as SVG/HTML - `Alpaka `_ (optional) for building some examples -- `Vc `_ (optional) for building some examples +- `xsimd `_ (optional) for building some examples Build tests and examples diff --git a/examples/alpaka/daxpy/CMakeLists.txt b/examples/alpaka/daxpy/CMakeLists.txt index 22ff2e2991..582c7f7025 100644 --- a/examples/alpaka/daxpy/CMakeLists.txt +++ b/examples/alpaka/daxpy/CMakeLists.txt @@ -1,7 +1,6 @@ cmake_minimum_required (VERSION 3.18.3) project(llama-alpaka-daxpy CXX) -#find_package(Vc QUIET) find_package(OpenMP REQUIRED) if (NOT TARGET llama::llama) find_package(llama REQUIRED) diff --git a/examples/alpaka/nbody/CMakeLists.txt b/examples/alpaka/nbody/CMakeLists.txt index a4f547bdd3..ab2651e7b6 100644 --- a/examples/alpaka/nbody/CMakeLists.txt +++ b/examples/alpaka/nbody/CMakeLists.txt @@ -1,14 +1,18 @@ cmake_minimum_required (VERSION 3.18.3) project(llama-alpaka-nbody CXX) -find_package(Vc REQUIRED) +find_package(xsimd QUIET) +if (NOT xsimd_FOUND) + message(WARNING "xsimd not found: alpaka n-body example is disabled") + return() +endif() if (NOT TARGET llama::llama) find_package(llama REQUIRED) endif() find_package(alpaka 0.9.0 REQUIRED) alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/Stopwatch.hpp) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) -target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka Vc::Vc) +target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka xsimd) if (MSVC) target_compile_options(${PROJECT_NAME} PRIVATE /arch:AVX2 /fp:fast) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 80e0f88fd8..ee284c4f2e 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -10,6 +10,11 @@ #include #include +#if __has_include() +# include +# define HAVE_XSIMD +#endif + using FP = float; constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles @@ -19,13 +24,11 @@ constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate #if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) -# error Cannot enable CUDA together with other backends, because nvcc cannot parse the Vc header, sorry :/ +# error Cannot enable CUDA together with other backends # endif -// nvcc fails to compile Vc headers even if nothing is used from there, so we need to conditionally include it -# include -constexpr auto DESIRED_ELEMENTS_PER_THREAD = Vc::float_v::size(); +constexpr auto DESIRED_ELEMENTS_PER_THREAD = xsimd::batch::size; constexpr auto THREADS_PER_BLOCK = 1; -constexpr auto AOSOA_LANES = Vc::float_v::size(); // vectors +constexpr auto AOSOA_LANES = xsimd::batch::size; // vectors #elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED) constexpr auto DESIRED_ELEMENTS_PER_THREAD = 1; constexpr auto THREADS_PER_BLOCK = 256; @@ -77,20 +80,20 @@ namespace stdext } } // namespace stdext -// FIXME: this makes assumptions that there are always float_v::size() many values blocked in the LLAMA view -template +// FIXME: this makes assumptions that there are always Simd::size many values blocked in the LLAMA view +template LLAMA_FN_HOST_ACC_INLINE auto load(const FP& src) { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) return src; else - return Vec(&src); + return Simd::load_unaligned(&src); } -template +template LLAMA_FN_HOST_ACC_INLINE auto broadcast(const FP& src) { - return Vec(src); + return Simd(src); } template @@ -99,19 +102,19 @@ LLAMA_FN_HOST_ACC_INLINE auto store(FP& dst, Vec v) if constexpr(std::is_same_v) dst = v; else - v.store(&dst); + v.store_unaligned(&dst); } template -struct VecType +struct SimdType { // TODO(bgruber): we need a vector type that also works on GPUs -#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED - using type = Vc::SimdArray; -#endif + using type = xsimd::make_sized_batch_t; + static_assert(!std::is_void_v, "xsimd does not have a SIMD type for this element count"); }; + template<> -struct VecType<1> +struct SimdType<1> { using type = FP; }; @@ -119,28 +122,26 @@ struct VecType<1> template LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, ParticleRefJ pj) { - using Vec = typename VecType::type; + using Simd = typename SimdType::type; using std::sqrt; using stdext::rsqrt; -#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED - using Vc::rsqrt; - using Vc::sqrt; -#endif - - const Vec xdistance = load(pi(tag::Pos{}, tag::X{})) - broadcast(pj(tag::Pos{}, tag::X{})); - const Vec ydistance = load(pi(tag::Pos{}, tag::Y{})) - broadcast(pj(tag::Pos{}, tag::Y{})); - const Vec zdistance = load(pi(tag::Pos{}, tag::Z{})) - broadcast(pj(tag::Pos{}, tag::Z{})); - const Vec xdistanceSqr = xdistance * xdistance; - const Vec ydistanceSqr = ydistance * ydistance; - const Vec zdistanceSqr = zdistance * zdistance; - const Vec distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr; - const Vec distSixth = distSqr * distSqr * distSqr; - const Vec invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth)); - const Vec sts = broadcast(pj(tag::Mass())) * invDistCube * TIMESTEP; - store(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load(pi(tag::Vel{}, tag::X{}))); - store(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load(pi(tag::Vel{}, tag::Y{}))); - store(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load(pi(tag::Vel{}, tag::Z{}))); + using xsimd::rsqrt; + using xsimd::sqrt; + + const Simd xdistance = load(pi(tag::Pos{}, tag::X{})) - broadcast(pj(tag::Pos{}, tag::X{})); + const Simd ydistance = load(pi(tag::Pos{}, tag::Y{})) - broadcast(pj(tag::Pos{}, tag::Y{})); + const Simd zdistance = load(pi(tag::Pos{}, tag::Z{})) - broadcast(pj(tag::Pos{}, tag::Z{})); + const Simd xdistanceSqr = xdistance * xdistance; + const Simd ydistanceSqr = ydistance * ydistance; + const Simd zdistanceSqr = zdistance * zdistance; + const Simd distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr; + const Simd distSixth = distSqr * distSqr * distSqr; + const Simd invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth)); + const Simd sts = broadcast(pj(tag::Mass())) * invDistCube * TIMESTEP; + store(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load(pi(tag::Vel{}, tag::X{}))); + store(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load(pi(tag::Vel{}, tag::Y{}))); + store(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load(pi(tag::Vel{}, tag::Z{}))); } template @@ -218,16 +219,19 @@ struct MoveKernel const auto ti = alpaka::getIdx(acc)[0]; const auto i = ti * Elems; - using Vec = typename VecType::type; - store( + using Simd = typename SimdType::type; + store( particles(i)(tag::Pos{}, tag::X{}), - load(particles(i)(tag::Pos{}, tag::X{})) + load(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP); - store( + load(particles(i)(tag::Pos{}, tag::X{})) + + load(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP); + store( particles(i)(tag::Pos{}, tag::Y{}), - load(particles(i)(tag::Pos{}, tag::Y{})) + load(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP); - store( + load(particles(i)(tag::Pos{}, tag::Y{})) + + load(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP); + store( particles(i)(tag::Pos{}, tag::Z{}), - load(particles(i)(tag::Pos{}, tag::Z{})) + load(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP); + load(particles(i)(tag::Pos{}, tag::Z{})) + + load(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP); } }; @@ -337,7 +341,7 @@ try std::ofstream plotFile{"nbody.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); - std::ofstream{"nbody.sh"} << R"(#!/usr/bin/gnuplot -p + plotFile << R"(#!/usr/bin/gnuplot -p set style data histograms set style fill solid set xtics rotate by 45 right diff --git a/examples/heatequation/CMakeLists.txt b/examples/heatequation/CMakeLists.txt index 453e44332a..815bcacc81 100644 --- a/examples/heatequation/CMakeLists.txt +++ b/examples/heatequation/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required (VERSION 3.18.3) project(llama-heatequation CXX) -find_package(Vc QUIET) +find_package(xsimd QUIET) if (NOT TARGET llama::llama) find_package(llama REQUIRED) endif() @@ -9,8 +9,10 @@ endif() add_executable(${PROJECT_NAME} heatequation.cpp) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama) -if (Vc_FOUND) - target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc) +if (xsimd_FOUND) + target_link_libraries(${PROJECT_NAME} PRIVATE xsimd) +else() + message(WARNING "xsimd not found: heatequation built without explicit SIMD support") endif() if (MSVC) diff --git a/examples/heatequation/heatequation.cpp b/examples/heatequation/heatequation.cpp index 352893f94c..7d776c78ad 100644 --- a/examples/heatequation/heatequation.cpp +++ b/examples/heatequation/heatequation.cpp @@ -21,6 +21,11 @@ #include #include +#if __has_include() +# include +# define HAVE_XSIMD +#endif + template inline void kernel(uint32_t idx, const View& uCurr, View& uNext, double r) { @@ -36,21 +41,21 @@ void update_scalar(const View& uCurr, View& uNext, uint32_t extent, double dx, d kernel(i, uCurr, uNext, r); } -#if __has_include() -# include +#ifdef HAVE_XSIMD template inline void kernel_vec(uint32_t baseIdx, const View& uCurr, View& uNext, double r) { - const auto next = Vc::double_v{&uCurr[baseIdx]} * (1.0 - 2.0 * r) + Vc::double_v{&uCurr[baseIdx - 1]} * r - + Vc::double_v{&uCurr[baseIdx + 1]} * r; - next.store(&uNext[baseIdx]); + using Simd = xsimd::batch; + const auto next = Simd::load_unaligned(&uCurr[baseIdx]) * (1.0 - 2.0 * r) + + Simd::load_unaligned(&uCurr[baseIdx - 1]) * r + Simd::load_unaligned(&uCurr[baseIdx + 1]) * r; + next.store_unaligned(&uNext[baseIdx]); } template -void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt) +void update_SIMD(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt) { - constexpr auto L = Vc::double_v::size(); + constexpr auto L = xsimd::batch::size; const auto r = dt / (dx * dx); const auto blocks = (extent + L - 1) / L; @@ -67,9 +72,9 @@ void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, doubl } template -void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt) +void update_SIMD_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt) { - constexpr auto L = Vc::double_v::size(); + constexpr auto L = xsimd::batch::size; const auto r = dt / (dx * dx); for(auto i = 1; i < L; i++) @@ -84,9 +89,9 @@ void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, } template -void update_Vc_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt) +void update_SIMD_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt) { - constexpr auto L = Vc::double_v::size(); + constexpr auto L = xsimd::batch::size; const auto r = dt / (dx * dx); kernel(1, uCurr, uNext, r); @@ -168,11 +173,11 @@ try std::cout << "Incorrect! error = " << maxError << " (the grid resolution may be too low)\n"; }; - run("update_scalar ", [](auto&... args) { update_scalar(args...); }); -#if __has_include() - run("update_Vc ", [](auto&... args) { update_Vc(args...); }); - run("update_Vc_peel ", [](auto&... args) { update_Vc_peel(args...); }); - run("update_Vc_peel_unal_st", [](auto&... args) { update_Vc_peel_unal_st(args...); }); +// run("update_scalar ", [](auto&... args) { update_scalar(args...); }); +#ifdef HAVE_XSIMD + run("update_SIMD ", [](auto&... args) { update_SIMD(args...); }); + run("update_SIMD_peel ", [](auto&... args) { update_SIMD_peel(args...); }); + run("update_SIMD_peel_unal_st", [](auto&... args) { update_SIMD_peel_unal_st(args...); }); #endif return 0; diff --git a/examples/nbody/CMakeLists.txt b/examples/nbody/CMakeLists.txt index dbb8ba32f9..bdf32590bf 100644 --- a/examples/nbody/CMakeLists.txt +++ b/examples/nbody/CMakeLists.txt @@ -3,7 +3,7 @@ project(llama-nbody CXX) find_package(fmt CONFIG REQUIRED) find_package(OpenMP REQUIRED) -find_package(Vc QUIET) +find_package(xsimd QUIET) if (NOT TARGET llama::llama) find_package(llama REQUIRED) endif() @@ -11,8 +11,10 @@ endif() add_executable(${PROJECT_NAME} nbody.cpp ../common/Stopwatch.hpp) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt OpenMP::OpenMP_CXX) -if (Vc_FOUND) - target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc) +if (xsimd_FOUND) + target_link_libraries(${PROJECT_NAME} PRIVATE xsimd) +else() + message(WARNING "xsimd not found: n-body built without explicit SIMD support") endif() if (MSVC) diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp index 9c97d03fe9..7235a1ea59 100644 --- a/examples/nbody/nbody.cpp +++ b/examples/nbody/nbody.cpp @@ -12,6 +12,11 @@ #include #include +#if __has_include() +# include +# define HAVE_XSIMD +#endif + // needs -fno-math-errno, so std::sqrt() can be vectorized // for multithreading, specify thread affinity (GNU OpenMP): // e.g. for a 32 core CPU with SMT/hyperthreading: GOMP_CPU_AFFINITY='0-30:2,1-31:2' llama-nbody @@ -219,13 +224,13 @@ namespace usellama namespace manualAoS { - struct Vec + struct Simd { FP x; FP y; FP z; - auto operator*=(FP s) -> Vec& + auto operator*=(FP s) -> Simd& { x *= s; y *= s; @@ -233,7 +238,7 @@ namespace manualAoS return *this; } - auto operator*=(Vec v) -> Vec& + auto operator*=(Simd v) -> Simd& { x *= v.x; y *= v.y; @@ -241,7 +246,7 @@ namespace manualAoS return *this; } - auto operator+=(Vec v) -> Vec& + auto operator+=(Simd v) -> Simd& { x += v.x; y += v.y; @@ -249,7 +254,7 @@ namespace manualAoS return *this; } - auto operator-=(Vec v) -> Vec& + auto operator-=(Simd v) -> Simd& { x -= v.x; y -= v.y; @@ -257,24 +262,24 @@ namespace manualAoS return *this; } - friend auto operator-(Vec a, Vec b) -> Vec + friend auto operator-(Simd a, Simd b) -> Simd { return a -= b; } - friend auto operator*(Vec a, FP s) -> Vec + friend auto operator*(Simd a, FP s) -> Simd { return a *= s; } - friend auto operator*(Vec a, Vec b) -> Vec + friend auto operator*(Simd a, Simd b) -> Simd { return a *= b; } }; - using Pos = Vec; - using Vel = Vec; + using Pos = Simd; + using Vel = Simd; struct Particle { @@ -646,7 +651,7 @@ namespace manualAoSoA } } // namespace manualAoSoA -#ifdef __AVX2__ +#if defined(__AVX2__) && defined(__FMA__) # include namespace manualAoSoA_manualAVX @@ -862,74 +867,72 @@ namespace manualAoSoA_manualAVX } // namespace manualAoSoA_manualAVX #endif -#if __has_include() -# include - -namespace manualAoSoA_Vc +#ifdef HAVE_XSIMD +namespace manualAoSoA_SIMD { - template - struct alignas(32) ParticleBlock + template + struct alignas(64) ParticleBlock { struct { - Vec x; - Vec y; - Vec z; + Simd x; + Simd y; + Simd z; } pos, vel; - Vec mass; + Simd mass; }; - template + template inline void pPInteraction( - Vec piposx, - Vec piposy, - Vec piposz, - Vec& pivelx, - Vec& pively, - Vec& pivelz, - Vec pjposx, - Vec pjposy, - Vec pjposz, - Vec pjmass) + Simd piposx, + Simd piposy, + Simd piposz, + Simd& pivelx, + Simd& pively, + Simd& pivelz, + Simd pjposx, + Simd pjposy, + Simd pjposz, + Simd pjmass) { - const Vec xdistance = piposx - pjposx; - const Vec ydistance = piposy - pjposy; - const Vec zdistance = piposz - pjposz; - const Vec xdistanceSqr = xdistance * xdistance; - const Vec ydistanceSqr = ydistance * ydistance; - const Vec zdistanceSqr = zdistance * zdistance; - const Vec distSqr = EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr; - const Vec distSixth = distSqr * distSqr * distSqr; - const Vec invDistCube = [&] + const Simd xdistance = piposx - pjposx; + const Simd ydistance = piposy - pjposy; + const Simd zdistance = piposz - pjposz; + const Simd xdistanceSqr = xdistance * xdistance; + const Simd ydistanceSqr = ydistance * ydistance; + const Simd zdistanceSqr = zdistance * zdistance; + const Simd distSqr = EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr; + const Simd distSixth = distSqr * distSqr * distSqr; + const Simd invDistCube = [&] { if constexpr(ALLOW_RSQRT) { - const Vec r = Vc::rsqrt(distSixth); + const Simd r = xsimd::rsqrt(distSixth); if constexpr(NEWTON_RAPHSON_AFTER_RSQRT) { // from: http://stackoverflow.com/q/14752399/556899 - const Vec three = 3.0f; - const Vec half = 0.5f; - const Vec muls = distSixth * r * r; + const Simd three = 3.0f; + const Simd half = 0.5f; + const Simd muls = distSixth * r * r; return (half * r) * (three - muls); } else return r; } else - return 1.0f / Vc::sqrt(distSixth); + return 1.0f / xsimd::sqrt(distSixth); }(); - const Vec sts = pjmass * invDistCube * TIMESTEP; + const Simd sts = pjmass * invDistCube * TIMESTEP; pivelx = xdistanceSqr * sts + pivelx; pively = ydistanceSqr * sts + pively; pivelz = zdistanceSqr * sts + pivelz; } - template - void update8(ParticleBlock* particles, int threads) + template + void update8(ParticleBlock* particles, int threads) { - constexpr auto LANES = Vec::size(); + constexpr auto LANES = Simd::size; constexpr auto BLOCKS = PROBLEM_SIZE / LANES; # pragma omp parallel for schedule(static) num_threads(threads) @@ -937,21 +940,21 @@ namespace manualAoSoA_Vc { auto& blockI = particles[bi]; // std::for_each(ex, particles, particles + BLOCKS, [&](ParticleBlock& blockI) { - const Vec piposx = blockI.pos.x; - const Vec piposy = blockI.pos.y; - const Vec piposz = blockI.pos.z; - Vec pivelx = blockI.vel.x; - Vec pively = blockI.vel.y; - Vec pivelz = blockI.vel.z; + const Simd piposx = blockI.pos.x; + const Simd piposy = blockI.pos.y; + const Simd piposz = blockI.pos.z; + Simd pivelx = blockI.vel.x; + Simd pively = blockI.vel.y; + Simd pivelz = blockI.vel.z; for(std::size_t bj = 0; bj < BLOCKS; bj++) for(std::size_t j = 0; j < LANES; j++) { const auto& blockJ = particles[bj]; - const Vec pjposx = blockJ.pos.x[j]; - const Vec pjposy = blockJ.pos.y[j]; - const Vec pjposz = blockJ.pos.z[j]; - const Vec pjmass = blockJ.mass[j]; + const Simd pjposx = blockJ.pos.x.get(j); + const Simd pjposy = blockJ.pos.y.get(j); + const Simd pjposz = blockJ.pos.z.get(j); + const Simd pjmass = blockJ.mass.get(j); pPInteraction(piposx, piposy, piposz, pivelx, pively, pivelz, pjposx, pjposy, pjposz, pjmass); } @@ -963,10 +966,10 @@ namespace manualAoSoA_Vc } } - template - void update8Tiled(ParticleBlock* particles, int threads) + template + void update8Tiled(ParticleBlock* particles, int threads) { - constexpr auto LANES = Vec::size(); + constexpr auto LANES = Simd::size; constexpr auto BLOCKS = PROBLEM_SIZE / LANES; constexpr auto blocksPerTile = 128; // L1D_SIZE / sizeof(ParticleBlock); @@ -976,21 +979,21 @@ namespace manualAoSoA_Vc for(std::size_t bi = 0; bi < blocksPerTile; bi++) { auto& blockI = particles[bi]; - const Vec piposx = blockI.pos.x; - const Vec piposy = blockI.pos.y; - const Vec piposz = blockI.pos.z; - Vec pivelx = blockI.vel.x; - Vec pively = blockI.vel.y; - Vec pivelz = blockI.vel.z; + const Simd piposx = blockI.pos.x; + const Simd piposy = blockI.pos.y; + const Simd piposz = blockI.pos.z; + Simd pivelx = blockI.vel.x; + Simd pively = blockI.vel.y; + Simd pivelz = blockI.vel.z; for(std::size_t tj = 0; tj < BLOCKS / blocksPerTile; tj++) for(std::size_t bj = 0; bj < blocksPerTile; bj++) for(std::size_t j = 0; j < LANES; j++) { const auto& blockJ = particles[bj]; - const Vec pjposx = blockJ.pos.x[j]; - const Vec pjposy = blockJ.pos.y[j]; - const Vec pjposz = blockJ.pos.z[j]; - const Vec pjmass = blockJ.mass[j]; + const Simd pjposx = blockJ.pos.x.get(j); + const Simd pjposy = blockJ.pos.y.get(j); + const Simd pjposz = blockJ.pos.z.get(j); + const Simd pjmass = blockJ.mass.get(j); pPInteraction( piposx, @@ -1011,10 +1014,23 @@ namespace manualAoSoA_Vc } } - template - void update1(ParticleBlock* particles, int threads) + // xsimd renamed hadd() to reduce_all() on master (after release 8.1), so we need to SFINAE for the name + template + auto sum(Simd v) -> decltype(reduce_add(v)) + { + return reduce_add(v); + } + + template + auto sum(Simd v) -> decltype(hadd(v)) + { + return hadd(v); + } + + template + void update1(ParticleBlock* particles, int threads) { - constexpr auto LANES = Vec::size(); + constexpr auto LANES = Simd::size; constexpr auto BLOCKS = PROBLEM_SIZE / LANES; # pragma omp parallel for schedule(static) num_threads(threads) @@ -1024,12 +1040,12 @@ namespace manualAoSoA_Vc // std::for_each(ex, particles, particles + BLOCKS, [&](ParticleBlock& blockI) { for(std::size_t i = 0; i < LANES; i++) { - const Vec piposx = static_cast(blockI.pos.x[i]); - const Vec piposy = static_cast(blockI.pos.y[i]); - const Vec piposz = static_cast(blockI.pos.z[i]); - Vec pivelx = static_cast(blockI.vel.x[i]); - Vec pively = static_cast(blockI.vel.y[i]); - Vec pivelz = static_cast(blockI.vel.z[i]); + const Simd piposx = blockI.pos.x.get(i); + const Simd piposy = blockI.pos.y.get(i); + const Simd piposz = blockI.pos.z.get(i); + Simd pivelx = blockI.vel.x.get(i); + Simd pively = blockI.vel.y.get(i); + Simd pivelz = blockI.vel.z.get(i); for(std::size_t bj = 0; bj < BLOCKS; bj++) { @@ -1047,18 +1063,18 @@ namespace manualAoSoA_Vc blockJ.mass); } - blockI.vel.x[i] = pivelx.sum(); - blockI.vel.y[i] = pively.sum(); - blockI.vel.z[i] = pivelz.sum(); + reinterpret_cast(&blockI.vel.x)[i] = sum(pivelx); + reinterpret_cast(&blockI.vel.y)[i] = sum(pively); + reinterpret_cast(&blockI.vel.z)[i] = sum(pivelz); } // }); } } - template - void move(ParticleBlock* particles, int threads) + template + void move(ParticleBlock* particles, int threads) { - constexpr auto BLOCKS = PROBLEM_SIZE / Vec::size(); + constexpr auto BLOCKS = PROBLEM_SIZE / Simd::size; # pragma omp parallel for schedule(static) num_threads(threads) for(std::ptrdiff_t bi = 0; bi < BLOCKS; bi++) @@ -1072,10 +1088,10 @@ namespace manualAoSoA_Vc } } - template + template auto main(std::ostream& plotFile, int threads, bool useUpdate1, bool tiled = false) -> int { - auto title = "AoSoA" + std::to_string(Vec::size()) + " Vc" + (useUpdate1 ? " w1r8" : " w8r1"); // NOLINT + auto title = "AoSoA" + std::to_string(Simd::size) + " SIMD" + (useUpdate1 ? " w1r8" : " w8r1"); // NOLINT if(tiled) title += " tiled"; if(threads > 1) @@ -1084,9 +1100,9 @@ namespace manualAoSoA_Vc std::cout << title << '\n'; Stopwatch watch; - static_assert(PROBLEM_SIZE % Vec::size() == 0); - constexpr auto BLOCKS = PROBLEM_SIZE / Vec::size(); - std::vector> particles(BLOCKS); + static_assert(PROBLEM_SIZE % Simd::size == 0); + constexpr auto BLOCKS = PROBLEM_SIZE / Simd::size; + std::vector> particles(BLOCKS); watch.printAndReset("alloc"); std::default_random_engine engine; @@ -1094,15 +1110,15 @@ namespace manualAoSoA_Vc for(std::size_t bi = 0; bi < BLOCKS; ++bi) { auto& block = particles[bi]; - for(std::size_t i = 0; i < Vec::size(); ++i) + for(std::size_t i = 0; i < Simd::size; ++i) { - block.pos.x[i] = dist(engine); - block.pos.y[i] = dist(engine); - block.pos.z[i] = dist(engine); - block.vel.x[i] = dist(engine) / FP{10}; - block.vel.y[i] = dist(engine) / FP{10}; - block.vel.z[i] = dist(engine) / FP{10}; - block.mass[i] = dist(engine) / FP{100}; + reinterpret_cast(&block.pos.x)[i] = dist(engine); + reinterpret_cast(&block.pos.y)[i] = dist(engine); + reinterpret_cast(&block.pos.z)[i] = dist(engine); + reinterpret_cast(&block.vel.x)[i] = dist(engine) / FP{10}; + reinterpret_cast(&block.vel.y)[i] = dist(engine) / FP{10}; + reinterpret_cast(&block.vel.z)[i] = dist(engine) / FP{10}; + reinterpret_cast(&block.mass)[i] = dist(engine) / FP{100}; } } watch.printAndReset("init"); @@ -1131,42 +1147,49 @@ namespace manualAoSoA_Vc return 0; } -} // namespace manualAoSoA_Vc +} // namespace manualAoSoA_SIMD -namespace manualAoS_Vc +namespace manualAoS_SIMD { using manualAoS::Particle; - using manualAoSoA_Vc::pPInteraction; + using manualAoSoA_SIMD::pPInteraction; + + struct GenStrides + { + static constexpr auto get(int i, int) -> int + { + return i * static_cast(sizeof(Particle) / sizeof(FP)); + } + }; - template - const auto particleGatherScatterStrides = typename Vec::IndexType{Vc::IndexesFromZero} * - typename Vec::IndexType::value_type{sizeof(Particle) / sizeof(FP)}; + template + const auto particleStrides = static_cast>( + xsimd::make_batch_constant, GenStrides>()); - template + template void update(Particle* particles, int threads) { - constexpr auto LANES = Vec::size(); - const auto strides = particleGatherScatterStrides; + constexpr auto LANES = Simd::size; + const auto strides = particleStrides; # pragma omp parallel for schedule(static) num_threads(threads) for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += LANES) { - // gather auto& pi = particles[i]; - const Vec piposx = Vec(&pi.pos.x, strides); - const Vec piposy = Vec(&pi.pos.y, strides); - const Vec piposz = Vec(&pi.pos.z, strides); - Vec pivelx = Vec(&pi.vel.x, strides); - Vec pively = Vec(&pi.vel.y, strides); - Vec pivelz = Vec(&pi.vel.z, strides); + const Simd piposx = Simd::gather(&pi.pos.x, strides); + const Simd piposy = Simd::gather(&pi.pos.y, strides); + const Simd piposz = Simd::gather(&pi.pos.z, strides); + Simd pivelx = Simd::gather(&pi.vel.x, strides); + Simd pively = Simd::gather(&pi.vel.y, strides); + Simd pivelz = Simd::gather(&pi.vel.z, strides); for(std::size_t j = 0; j < PROBLEM_SIZE; j++) { const auto& pj = particles[j]; - const Vec pjposx = pj.pos.x; - const Vec pjposy = pj.pos.y; - const Vec pjposz = pj.pos.z; - const Vec pjmass = pj.mass; + const Simd pjposx(pj.pos.x); + const Simd pjposy(pj.pos.y); + const Simd pjposz(pj.pos.z); + const Simd pjmass(pj.mass); pPInteraction(piposx, piposy, piposz, pivelx, pively, pivelz, pjposx, pjposy, pjposz, pjmass); } @@ -1178,26 +1201,29 @@ namespace manualAoS_Vc } } - template + template void move(Particle* particles, int threads) { - constexpr auto LANES = Vec::size(); - const auto strides = particleGatherScatterStrides; + constexpr auto LANES = Simd::size; + const auto strides = particleStrides; # pragma omp parallel for schedule(static) num_threads(threads) for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += LANES) { auto& pi = particles[i]; - (Vec(&pi.pos.x, strides) + Vec(&pi.vel.x, strides) * TIMESTEP).scatter(&pi.pos.x, strides); - (Vec(&pi.pos.y, strides) + Vec(&pi.vel.y, strides) * TIMESTEP).scatter(&pi.pos.y, strides); - (Vec(&pi.pos.z, strides) + Vec(&pi.vel.z, strides) * TIMESTEP).scatter(&pi.pos.z, strides); + (Simd::gather(&pi.pos.x, strides) + Simd::gather(&pi.vel.x, strides) * TIMESTEP) + .scatter(&pi.pos.x, strides); + (Simd::gather(&pi.pos.y, strides) + Simd::gather(&pi.vel.y, strides) * TIMESTEP) + .scatter(&pi.pos.y, strides); + (Simd::gather(&pi.pos.z, strides) + Simd::gather(&pi.vel.z, strides) * TIMESTEP) + .scatter(&pi.pos.z, strides); } } - template + template auto main(std::ostream& plotFile, int threads) -> int { - auto title = "AoS Vc"s; + auto title = "AoS SIMD"s; if(threads > 1) title += " " + std::to_string(threads) + "Thrds"; std::cout << title << '\n'; @@ -1226,23 +1252,23 @@ namespace manualAoS_Vc { if constexpr(RUN_UPATE) { - update(particles.data(), threads); + update(particles.data(), threads); sumUpdate += watch.printAndReset("update", '\t'); } - move(particles.data(), threads); + move(particles.data(), threads); sumMove += watch.printAndReset("move"); } plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n'; return 0; } -} // namespace manualAoS_Vc +} // namespace manualAoS_SIMD namespace manualSoA_Vc { - using manualAoSoA_Vc::pPInteraction; + using manualAoSoA_SIMD::pPInteraction; - template + template void update( const FP* posx, const FP* posy, @@ -1254,14 +1280,14 @@ namespace manualSoA_Vc int threads) { # pragma omp parallel for schedule(static) num_threads(threads) - for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Vec::size()) + for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Simd::size) { - const Vec piposx = Vec(posx + i); - const Vec piposy = Vec(posy + i); - const Vec piposz = Vec(posz + i); - Vec pivelx = Vec(velx + i); - Vec pively = Vec(vely + i); - Vec pivelz = Vec(velz + i); + const Simd piposx = Simd::load_aligned(posx + i); + const Simd piposy = Simd::load_aligned(posy + i); + const Simd piposz = Simd::load_aligned(posz + i); + Simd pivelx = Simd::load_aligned(velx + i); + Simd pively = Simd::load_aligned(vely + i); + Simd pivelz = Simd::load_aligned(velz + i); for(std::size_t j = 0; j < PROBLEM_SIZE; ++j) pPInteraction( piposx, @@ -1270,32 +1296,32 @@ namespace manualSoA_Vc pivelx, pively, pivelz, - Vec(posx[j]), - Vec(posy[j]), - Vec(posz[j]), - Vec(mass[j])); - pivelx.store(velx + i); - pively.store(vely + i); - pivelz.store(velz + i); + Simd(posx[j]), + Simd(posy[j]), + Simd(posz[j]), + Simd(mass[j])); + pivelx.store_aligned(velx + i); + pively.store_aligned(vely + i); + pivelz.store_aligned(velz + i); } } - template + template void move(FP* posx, FP* posy, FP* posz, const FP* velx, const FP* vely, const FP* velz, int threads) { # pragma omp parallel for schedule(static) num_threads(threads) - for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Vec::size()) + for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i += Simd::size) { - (Vec(posx + i) + Vec(velx + i) * TIMESTEP).store(posx + i); - (Vec(posy + i) + Vec(vely + i) * TIMESTEP).store(posy + i); - (Vec(posz + i) + Vec(velz + i) * TIMESTEP).store(posz + i); + (Simd::load_aligned(posx + i) + Simd::load_aligned(velx + i) * TIMESTEP).store_aligned(posx + i); + (Simd::load_aligned(posy + i) + Simd::load_aligned(vely + i) * TIMESTEP).store_aligned(posy + i); + (Simd::load_aligned(posz + i) + Simd::load_aligned(velz + i) * TIMESTEP).store_aligned(posz + i); } } - template + template auto main(std::ostream& plotFile, int threads) -> int { - auto title = "SoA Vc"s; + auto title = "SoA SIMD"s; if(threads > 1) title += " " + std::to_string(threads) + "Thrds"; std::cout << title << '\n'; @@ -1331,7 +1357,7 @@ namespace manualSoA_Vc { if constexpr(RUN_UPATE) { - update( + update( posx.data(), posy.data(), posz.data(), @@ -1342,7 +1368,7 @@ namespace manualSoA_Vc threads); sumUpdate += watch.printAndReset("update", '\t'); } - move(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), threads); + move(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), threads); sumMove += watch.printAndReset("move"); } plotFile << std::quoted(title) << "\t" << sumUpdate / STEPS << '\t' << sumMove / STEPS << '\n'; @@ -1355,10 +1381,10 @@ namespace manualSoA_Vc auto main() -> int try { -#if __has_include() - using vec = Vc::Vector; - // using vec = Vc::SimdArray; - constexpr auto SIMDLanes = vec::size(); +#ifdef HAVE_XSIMD + using Simd = xsimd::batch; + // using Simd = xsimd::make_sized_batch_t; + constexpr auto SIMDLanes = Simd::size; #else constexpr auto SIMDLanes = 1; #endif @@ -1428,12 +1454,12 @@ set y2tics auto // r += manualAoSoA::main(plotFile, tiled); r += manualAoSoA::main(plotFile, false); }); -#ifdef __AVX2__ +#if defined(__AVX2__) && defined(__FMA__) // for (auto useUpdate1 : {false, true}) // r += manualAoSoA_manualAVX::main(plotFile, useUpdate1); r += manualAoSoA_manualAVX::main(plotFile, false); #endif -#if __has_include() +#ifdef HAVE_XSIMD for(int threads = 1; threads <= std::thread::hardware_concurrency(); threads *= 2) { // for (auto useUpdate1 : {false, true}) @@ -1441,19 +1467,23 @@ set y2tics auto // { // if (useUpdate1 && tiled) // continue; - // r += manualAoSoA_Vc::main(plotFile, threads, useUpdate1, tiled); + // r += manualAoSoA_SIMD::main(plotFile, threads, useUpdate1, tiled); // } - r += manualAoSoA_Vc::main(plotFile, threads, false, false); + r += manualAoSoA_SIMD::main(plotFile, threads, false, false); } for(int threads = 1; threads <= std::thread::hardware_concurrency(); threads *= 2) { - // mp_for_each>( - // [&](auto lanes) { r += manualAoS_Vc::main>(plotFile, threads); - // }); - r += manualAoS_Vc::main(plotFile, threads); + // mp_for_each>( + // [&](auto lanes) + // { + // using Simd = xsimd::make_sized_batch_t; + // if constexpr(!std::is_void_v) + // r += manualAoS_SIMD::main(plotFile, threads); + // }); + r += manualAoS_SIMD::main(plotFile, threads); } for(int threads = 1; threads <= std::thread::hardware_concurrency(); threads *= 2) - r += manualSoA_Vc::main(plotFile, threads); + r += manualSoA_Vc::main(plotFile, threads); #endif plotFile << R"(EOD