Replace Vc by xsimd
Fixes: #520
bernhardmgruber committed Aug 22, 2022
1 parent 4ce5481 commit 75d21ad
Showing 9 changed files with 292 additions and 246 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/ci.yaml
@@ -13,7 +13,7 @@ env:
THREADS: 4
CONFIG: RelWithDebInfo
ALPAKA_BRANCH: 0.9.0
VCPKG_INSTALL_LIST: "fmt vc tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container"
VCPKG_INSTALL: "vcpkg install fmt tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container; vcpkg install --head xsimd"

jobs:
clang-format:
@@ -40,7 +40,7 @@ jobs:
sudo apt install clang-14 libomp-14-dev clang-tidy-14
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: install alpaka
run: |
git clone https://github.com/alpaka-group/alpaka.git
@@ -76,7 +76,7 @@ jobs:
sudo apt install lcov
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: cmake
run: |
mkdir build
@@ -204,7 +204,7 @@ jobs:
run: |
# vcpkg fails to build with Intel compilers
if [ ${{ matrix.install_oneapi }} ]; then unset CXX; fi
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: download CUDA
if: matrix.cuda_url
run: |
@@ -270,7 +270,7 @@ jobs:
- uses: actions/checkout@v2
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: install alpaka
run: |
git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git
@@ -311,7 +311,7 @@ jobs:
brew install libomp
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: install alpaka
run: |
git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git
2 changes: 1 addition & 1 deletion docs/pages/install.rst
@@ -23,7 +23,7 @@ Dependencies
- Boost 1.70.0 or higher
- libfmt 6.2.1 or higher (optional) for building the tests and examples and LLAMA supporting to dump mappings as SVG/HTML
- `Alpaka <https://github.com/alpaka-group/alpaka>`_ (optional) for building some examples
- `Vc <https://github.com/VcDevel/Vc>`_ (optional) for building some examples
- `xsimd <https://github.com/xtensor-stack/xsimd>`_ (optional) for building some examples


Build tests and examples
1 change: 0 additions & 1 deletion examples/alpaka/daxpy/CMakeLists.txt
@@ -1,7 +1,6 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-alpaka-daxpy CXX)

#find_package(Vc QUIET)
find_package(OpenMP REQUIRED)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
8 changes: 6 additions & 2 deletions examples/alpaka/nbody/CMakeLists.txt
@@ -1,14 +1,18 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-alpaka-nbody CXX)

find_package(Vc REQUIRED)
find_package(xsimd QUIET)
if (NOT xsimd_FOUND)
message(WARNING "xsimd not found: alpaka n-body example is disabled")
return()
endif()
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
find_package(alpaka 0.9.0 REQUIRED)
alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/Stopwatch.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka Vc::Vc)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka xsimd)

if (MSVC)
target_compile_options(${PROJECT_NAME} PRIVATE /arch:AVX2 /fp:fast)
92 changes: 48 additions & 44 deletions examples/alpaka/nbody/nbody.cpp
@@ -10,6 +10,11 @@
#include <string>
#include <utility>

#if __has_include(<xsimd/xsimd.hpp>)
# include <xsimd/xsimd.hpp>
# define HAVE_XSIMD
#endif
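
The guard above defines HAVE_XSIMD only when the xsimd header is actually on the include path, so the SIMD code paths can be compiled conditionally. A minimal sketch of the same pattern, assuming xsimd 8 or newer (where reduce_add is available); the sumOfBatch helper is illustrative and not part of this commit:

```cpp
#if __has_include(<xsimd/xsimd.hpp>)
#    include <xsimd/xsimd.hpp>
#    define HAVE_XSIMD
#endif

// Hypothetical helper: horizontal sum of one SIMD batch, with a scalar fallback.
// The caller must provide at least xsimd::batch<float>::size readable floats.
float sumOfBatch(const float* p)
{
#ifdef HAVE_XSIMD
    const auto v = xsimd::batch<float>::load_unaligned(p); // read one batch of floats
    return xsimd::reduce_add(v);                           // horizontal add across lanes
#else
    return *p; // scalar fallback when xsimd is unavailable
#endif
}
```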

using FP = float;

constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
@@ -19,13 +24,11 @@ constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate

#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
# error Cannot enable CUDA together with other backends, because nvcc cannot parse the Vc header, sorry :/
# error Cannot enable CUDA together with other backends
# endif
// nvcc fails to compile Vc headers even if nothing is used from there, so we need to conditionally include it
# include <Vc/Vc>
constexpr auto DESIRED_ELEMENTS_PER_THREAD = Vc::float_v::size();
constexpr auto DESIRED_ELEMENTS_PER_THREAD = xsimd::batch<float>::size;
constexpr auto THREADS_PER_BLOCK = 1;
constexpr auto AOSOA_LANES = Vc::float_v::size(); // vectors
constexpr auto AOSOA_LANES = xsimd::batch<float>::size; // vectors
#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
constexpr auto DESIRED_ELEMENTS_PER_THREAD = 1;
constexpr auto THREADS_PER_BLOCK = 256;
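
Unlike Vc::float_v::size(), which is a function call, xsimd::batch<float>::size is a constexpr static data member, so it can be used directly (no parentheses) wherever a compile-time constant is needed, as in the constants above. A small sketch, assuming an x86 or ARM host whose default batch is at least four lanes wide:

```cpp
#include <xsimd/xsimd.hpp>

#include <array>
#include <cstddef>

// The lane count is a compile-time constant and can size arrays,
// non-type template arguments, and the thread/block constants above.
constexpr std::size_t lanes = xsimd::batch<float>::size;
static_assert(lanes >= 4, "expected at least an SSE/NEON-wide float batch");

std::array<float, lanes> oneRegisterOfFloats{};
```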
@@ -77,20 +80,20 @@ namespace stdext
}
} // namespace stdext

// FIXME: this makes assumptions that there are always float_v::size() many values blocked in the LLAMA view
template<typename Vec>
// FIXME: this makes assumptions that there are always Simd::size many values blocked in the LLAMA view
template<typename Simd>
LLAMA_FN_HOST_ACC_INLINE auto load(const FP& src)
{
if constexpr(std::is_same_v<Vec, FP>)
if constexpr(std::is_same_v<Simd, FP>)
return src;
else
return Vec(&src);
return Simd::load_unaligned(&src);
}

template<typename Vec>
template<typename Simd>
LLAMA_FN_HOST_ACC_INLINE auto broadcast(const FP& src)
{
return Vec(src);
return Simd(src);
}

template<typename Vec>
@@ -99,48 +102,46 @@ LLAMA_FN_HOST_ACC_INLINE auto store(FP& dst, Vec v)
if constexpr(std::is_same_v<Vec, FP>)
dst = v;
else
v.store(&dst);
v.store_unaligned(&dst);
}
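
With the Vc types replaced, load/broadcast/store now wrap xsimd's unaligned memory operations while still degrading to plain scalars when the SIMD type is just FP. A usage sketch of the underlying xsimd calls (the axpy helper, buffer sizes, and values are illustrative assumptions, not part of this commit):

```cpp
#include <xsimd/xsimd.hpp>

#include <vector>

// Compute y = a * x + y for one SIMD batch starting at x[0] and y[0].
void axpyOneBatch(float a, const float* x, float* y)
{
    using Simd = xsimd::batch<float>;
    const Simd xs = Simd::load_unaligned(x); // gather one batch from x
    const Simd ys = Simd::load_unaligned(y); // and one from y
    const Simd result = Simd(a) * xs + ys;   // Simd(a) broadcasts the scalar
    result.store_unaligned(y);               // write the batch back
}

int main()
{
    std::vector<float> x(xsimd::batch<float>::size, 1.0f);
    std::vector<float> y(x.size(), 2.0f);
    axpyOneBatch(3.0f, x.data(), y.data()); // y becomes 5.0f in every lane
}
```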

template<int Elems>
struct VecType
struct SimdType
{
// TODO(bgruber): we need a vector type that also works on GPUs
#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
using type = Vc::SimdArray<FP, Elems>;
#endif
using type = xsimd::make_sized_batch_t<FP, Elems>;
static_assert(!std::is_void_v<type>, "xsimd does not have a SIMD type for this element count");
};

template<>
struct VecType<1>
struct SimdType<1>
{
using type = FP;
};
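
xsimd::make_sized_batch_t<FP, Elems> picks the batch type with exactly Elems lanes, or yields void when the target architecture offers no such width, which is what the static_assert above guards against. A sketch, assuming a host with SSE or NEON so that a 4-wide float batch exists:

```cpp
#include <xsimd/xsimd.hpp>

#include <type_traits>

using Batch4 = xsimd::make_sized_batch_t<float, 4>; // concrete 4-lane batch, or void
static_assert(!std::is_void_v<Batch4>, "no 4-wide float batch on this target");
static_assert(Batch4::size == 4, "the lane count is carried by the type");
```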

template<int Elems, typename ViewParticleI, typename ParticleRefJ>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, ParticleRefJ pj)
{
using Vec = typename VecType<Elems>::type;
using Simd = typename SimdType<Elems>::type;

using std::sqrt;
using stdext::rsqrt;
#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
using Vc::rsqrt;
using Vc::sqrt;
#endif

const Vec xdistance = load<Vec>(pi(tag::Pos{}, tag::X{})) - broadcast<Vec>(pj(tag::Pos{}, tag::X{}));
const Vec ydistance = load<Vec>(pi(tag::Pos{}, tag::Y{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Y{}));
const Vec zdistance = load<Vec>(pi(tag::Pos{}, tag::Z{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Z{}));
const Vec xdistanceSqr = xdistance * xdistance;
const Vec ydistanceSqr = ydistance * ydistance;
const Vec zdistanceSqr = zdistance * zdistance;
const Vec distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
const Vec distSixth = distSqr * distSqr * distSqr;
const Vec invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
const Vec sts = broadcast<Vec>(pj(tag::Mass())) * invDistCube * TIMESTEP;
store<Vec>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::X{})));
store<Vec>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Y{})));
store<Vec>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Z{})));
using xsimd::rsqrt;
using xsimd::sqrt;

const Simd xdistance = load<Simd>(pi(tag::Pos{}, tag::X{})) - broadcast<Simd>(pj(tag::Pos{}, tag::X{}));
const Simd ydistance = load<Simd>(pi(tag::Pos{}, tag::Y{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Y{}));
const Simd zdistance = load<Simd>(pi(tag::Pos{}, tag::Z{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Z{}));
const Simd xdistanceSqr = xdistance * xdistance;
const Simd ydistanceSqr = ydistance * ydistance;
const Simd zdistanceSqr = zdistance * zdistance;
const Simd distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
const Simd distSixth = distSqr * distSqr * distSqr;
const Simd invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
const Simd sts = broadcast<Simd>(pj(tag::Mass())) * invDistCube * TIMESTEP;
store<Simd>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::X{})));
store<Simd>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Y{})));
store<Simd>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Z{})));
}
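
xsimd provides batch overloads of sqrt and rsqrt, pulled in here via using-declarations so the same expression also compiles for scalar FP. rsqrt is an approximate reciprocal square root, which is why ALLOW_RSQRT trades accuracy for speed. A small standalone sketch (the tolerance is an illustrative assumption):

```cpp
#include <xsimd/xsimd.hpp>

#include <cassert>
#include <cmath>
#include <cstddef>

int main()
{
    using Simd = xsimd::batch<float>;
    const Simd x(4.0f);                            // broadcast 4.0 into every lane
    const Simd approx = xsimd::rsqrt(x);           // fast approximate 1/sqrt(x)
    const Simd exact = Simd(1.0f) / xsimd::sqrt(x); // reference value

    float a[Simd::size];
    float e[Simd::size];
    approx.store_unaligned(a);
    exact.store_unaligned(e);
    for(std::size_t i = 0; i < Simd::size; ++i)
        assert(std::abs(a[i] - e[i]) < 1e-2f); // rsqrt is only an approximation
}
```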

template<int ProblemSize, int Elems, int BlockSize, Mapping MappingSM>
@@ -218,16 +219,19 @@ struct MoveKernel
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
const auto i = ti * Elems;

using Vec = typename VecType<Elems>::type;
store<Vec>(
using Simd = typename SimdType<Elems>::type;
store<Simd>(
particles(i)(tag::Pos{}, tag::X{}),
load<Vec>(particles(i)(tag::Pos{}, tag::X{})) + load<Vec>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
store<Vec>(
load<Simd>(particles(i)(tag::Pos{}, tag::X{}))
+ load<Simd>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
store<Simd>(
particles(i)(tag::Pos{}, tag::Y{}),
load<Vec>(particles(i)(tag::Pos{}, tag::Y{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
store<Vec>(
load<Simd>(particles(i)(tag::Pos{}, tag::Y{}))
+ load<Simd>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
store<Simd>(
particles(i)(tag::Pos{}, tag::Z{}),
load<Vec>(particles(i)(tag::Pos{}, tag::Z{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
load<Simd>(particles(i)(tag::Pos{}, tag::Z{}))
+ load<Simd>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
}
};

@@ -337,7 +341,7 @@ try

std::ofstream plotFile{"nbody.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
std::ofstream{"nbody.sh"} << R"(#!/usr/bin/gnuplot -p
plotFile << R"(#!/usr/bin/gnuplot -p
set style data histograms
set style fill solid
set xtics rotate by 45 right
8 changes: 5 additions & 3 deletions examples/heatequation/CMakeLists.txt
@@ -1,16 +1,18 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-heatequation CXX)

find_package(Vc QUIET)
find_package(xsimd QUIET)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()

add_executable(${PROJECT_NAME} heatequation.cpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
if (Vc_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
if (xsimd_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
else()
message(WARNING "xsimd not found: heatequation built without explicit SIMD support")
endif()

if (MSVC)
37 changes: 21 additions & 16 deletions examples/heatequation/heatequation.cpp
@@ -21,6 +21,11 @@
#include <llama/llama.hpp>
#include <utility>

#if __has_include(<xsimd/xsimd.hpp>)
# include <xsimd/xsimd.hpp>
# define HAVE_XSIMD
#endif

template<typename View>
inline void kernel(uint32_t idx, const View& uCurr, View& uNext, double r)
{
@@ -36,21 +41,21 @@ void update_scalar(const View& uCurr, View& uNext, uint32_t extent, double dx, d
kernel(i, uCurr, uNext, r);
}

#if __has_include(<Vc/Vc>)
# include <Vc/Vc>
#ifdef HAVE_XSIMD

template<typename View>
inline void kernel_vec(uint32_t baseIdx, const View& uCurr, View& uNext, double r)
{
const auto next = Vc::double_v{&uCurr[baseIdx]} * (1.0 - 2.0 * r) + Vc::double_v{&uCurr[baseIdx - 1]} * r
+ Vc::double_v{&uCurr[baseIdx + 1]} * r;
next.store(&uNext[baseIdx]);
using Simd = xsimd::batch<double>;
const auto next = Simd::load_unaligned(&uCurr[baseIdx]) * (1.0 - 2.0 * r)
+ Simd::load_unaligned(&uCurr[baseIdx - 1]) * r + Simd::load_unaligned(&uCurr[baseIdx + 1]) * r;
next.store_unaligned(&uNext[baseIdx]);
}
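
kernel_vec loads its three stencil inputs with load_unaligned because the left and right neighbours sit at baseIdx - 1 and baseIdx + 1: at most one of the three overlapping loads can be aligned, so unaligned loads are the natural choice. A standalone sketch of the same overlapping-load stencil (grid size, diffusion factor, and values are illustrative assumptions):

```cpp
#include <xsimd/xsimd.hpp>

#include <cstdio>
#include <vector>

int main()
{
    using Simd = xsimd::batch<double>;
    constexpr double r = 0.25; // illustrative diffusion factor dt / (dx * dx)
    std::vector<double> uCurr(2 + Simd::size, 1.0);
    std::vector<double> uNext(uCurr.size(), 0.0);

    // One SIMD-wide stencil update starting at index 1; the three loads overlap.
    const auto centre = Simd::load_unaligned(&uCurr[1]);
    const auto left = Simd::load_unaligned(&uCurr[0]);
    const auto right = Simd::load_unaligned(&uCurr[2]);
    const auto next = centre * (1.0 - 2.0 * r) + left * r + right * r;
    next.store_unaligned(&uNext[1]);

    std::printf("uNext[1] = %f\n", uNext[1]); // stays 1.0 for a flat profile
}
```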

template<typename View>
void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
void update_SIMD(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
constexpr auto L = xsimd::batch<double>::size;
const auto r = dt / (dx * dx);

const auto blocks = (extent + L - 1) / L;
@@ -67,9 +72,9 @@ void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, doubl
}

template<typename View>
void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
void update_SIMD_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
constexpr auto L = xsimd::batch<double>::size;
const auto r = dt / (dx * dx);

for(auto i = 1; i < L; i++)
@@ -84,9 +89,9 @@ void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx,
}

template<typename View>
void update_Vc_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
void update_SIMD_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
constexpr auto L = xsimd::batch<double>::size;
const auto r = dt / (dx * dx);

kernel(1, uCurr, uNext, r);
@@ -168,11 +173,11 @@ try
std::cout << "Incorrect! error = " << maxError << " (the grid resolution may be too low)\n";
};

run("update_scalar ", [](auto&... args) { update_scalar(args...); });
#if __has_include(<Vc/Vc>)
run("update_Vc ", [](auto&... args) { update_Vc(args...); });
run("update_Vc_peel ", [](auto&... args) { update_Vc_peel(args...); });
run("update_Vc_peel_unal_st", [](auto&... args) { update_Vc_peel_unal_st(args...); });
// run("update_scalar ", [](auto&... args) { update_scalar(args...); });
#ifdef HAVE_XSIMD
run("update_SIMD ", [](auto&... args) { update_SIMD(args...); });
run("update_SIMD_peel ", [](auto&... args) { update_SIMD_peel(args...); });
run("update_SIMD_peel_unal_st", [](auto&... args) { update_SIMD_peel_unal_st(args...); });
#endif

return 0;
8 changes: 5 additions & 3 deletions examples/nbody/CMakeLists.txt
@@ -3,16 +3,18 @@ project(llama-nbody CXX)

find_package(fmt CONFIG REQUIRED)
find_package(OpenMP REQUIRED)
find_package(Vc QUIET)
find_package(xsimd QUIET)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()

add_executable(${PROJECT_NAME} nbody.cpp ../common/Stopwatch.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt OpenMP::OpenMP_CXX)
if (Vc_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
if (xsimd_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
else()
message(WARNING "xsimd not found: n-body built without explicit SIMD support")
endif()

if (MSVC)