Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace Vc by xsimd #557

Merged
merged 1 commit into from
Aug 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ env:
THREADS: 4
CONFIG: RelWithDebInfo
ALPAKA_BRANCH: 0.9.0
VCPKG_INSTALL_LIST: "fmt vc tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container"
VCPKG_INSTALL: "vcpkg install fmt tinyobjloader boost-mp11 boost-atomic boost-smart-ptr boost-functional boost-container; vcpkg install --head xsimd"

jobs:
clang-format:
Expand All @@ -40,7 +40,7 @@ jobs:
sudo apt install clang-14 libomp-14-dev clang-tidy-14
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: install alpaka
run: |
git clone https://github.com/alpaka-group/alpaka.git
Expand Down Expand Up @@ -76,7 +76,7 @@ jobs:
sudo apt install lcov
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: cmake
run: |
mkdir build
Expand Down Expand Up @@ -204,7 +204,7 @@ jobs:
run: |
# vcpkg fails to build with Intel compilers
if [ ${{ matrix.install_oneapi }} ]; then unset CXX; fi
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: download CUDA
if: matrix.cuda_url
run: |
Expand Down Expand Up @@ -270,7 +270,7 @@ jobs:
- uses: actions/checkout@v2
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: install alpaka
run: |
git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git
Expand Down Expand Up @@ -311,7 +311,7 @@ jobs:
brew install libomp
- name: vcpkg install dependencies
run: |
vcpkg install $VCPKG_INSTALL_LIST
eval $VCPKG_INSTALL
- name: install alpaka
run: |
git clone --branch $ALPAKA_BRANCH --depth 1 https://github.com/alpaka-group/alpaka.git
Expand Down
2 changes: 1 addition & 1 deletion docs/pages/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Dependencies
- Boost 1.70.0 or higher
- libfmt 6.2.1 or higher (optional) for building the tests and examples and LLAMA supporting to dump mappings as SVG/HTML
- `Alpaka <https://github.com/alpaka-group/alpaka>`_ (optional) for building some examples
- `Vc <https://github.com/VcDevel/Vc>`_ (optional) for building some examples
- `xsimd <https://github.com/xtensor-stack/xsimd>`_ (optional) for building some examples


Build tests and examples
Expand Down
1 change: 0 additions & 1 deletion examples/alpaka/daxpy/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-alpaka-daxpy CXX)

#find_package(Vc QUIET)
find_package(OpenMP REQUIRED)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
Expand Down
8 changes: 6 additions & 2 deletions examples/alpaka/nbody/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-alpaka-nbody CXX)

find_package(Vc REQUIRED)
find_package(xsimd QUIET)
if (NOT xsimd_FOUND)
message(WARNING "xsimd not found: alpaka n-body example is disabled")
return()
endif()
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
find_package(alpaka 0.9.0 REQUIRED)
alpaka_add_executable(${PROJECT_NAME} nbody.cpp ../../common/Stopwatch.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka Vc::Vc)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka xsimd)

if (MSVC)
target_compile_options(${PROJECT_NAME} PRIVATE /arch:AVX2 /fp:fast)
Expand Down
92 changes: 48 additions & 44 deletions examples/alpaka/nbody/nbody.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
#include <string>
#include <utility>

#if __has_include(<xsimd/xsimd.hpp>)
# include <xsimd/xsimd.hpp>
# define HAVE_XSIMD
#endif

using FP = float;

constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
Expand All @@ -19,13 +24,11 @@ constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate

#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED)
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
# error Cannot enable CUDA together with other backends, because nvcc cannot parse the Vc header, sorry :/
# error Cannot enable CUDA together with other backends
# endif
// nvcc fails to compile Vc headers even if nothing is used from there, so we need to conditionally include it
# include <Vc/Vc>
constexpr auto DESIRED_ELEMENTS_PER_THREAD = Vc::float_v::size();
constexpr auto DESIRED_ELEMENTS_PER_THREAD = xsimd::batch<float>::size;
constexpr auto THREADS_PER_BLOCK = 1;
constexpr auto AOSOA_LANES = Vc::float_v::size(); // vectors
constexpr auto AOSOA_LANES = xsimd::batch<float>::size; // vectors
#elif defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
constexpr auto DESIRED_ELEMENTS_PER_THREAD = 1;
constexpr auto THREADS_PER_BLOCK = 256;
Expand Down Expand Up @@ -77,20 +80,20 @@ namespace stdext
}
} // namespace stdext

// FIXME: this makes assumptions that there are always float_v::size() many values blocked in the LLAMA view
template<typename Vec>
// FIXME: this makes assumptions that there are always Simd::size many values blocked in the LLAMA view
template<typename Simd>
LLAMA_FN_HOST_ACC_INLINE auto load(const FP& src)
{
if constexpr(std::is_same_v<Vec, FP>)
if constexpr(std::is_same_v<Simd, FP>)
return src;
else
return Vec(&src);
return Simd::load_unaligned(&src);
}

template<typename Vec>
template<typename Simd>
LLAMA_FN_HOST_ACC_INLINE auto broadcast(const FP& src)
{
return Vec(src);
return Simd(src);
}

template<typename Vec>
Expand All @@ -99,48 +102,46 @@ LLAMA_FN_HOST_ACC_INLINE auto store(FP& dst, Vec v)
if constexpr(std::is_same_v<Vec, FP>)
dst = v;
else
v.store(&dst);
v.store_unaligned(&dst);
}

template<int Elems>
struct VecType
struct SimdType
{
// TODO(bgruber): we need a vector type that also works on GPUs
#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
using type = Vc::SimdArray<FP, Elems>;
#endif
using type = xsimd::make_sized_batch_t<FP, Elems>;
static_assert(!std::is_void_v<type>, "xsimd does not have a SIMD type for this element count");
};

template<>
struct VecType<1>
struct SimdType<1>
{
using type = FP;
};

template<int Elems, typename ViewParticleI, typename ParticleRefJ>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, ParticleRefJ pj)
{
using Vec = typename VecType<Elems>::type;
using Simd = typename SimdType<Elems>::type;

using std::sqrt;
using stdext::rsqrt;
#ifndef ALPAKA_ACC_GPU_CUDA_ENABLED
using Vc::rsqrt;
using Vc::sqrt;
#endif

const Vec xdistance = load<Vec>(pi(tag::Pos{}, tag::X{})) - broadcast<Vec>(pj(tag::Pos{}, tag::X{}));
const Vec ydistance = load<Vec>(pi(tag::Pos{}, tag::Y{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Y{}));
const Vec zdistance = load<Vec>(pi(tag::Pos{}, tag::Z{})) - broadcast<Vec>(pj(tag::Pos{}, tag::Z{}));
const Vec xdistanceSqr = xdistance * xdistance;
const Vec ydistanceSqr = ydistance * ydistance;
const Vec zdistanceSqr = zdistance * zdistance;
const Vec distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
const Vec distSixth = distSqr * distSqr * distSqr;
const Vec invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
const Vec sts = broadcast<Vec>(pj(tag::Mass())) * invDistCube * TIMESTEP;
store<Vec>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::X{})));
store<Vec>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Y{})));
store<Vec>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Z{})));
using xsimd::rsqrt;
using xsimd::sqrt;

const Simd xdistance = load<Simd>(pi(tag::Pos{}, tag::X{})) - broadcast<Simd>(pj(tag::Pos{}, tag::X{}));
const Simd ydistance = load<Simd>(pi(tag::Pos{}, tag::Y{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Y{}));
const Simd zdistance = load<Simd>(pi(tag::Pos{}, tag::Z{})) - broadcast<Simd>(pj(tag::Pos{}, tag::Z{}));
const Simd xdistanceSqr = xdistance * xdistance;
const Simd ydistanceSqr = ydistance * ydistance;
const Simd zdistanceSqr = zdistance * zdistance;
const Simd distSqr = +EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
const Simd distSixth = distSqr * distSqr * distSqr;
const Simd invDistCube = ALLOW_RSQRT ? rsqrt(distSixth) : (1.0f / sqrt(distSixth));
const Simd sts = broadcast<Simd>(pj(tag::Mass())) * invDistCube * TIMESTEP;
store<Simd>(pi(tag::Vel{}, tag::X{}), xdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::X{})));
store<Simd>(pi(tag::Vel{}, tag::Y{}), ydistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Y{})));
store<Simd>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Simd>(pi(tag::Vel{}, tag::Z{})));
}

template<int ProblemSize, int Elems, int BlockSize, Mapping MappingSM>
Expand Down Expand Up @@ -218,16 +219,19 @@ struct MoveKernel
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
const auto i = ti * Elems;

using Vec = typename VecType<Elems>::type;
store<Vec>(
using Simd = typename SimdType<Elems>::type;
store<Simd>(
particles(i)(tag::Pos{}, tag::X{}),
load<Vec>(particles(i)(tag::Pos{}, tag::X{})) + load<Vec>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
store<Vec>(
load<Simd>(particles(i)(tag::Pos{}, tag::X{}))
+ load<Simd>(particles(i)(tag::Vel{}, tag::X{})) * TIMESTEP);
store<Simd>(
particles(i)(tag::Pos{}, tag::Y{}),
load<Vec>(particles(i)(tag::Pos{}, tag::Y{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
store<Vec>(
load<Simd>(particles(i)(tag::Pos{}, tag::Y{}))
+ load<Simd>(particles(i)(tag::Vel{}, tag::Y{})) * TIMESTEP);
store<Simd>(
particles(i)(tag::Pos{}, tag::Z{}),
load<Vec>(particles(i)(tag::Pos{}, tag::Z{})) + load<Vec>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
load<Simd>(particles(i)(tag::Pos{}, tag::Z{}))
+ load<Simd>(particles(i)(tag::Vel{}, tag::Z{})) * TIMESTEP);
}
};

Expand Down Expand Up @@ -337,7 +341,7 @@ try

std::ofstream plotFile{"nbody.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
std::ofstream{"nbody.sh"} << R"(#!/usr/bin/gnuplot -p
plotFile << R"(#!/usr/bin/gnuplot -p
set style data histograms
set style fill solid
set xtics rotate by 45 right
Expand Down
8 changes: 5 additions & 3 deletions examples/heatequation/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-heatequation CXX)

find_package(Vc QUIET)
find_package(xsimd QUIET)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()

add_executable(${PROJECT_NAME} heatequation.cpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
if (Vc_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
if (xsimd_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
else()
message(WARNING "xsimd not found: heatequation built without explicit SIMD support")
endif()

if (MSVC)
Expand Down
37 changes: 21 additions & 16 deletions examples/heatequation/heatequation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
#include <llama/llama.hpp>
#include <utility>

#if __has_include(<xsimd/xsimd.hpp>)
# include <xsimd/xsimd.hpp>
# define HAVE_XSIMD
#endif

template<typename View>
inline void kernel(uint32_t idx, const View& uCurr, View& uNext, double r)
{
Expand All @@ -36,21 +41,21 @@ void update_scalar(const View& uCurr, View& uNext, uint32_t extent, double dx, d
kernel(i, uCurr, uNext, r);
}

#if __has_include(<Vc/Vc>)
# include <Vc/Vc>
#ifdef HAVE_XSIMD

template<typename View>
inline void kernel_vec(uint32_t baseIdx, const View& uCurr, View& uNext, double r)
{
const auto next = Vc::double_v{&uCurr[baseIdx]} * (1.0 - 2.0 * r) + Vc::double_v{&uCurr[baseIdx - 1]} * r
+ Vc::double_v{&uCurr[baseIdx + 1]} * r;
next.store(&uNext[baseIdx]);
using Simd = xsimd::batch<double>;
const auto next = Simd::load_unaligned(&uCurr[baseIdx]) * (1.0 - 2.0 * r)
+ Simd::load_unaligned(&uCurr[baseIdx - 1]) * r + Simd::load_unaligned(&uCurr[baseIdx + 1]) * r;
next.store_unaligned(&uNext[baseIdx]);
}

template<typename View>
void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
void update_SIMD(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
constexpr auto L = xsimd::batch<double>::size;
const auto r = dt / (dx * dx);

const auto blocks = (extent + L - 1) / L;
Expand All @@ -67,9 +72,9 @@ void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, doubl
}

template<typename View>
void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
void update_SIMD_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
constexpr auto L = xsimd::batch<double>::size;
const auto r = dt / (dx * dx);

for(auto i = 1; i < L; i++)
Expand All @@ -84,9 +89,9 @@ void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx,
}

template<typename View>
void update_Vc_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
void update_SIMD_peel_unal_st(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
constexpr auto L = xsimd::batch<double>::size;
const auto r = dt / (dx * dx);

kernel(1, uCurr, uNext, r);
Expand Down Expand Up @@ -168,11 +173,11 @@ try
std::cout << "Incorrect! error = " << maxError << " (the grid resolution may be too low)\n";
};

run("update_scalar ", [](auto&... args) { update_scalar(args...); });
#if __has_include(<Vc/Vc>)
run("update_Vc ", [](auto&... args) { update_Vc(args...); });
run("update_Vc_peel ", [](auto&... args) { update_Vc_peel(args...); });
run("update_Vc_peel_unal_st", [](auto&... args) { update_Vc_peel_unal_st(args...); });
// run("update_scalar ", [](auto&... args) { update_scalar(args...); });
#ifdef HAVE_XSIMD
run("update_SIMD ", [](auto&... args) { update_SIMD(args...); });
run("update_SIMD_peel ", [](auto&... args) { update_SIMD_peel(args...); });
run("update_SIMD_peel_unal_st", [](auto&... args) { update_SIMD_peel_unal_st(args...); });
#endif

return 0;
Expand Down
8 changes: 5 additions & 3 deletions examples/nbody/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@ project(llama-nbody CXX)

find_package(fmt CONFIG REQUIRED)
find_package(OpenMP REQUIRED)
find_package(Vc QUIET)
find_package(xsimd QUIET)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()

add_executable(${PROJECT_NAME} nbody.cpp ../common/Stopwatch.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama fmt::fmt OpenMP::OpenMP_CXX)
if (Vc_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
if (xsimd_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE xsimd)
else()
message(WARNING "xsimd not found: n-body built without explicit SIMD support")
endif()

if (MSVC)
Expand Down
Loading