Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement SIMD load/store between different record dimensions #819

Merged
merged 1 commit into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions include/llama/RecordRef.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,11 +328,11 @@ namespace llama
inline constexpr auto isDirectListInitializableFromTuple<T, Tuple<Args...>>
= isDirectListInitializable<T, Args...>;

template<typename T, typename Simd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, RecordCoord rc);
template<typename T, typename Simd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC);

template<typename Simd, typename T, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, T&& dstRef, RecordCoord rc);
template<typename Simd, typename TFwd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC);
} // namespace internal

/// Record reference type returned by \ref View after resolving an array dimensions coordinate or partially
Expand Down Expand Up @@ -756,16 +756,18 @@ namespace llama
// FIXME(bgruber): the SIMD load/store functions need to navigate back from a record ref to the contained view
// to find subsequent elements. This is not a great design for now and the SIMD load/store functions should
// probably take iterators to records.
template<typename T, typename Simd, typename RecordCoord>
template<typename T, typename Simd, typename SrcRC, typename DstRC>
friend LLAMA_FN_HOST_ACC_INLINE void internal::loadSimdFromField(
const T& srcRef,
Simd& dstSimd,
RecordCoord rc);
template<typename Simd, typename T, typename RecordCoord>
SrcRC srcRC,
DstRC dstRC);
template<typename Simd, typename TFwd, typename SrcRC, typename DstRC>
friend LLAMA_FN_HOST_ACC_INLINE void internal::storeSimdToField(
const Simd& srcSimd,
T&& dstRef,
RecordCoord rc);
TFwd&& dstRef,
SrcRC srcRC,
DstRC dstRC);
};

// swap for heterogeneous RecordRef
Expand Down
99 changes: 73 additions & 26 deletions include/llama/Simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,28 +203,27 @@ namespace llama
return indices;
}();

template<typename T, typename Simd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, RecordCoord rc)
template<typename T, typename Simd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void loadSimdFromField(const T& srcRef, Simd& dstSimd, SrcRC srcRC, DstRC dstRC)
{
using RecordDim = typename T::AccessibleRecordDim;
using FieldType = GetType<RecordDim, decltype(rc)>;
using ElementSimd = std::decay_t<decltype(dstSimd(rc))>;
using FieldType = GetType<typename T::AccessibleRecordDim, SrcRC>;
using ElementSimd = std::decay_t<decltype(dstSimd(dstRC))>;
using Traits = SimdTraits<ElementSimd>;

auto loadElementWise = [&]
{
auto b = ArrayIndexIterator{srcRef.view.extents(), srcRef.arrayIndex()};
for(std::size_t i = 0; i < Traits::lanes; i++)
reinterpret_cast<FieldType*>(&dstSimd(rc))[i]
= srcRef.view(*b++)(cat(typename T::BoundRecordCoord{}, rc));
reinterpret_cast<FieldType*>(&dstSimd(dstRC))[i]
= srcRef.view(*b++)(cat(typename T::BoundRecordCoord{}, srcRC));
};

// TODO(bgruber): can we generalize the logic whether we can load a dstSimd from that mapping?
using Mapping = typename T::View::Mapping;
if constexpr(mapping::isSoA<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
dstSimd(rc) = Traits::loadUnaligned(&srcRef(rc));
dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else if constexpr(mapping::isAoSoA<typename T::View::Mapping>)
Expand All @@ -234,7 +233,7 @@ namespace llama
&& T::View::Mapping::lanes >= Traits::lanes)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
dstSimd(rc) = Traits::loadUnaligned(&srcRef(rc));
dstSimd(dstRC) = Traits::loadUnaligned(&srcRef(srcRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
Expand All @@ -243,20 +242,19 @@ namespace llama
else if constexpr(mapping::isAoS<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
dstSimd(rc) = Traits::gather(&srcRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
dstSimd(dstRC) = Traits::gather(&srcRef(srcRC), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
loadElementWise();
}

template<typename Simd, typename TFwd, typename RecordCoord>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, RecordCoord rc)
template<typename Simd, typename TFwd, typename SrcRC, typename DstRC>
LLAMA_FN_HOST_ACC_INLINE void storeSimdToField(const Simd& srcSimd, TFwd&& dstRef, SrcRC srcRC, DstRC dstRC)
{
using T = std::remove_reference_t<TFwd>;
using RecordDim = typename T::AccessibleRecordDim;
using FieldType = GetType<RecordDim, decltype(rc)>;
using ElementSimd = std::decay_t<decltype(srcSimd(rc))>;
using FieldType = GetType<typename T::AccessibleRecordDim, DstRC>;
using ElementSimd = std::decay_t<decltype(srcSimd(srcRC))>;
using Traits = SimdTraits<ElementSimd>;

auto storeElementWise = [&]
Expand All @@ -265,16 +263,16 @@ namespace llama
// direction should we collect SIMD values?
auto b = ArrayIndexIterator{dstRef.view.extents(), dstRef.arrayIndex()};
for(std::size_t i = 0; i < Traits::lanes; i++)
dstRef.view (*b++)(cat(typename T::BoundRecordCoord{}, rc))
= reinterpret_cast<const FieldType*>(&srcSimd(rc))[i];
dstRef.view (*b++)(cat(typename T::BoundRecordCoord{}, dstRC))
= reinterpret_cast<const FieldType*>(&srcSimd(srcRC))[i];
};

// TODO(bgruber): can we generalize the logic whether we can store a srcSimd to that mapping?
using Mapping = typename std::remove_reference_t<T>::View::Mapping;
if constexpr(mapping::isSoA<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
Traits::storeUnaligned(srcSimd(rc), &dstRef(rc));
Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else if constexpr(mapping::isAoSoA<typename T::View::Mapping>)
Expand All @@ -284,7 +282,7 @@ namespace llama
&& T::View::Mapping::lanes >= Traits::lanes)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
Traits::storeUnaligned(srcSimd(rc), &dstRef(rc));
Traits::storeUnaligned(srcSimd(srcRC), &dstRef(dstRC));
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
Expand All @@ -293,7 +291,7 @@ namespace llama
else if constexpr(mapping::isAoS<Mapping>)
{
LLAMA_BEGIN_SUPPRESS_HOST_DEVICE_WARNING
Traits::scatter(srcSimd(rc), &dstRef(rc), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
Traits::scatter(srcSimd(srcRC), &dstRef(dstRC), aosStridedIndices<Mapping, FieldType, Traits::lanes>);
LLAMA_END_SUPPRESS_HOST_DEVICE_WARNING
}
else
Expand All @@ -315,8 +313,32 @@ namespace llama
if constexpr(simdLanes<Simd> == simdLanes<T>) // fast path mainly for scalar SimdN<T, 1, ...>
dstSimd = srcRef;
else
forEachLeafCoord<typename Simd::AccessibleRecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE { internal::loadSimdFromField(srcRef, dstSimd, rc); });
{
using SrcARD = typename T::AccessibleRecordDim;
using DstArd = typename Simd::AccessibleRecordDim;
if constexpr(std::is_same_v<SrcARD, DstArd>)
{
forEachLeafCoord<SrcARD>([&](auto rc) LLAMA_LAMBDA_INLINE
{ internal::loadSimdFromField(srcRef, dstSimd, rc, rc); });
}
else
{
forEachLeafCoord<SrcARD>(
[&](auto srcRC) LLAMA_LAMBDA_INLINE
{
using SrcInnerCoord = decltype(srcRC);
forEachLeafCoord<DstArd>(
[&](auto dstRC) LLAMA_LAMBDA_INLINE
{
using DstInnerCoord = decltype(dstRC);
if constexpr(hasSameTags<SrcARD, SrcInnerCoord, DstArd, DstInnerCoord>)
{
internal::loadSimdFromField(srcRef, dstSimd, srcRC, dstRC);
}
});
});
}
}
}
// unstructured dstSimd and reference type
else if constexpr(!isRecordRef<Simd> && !isRecordRef<T>)
Expand All @@ -337,17 +359,42 @@ namespace llama
/// SIMD vector will be stored for each of the fields. The number of elements stored per SIMD vector depends on the
/// SIMD width of the vector. Simd is allowed to have different vector lengths per element.
LLAMA_EXPORT
template<typename Simd, typename T>
LLAMA_FN_HOST_ACC_INLINE void storeSimd(const Simd& srcSimd, T&& dstRef)
template<typename Simd, typename TFwd>
LLAMA_FN_HOST_ACC_INLINE void storeSimd(const Simd& srcSimd, TFwd&& dstRef)
{
using T = std::decay_t<TFwd>;
// structured Simd type and record reference
if constexpr(isRecordRef<Simd> && isRecordRef<T>)
{
if constexpr(simdLanes<Simd> == simdLanes<T>) // fast path mainly for scalar SimdN<T, 1, ...>
dstRef = srcSimd;
else
forEachLeafCoord<typename T::AccessibleRecordDim>(
[&](auto rc) LLAMA_LAMBDA_INLINE { internal::storeSimdToField(srcSimd, dstRef, rc); });
{
using SrcARD = typename Simd::AccessibleRecordDim;
using DstArd = typename T::AccessibleRecordDim;
if constexpr(std::is_same_v<SrcARD, DstArd>)
{
forEachLeafCoord<SrcARD>([&](auto rc) LLAMA_LAMBDA_INLINE
{ internal::storeSimdToField(srcSimd, dstRef, rc, rc); });
}
else
{
forEachLeafCoord<SrcARD>(
[&](auto srcRC) LLAMA_LAMBDA_INLINE
{
using SrcInnerCoord = decltype(srcRC);
forEachLeafCoord<DstArd>(
[&](auto dstRC) LLAMA_LAMBDA_INLINE
{
using DstInnerCoord = decltype(dstRC);
if constexpr(hasSameTags<SrcARD, SrcInnerCoord, DstArd, DstInnerCoord>)
{
internal::storeSimdToField(srcSimd, dstRef, srcRC, dstRC);
}
});
});
}
}
}
// unstructured srcSimd and reference type
else if constexpr(!isRecordRef<Simd> && !isRecordRef<T>)
Expand Down
131 changes: 123 additions & 8 deletions tests/simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,13 @@
CHECK(p(tag::Flags{}, llama::RecordCoord<1>{}) == 8);
CHECK(p(tag::Flags{}, llama::RecordCoord<2>{}) == 9);
CHECK(p(tag::Flags{}, llama::RecordCoord<3>{}) == 10);

llama::SimdN<Vec3D, 1, stdx::fixed_size_simd> v;
llama::loadSimd(view(0)(tag::Pos{}), v);

CHECK(v(tag::X{}) == 0);
CHECK(v(tag::Y{}) == 1);
CHECK(v(tag::Z{}) == 2);
}

TEMPLATE_TEST_CASE(
Expand Down Expand Up @@ -292,6 +299,74 @@
CHECK(
SimdRange{p(tag::Flags{}, llama::RecordCoord<3>{})}
== SimdRange{stdx::fixed_size_simd<std::uint8_t, 4>{[](auto ic) -> std::uint8_t { return 10 + ic * 11; }}});

llama::SimdN<Vec3D, 4, stdx::fixed_size_simd> v;
llama::loadSimd(view(0)(tag::Pos{}), v);

CHECK(SimdRange{v(tag::X{})} == SimdRange{stdx::fixed_size_simd<double, 4>{[](auto ic) {
return 0.0 + ic * 11.0;
}}});
CHECK(SimdRange{v(tag::Y{})} == SimdRange{stdx::fixed_size_simd<double, 4>{[](auto ic) {
return 1.0 + ic * 11.0;
}}});
CHECK(SimdRange{v(tag::Z{})} == SimdRange{stdx::fixed_size_simd<double, 4>{[](auto ic) {
return 2.0 + ic * 11.0;
}}});
}

// Record dimensions for the heterogeneous load/store test below.
// Vec2I has two int fields tagged X and Y; Vec1I has a single int field tagged Y,
// so the two records share only the Y tag.
using Vec2I = llama::Record<llama::Field<tag::X, int>, llama::Field<tag::Y, int>>;
using Vec1I = llama::Record<llama::Field<tag::Y, int>>;

TEMPLATE_TEST_CASE(
"simd.heterogeneousLoadStore.stdsimd",
"",
llama::mapping::BindAoS<>,
llama::mapping::BindSoA<>,
llama::mapping::BindAoSoA<2>,
llama::mapping::BindAoSoA<32>)
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto mapping = typename TestType::template fn<ArrayExtents, Vec2I>(ArrayExtents{2});
auto view = llama::allocViewUninitialized(mapping);

Check warning on line 330 in tests/simd.cpp

View check run for this annotation

Codecov / codecov/patch

tests/simd.cpp#L330

Added line #L330 was not covered by tests
iotaFillView(view);

// Loads from and stores to the view using a SIMD record (Vec3I) that exposes X, Y and Z,
// i.e. one field more than the X/Y fields checked on the view.
// NOTE(review): Vec3I, SimdRange and `view` are declared outside this section — confirm their definitions.
SECTION("BiggerSimdRecord")
{
// Value-initialize the SIMD record so fields that are not loaded have a known state.
llama::SimdN<Vec3I, 2, stdx::fixed_size_simd> v{};
llama::loadSimd(view(0), v);
// Expected lanes for X are 0 and 2, for Y are 1 and 3.
// NOTE(review): these values depend on how iotaFillView populated the view — verify against that helper.
CHECK(SimdRange{v(tag::X{})} == SimdRange{stdx::fixed_size_simd<int, 2>{[](auto ic) {
return 0 + static_cast<int>(ic) * 2;
}}});
CHECK(SimdRange{v(tag::Y{})} == SimdRange{stdx::fixed_size_simd<int, 2>{[](auto ic) {
return 1 + static_cast<int>(ic) * 2;
}}});
// Z is expected to still equal a value-initialized SIMD vector after the load.
CHECK(SimdRange{v(tag::Z{})} == SimdRange{stdx::fixed_size_simd<int, 2>{}});

// Overwrite all three SIMD fields with distinct per-lane values, then store back to the view.
v(tag::X{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 100; }};
v(tag::Y{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 200; }};
v(tag::Z{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 300; }};
llama::storeSimd(v, view(0));
// Only X and Y are checked on the view; lane i is expected in view element i.
CHECK(view(0)(tag::X{}) == 100);
CHECK(view(1)(tag::X{}) == 101);
CHECK(view(0)(tag::Y{}) == 200);
CHECK(view(1)(tag::Y{}) == 201);
}

// Loads from and stores to the view using a SIMD record (Vec1I) that only has the Y field,
// and checks that the view's X field is left as it was.
// NOTE(review): `view` and SimdRange are declared outside this section — confirm their definitions.
SECTION("SmallerSimdRecord")
{
llama::SimdN<Vec1I, 2, stdx::fixed_size_simd> v{};
llama::loadSimd(view(0), v);
// Expected lanes for Y are 1 and 3.
// NOTE(review): these values depend on how iotaFillView populated the view — verify against that helper.
CHECK(SimdRange{v(tag::Y{})} == SimdRange{stdx::fixed_size_simd<int, 2>{[](auto ic) {
return 1 + static_cast<int>(ic) * 2;
}}});

// Replace Y with 1000 and 1001, then store back to the view.
v(tag::Y{}) = stdx::fixed_size_simd<int, 2>{[](auto ic) { return static_cast<int>(ic) + 1000; }};
llama::storeSimd(v, view(0));
// X is expected to keep its pre-store values (0 and 2); Y must hold the stored lanes.
CHECK(view(0)(tag::X{}) == 0);
CHECK(view(1)(tag::X{}) == 2);
CHECK(view(0)(tag::Y{}) == 1000);
CHECK(view(1)(tag::Y{}) == 1001);
}
}

TEST_CASE("simd.storeSimd.scalar")
Expand Down Expand Up @@ -352,6 +427,24 @@
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<1>{}) == 8);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<2>{}) == 9);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<3>{}) == 10);

llama::SimdN<Vec3D, 1, stdx::fixed_size_simd> v;
v(tag::X{}) = 100;
v(tag::Y{}) = 101;
v(tag::Z{}) = 102;
llama::storeSimd(v, view(0)(tag::Vel{}));

CHECK(view(0)(tag::Pos{}, tag::X{}) == 0);
CHECK(view(0)(tag::Pos{}, tag::Y{}) == 1);
CHECK(view(0)(tag::Pos{}, tag::Z{}) == 2);
CHECK(view(0)(tag::Mass{}) == 3);
CHECK(view(0)(tag::Vel{}, tag::X{}) == 100);
CHECK(view(0)(tag::Vel{}, tag::Y{}) == 101);
CHECK(view(0)(tag::Vel{}, tag::Z{}) == 102);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<0>{}) == 7);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<1>{}) == 8);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<2>{}) == 9);
CHECK(view(0)(tag::Flags{}, llama::RecordCoord<3>{}) == 10);
}

TEMPLATE_TEST_CASE(
Expand All @@ -367,14 +460,16 @@
auto view = llama::allocViewUninitialized(mapping);

llama::SimdN<ParticleSimd, 3, stdx::fixed_size_simd> p;
auto& x = p(tag::Pos{}, tag::X{});
auto& y = p(tag::Pos{}, tag::Y{});
auto& z = p(tag::Pos{}, tag::Z{});
auto& m = p(tag::Mass{});
x[0] = 1, x[1] = 2, x[2] = 3;
y[0] = 4, y[1] = 5, y[2] = 6;
z[0] = 7, z[1] = 8, z[2] = 9;
m[0] = 80, m[1] = 81, m[2] = 82;
{
auto& x = p(tag::Pos{}, tag::X{});
auto& y = p(tag::Pos{}, tag::Y{});
auto& z = p(tag::Pos{}, tag::Z{});
auto& m = p(tag::Mass{});
x[0] = 1, x[1] = 2, x[2] = 3;
y[0] = 4, y[1] = 5, y[2] = 6;
z[0] = 7, z[1] = 8, z[2] = 9;
m[0] = 80, m[1] = 81, m[2] = 82;
}
llama::storeSimd(p, view(0));

CHECK(view(0)(tag::Pos{}, tag::X{}) == 1);
Expand All @@ -393,6 +488,26 @@
CHECK(view(1)(tag::Mass{}) == 81);
CHECK(view(2)(tag::Mass{}) == 82);
CHECK(view(3)(tag::Mass{}) == 0);

llama::SimdN<Vec3D, 3, stdx::fixed_size_simd> v;
{
auto& x = v(tag::X{});
auto& y = v(tag::Y{});
auto& z = v(tag::Z{});
x[0] = 101, x[1] = 102, x[2] = 103;
y[0] = 104, y[1] = 105, y[2] = 106;
z[0] = 107, z[1] = 108, z[2] = 109;
}
llama::storeSimd(v, view(0)(tag::Pos{}));
CHECK(view(0)(tag::Pos{}, tag::X{}) == 101);
CHECK(view(1)(tag::Pos{}, tag::X{}) == 102);
CHECK(view(2)(tag::Pos{}, tag::X{}) == 103);
CHECK(view(0)(tag::Pos{}, tag::Y{}) == 104);
CHECK(view(1)(tag::Pos{}, tag::Y{}) == 105);
CHECK(view(2)(tag::Pos{}, tag::Y{}) == 106);
CHECK(view(0)(tag::Pos{}, tag::Z{}) == 107);
CHECK(view(1)(tag::Pos{}, tag::Z{}) == 108);
CHECK(view(2)(tag::Pos{}, tag::Z{}) == 109);
}

TEMPLATE_TEST_CASE(
Expand Down
Loading