Skip to content

Commit

Permalink
Merge pull request #134 from bernhardmgruber/nbody
Browse files Browse the repository at this point in the history
Small improvements for nbody example
  • Loading branch information
bernhardmgruber authored Dec 15, 2020
2 parents 18bcdfb + c468bfb commit 05a138a
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 26 deletions.
82 changes: 58 additions & 24 deletions examples/nbody/nbody.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <fstream>
#include <iomanip>
#include <iostream>
#include <llama/DumpMapping.hpp>
#include <llama/llama.hpp>
#include <random>
#include <string_view>
Expand All @@ -22,7 +23,10 @@ using FP = float;
constexpr auto PROBLEM_SIZE = 16 * 1024;
constexpr auto STEPS = 5;
constexpr auto TRACE = false;
constexpr auto DUMP_MAPPING = false;
constexpr auto ALLOW_RSQRT = true; // rsqrt can be way faster, but less accurate
constexpr auto NEWTON_RAPHSON_AFTER_RSQRT
= true; // generate a newton raphson refinement after explicit calls to rsqrt()
constexpr FP TIMESTEP = 0.0001f;
constexpr FP EPS2 = 0.01f;

Expand Down Expand Up @@ -138,6 +142,8 @@ namespace usellama
llama::mapping::SoA,
true>{arrayDomain};
}();
if constexpr (DUMP_MAPPING)
std::ofstream{title + ".svg"} << llama::toSvg(mapping);

auto tmapping = [&] {
if constexpr (TRACE)
Expand Down Expand Up @@ -712,8 +718,24 @@ namespace manualAoSoA_manualAVX
const __m256 distSqr
= _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(vEPS2, xdistanceSqr), ydistanceSqr), zdistanceSqr);
const __m256 distSixth = _mm256_mul_ps(_mm256_mul_ps(distSqr, distSqr), distSqr);
const __m256 invDistCube
= ALLOW_RSQRT ? _mm256_rsqrt_ps(distSixth) : _mm256_div_ps(_mm256_set1_ps(1.0f), _mm256_sqrt_ps(distSixth));
const __m256 invDistCube = [&] {
if constexpr (ALLOW_RSQRT)
{
const __m256 r = _mm256_rsqrt_ps(distSixth);
if constexpr (NEWTON_RAPHSON_AFTER_RSQRT)
{
// from: http://stackoverflow.com/q/14752399/556899
const __m256 three = _mm256_set1_ps(3.0f);
const __m256 half = _mm256_set1_ps(0.5f);
const __m256 muls = _mm256_mul_ps(_mm256_mul_ps(distSixth, r), r);
return _mm256_mul_ps(_mm256_mul_ps(half, r), _mm256_sub_ps(three, muls));
}
else
return r;
}
else
return _mm256_div_ps(_mm256_set1_ps(1.0f), _mm256_sqrt_ps(distSixth));
}();
const __m256 sts = _mm256_mul_ps(_mm256_mul_ps(pjmass, invDistCube), vTIMESTEP);
pivelx = _mm256_fmadd_ps(xdistanceSqr, sts, pivelx);
pively = _mm256_fmadd_ps(ydistanceSqr, sts, pively);
Expand All @@ -728,10 +750,10 @@ namespace manualAoSoA_manualAVX
for (std::size_t j = 0; j < LANES; j++)
{
const auto& blockJ = particles[bj];
const __m256 posxJ = _mm256_broadcast_ss(&blockJ.pos.x[j]);
const __m256 posyJ = _mm256_broadcast_ss(&blockJ.pos.y[j]);
const __m256 poszJ = _mm256_broadcast_ss(&blockJ.pos.z[j]);
const __m256 massJ = _mm256_broadcast_ss(&blockJ.mass[j]);
const __m256 pjposx = _mm256_broadcast_ss(&blockJ.pos.x[j]);
const __m256 pjposy = _mm256_broadcast_ss(&blockJ.pos.y[j]);
const __m256 pjposz = _mm256_broadcast_ss(&blockJ.pos.z[j]);
const __m256 pjmass = _mm256_broadcast_ss(&blockJ.mass[j]);

auto& blockI = particles[bi];
__m256 pivelx = _mm256_load_ps(blockI.vel.x);
Expand All @@ -744,10 +766,10 @@ namespace manualAoSoA_manualAVX
pivelx,
pively,
pivelz,
posxJ,
posyJ,
poszJ,
massJ);
pjposx,
pjposy,
pjposz,
pjmass);
_mm256_store_ps(blockI.vel.x, pivelx);
_mm256_store_ps(blockI.vel.y, pively);
_mm256_store_ps(blockI.vel.z, pivelz);
Expand Down Expand Up @@ -803,8 +825,8 @@ namespace manualAoSoA_manualAVX
void update1(ParticleBlock* particles)
{
for (std::size_t bi = 0; bi < BLOCKS; bi++)
for (std::size_t i = 0; i < LANES; i++)
for (std::size_t bj = 0; bj < BLOCKS; bj++)
for (std::size_t bj = 0; bj < BLOCKS; bj++)
for (std::size_t i = 0; i < LANES; i++)
{
auto& blockI = particles[bi];
const __m256 piposx = _mm256_broadcast_ss(&blockI.pos.x[i]);
Expand Down Expand Up @@ -985,7 +1007,24 @@ namespace manualAoSoA_Vc
const vec zdistanceSqr = zdistance * zdistance;
const vec distSqr = EPS2 + xdistanceSqr + ydistanceSqr + zdistanceSqr;
const vec distSixth = distSqr * distSqr * distSqr;
const vec invDistCube = ALLOW_RSQRT ? Vc::rsqrt(distSixth) : (1.0f / Vc::sqrt(distSixth));
const vec invDistCube = [&] {
if constexpr (ALLOW_RSQRT)
{
const vec r = Vc::rsqrt(distSixth);
if constexpr (NEWTON_RAPHSON_AFTER_RSQRT)
{
// from: http://stackoverflow.com/q/14752399/556899
const vec three = 3.0f;
const vec half = 0.5f;
const vec muls = distSixth * r * r;
return (half * r) * (three - muls);
}
else
return r;
}
else
return 1.0f / Vc::sqrt(distSixth);
}();
const vec sts = pjmass * invDistCube * TIMESTEP;
pivelx = xdistanceSqr * sts + pivelx;
pively = ydistanceSqr * sts + pively;
Expand All @@ -1004,22 +1043,17 @@ namespace manualAoSoA_Vc
for (std::size_t j = 0; j < LANES; j++)
{
const auto& blockJ = particles[bj];
const vec pjposx = blockJ.pos.x[j];
const vec pjposy = blockJ.pos.y[j];
const vec pjposz = blockJ.pos.z[j];
const vec pjmass = blockJ.mass[j];

pPInteraction(
blockI.pos.x,
blockI.pos.y,
blockI.pos.z,
blockI.vel.x,
blockI.vel.y,
blockI.vel.z,
pjposx,
pjposy,
pjposz,
pjmass);
blockJ.pos.x[j],
blockJ.pos.y[j],
blockJ.pos.z[j],
blockJ.mass[j]);
}
// });
}
Expand Down Expand Up @@ -1066,8 +1100,8 @@ namespace manualAoSoA_Vc
{
auto& blockI = particles[bi];
// std::for_each(ex, particles, particles + BLOCKS, [&](ParticleBlock& blockI) {
for (std::size_t i = 0; i < LANES; i++)
for (std::size_t bj = 0; bj < BLOCKS; bj++)
for (std::size_t bj = 0; bj < BLOCKS; bj++)
for (std::size_t i = 0; i < LANES; i++)
{
const auto& blockJ = particles[bj];
vec pivelx = (FP) blockI.vel.x[i];
Expand Down
13 changes: 11 additions & 2 deletions include/llama/DumpMapping.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ namespace llama
{
return mapping.template getBlobNrAndOffset<Coords...>(udCoord);
}

auto color(const std::vector<std::size_t>& ddIndices) -> std::size_t
{
auto c = (boost::hash_value(ddIndices) & 0xFFFFFF);
const auto channelSum = ((c & 0xFF0000) >> 4) + ((c & 0xFF00) >> 2) + c & 0xFF;
if (channelSum < 200)
c |= 0x404040; // ensure color per channel is at least 0x40.
return c;
}
} // namespace internal

/// Returns an SVG image visualizing the memory layout created by the given
Expand Down Expand Up @@ -151,7 +160,7 @@ namespace llama
const auto x = (info.nrAndOffset.offset % wrapByteCount) * byteSizeInPixel;
const auto y = (info.nrAndOffset.offset / wrapByteCount) * byteSizeInPixel + blobY;

const auto fill = boost::hash_value(info.ddIndices) & 0xFFFFFF;
const auto fill = internal::color(info.ddIndices);

const auto width = byteSizeInPixel * info.size;
svg += fmt::format(
Expand Down Expand Up @@ -300,7 +309,7 @@ namespace llama
)",
cssClass(internal::tagsAsStrings<DatumDomain>(coord)),
byteSizeInPixel * size,
boost::hash_value(internal::toVec(coord)) & 0xFFFFFF);
internal::color(internal::toVec(coord)));
});

svg += fmt::format(R"(</style>
Expand Down

0 comments on commit 05a138a

Please sign in to comment.