Skip to content

Commit

Permalink
Merge pull request #118 from bernhardmgruber/heat
Browse files Browse the repository at this point in the history
Improve heatequation demo and add Vc version
  • Loading branch information
bernhardmgruber authored Nov 5, 2020
2 parents c01befc + 0c8b0cf commit 424df94
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 48 deletions.
17 changes: 17 additions & 0 deletions examples/heatequation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,25 @@ project(llama-heatequation)
set (CMAKE_CXX_STANDARD 17)
set (CMAKE_CXX_STANDARD_REQUIRED on)

find_package(Vc QUIET)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()

add_executable(${PROJECT_NAME} heatequation.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
if (Vc_FOUND)
target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
endif()

if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
target_compile_options(${PROJECT_NAME} PRIVATE
-march=native
-ffast-math
)
elseif(MSVC)
target_compile_options(${PROJECT_NAME} PRIVATE
/arch:AVX2
/fp:fast
)
endif()
143 changes: 95 additions & 48 deletions examples/heatequation/heatequation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,65 @@
#include <llama/llama.hpp>
#include <utility>

using DatumDomain = double;
template <typename View>
inline void kernel(uint32_t idx, const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
const auto r = dt / (dx * dx);
if (idx > 0 && idx < extent - 1u)
uNext[idx] = uCurr[idx] * (1.0 - 2.0 * r) + uCurr[idx - 1] * r + uCurr[idx + 1] * r;
}

template <typename View>
void update_scalar(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
for (auto i = 0; i < extent; i++)
kernel(i, uCurr, uNext, extent, dx, dt);
}

struct HeatEquationKernel
#if __has_include(<Vc/Vc>)
# include <Vc/Vc>

template <typename View>
inline void kernel_vec(uint32_t blockIdx, const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
template <typename View>
void operator()(uint32_t idx, const View& uCurrBuf, View& uNextBuf, uint32_t extent, double dx, double dt) const
const auto r = dt / (dx * dx);

const auto baseIdx = static_cast<uint32_t>(blockIdx * Vc::double_v::size());
if (baseIdx > 0 && baseIdx + Vc::double_v::size() < extent)
{
const auto next = Vc::double_v{&uCurr[baseIdx]} * (1.0 - 2.0 * r) + Vc::double_v{&uCurr[baseIdx - 1]} * r
+ Vc::double_v{&uCurr[baseIdx + 1]} * r;
next.store(&uNext[baseIdx]);
}
else
{
const auto r = dt / (dx * dx);
if (idx > 0 && idx < extent - 1u)
uNextBuf[idx] = uCurrBuf[idx] * (1.0 - 2.0 * r) + uCurrBuf[idx - 1] * r + uCurrBuf[idx + 1] * r;
for (auto idx = baseIdx; idx <= baseIdx + Vc::double_v::size(); idx++)
if (idx > 0 && idx < extent - 1u)
uNext[idx] = uCurr[idx] * (1.0 - 2.0 * r) + uCurr[idx - 1] * r + uCurr[idx + 1] * r;
}
};
}

template <typename View>
void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
const auto blocks = (extent + L - 1) / L;
for (auto i = 0; i < blocks; i++)
kernel_vec(i, uCurr, uNext, extent, dx, dt);
}

template <typename View>
void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
{
constexpr auto L = Vc::double_v::size();
const auto blocks = (extent + L - 1) / L;
kernel_vec(0, uCurr, uNext, extent, dx, dt);
for (auto i = 1; i < blocks - 1; i++)
kernel_vec(i, uCurr, uNext, extent, dx, dt);
kernel_vec(blocks - 1, uCurr, uNext, extent, dx, dt);
}

#endif

// Exact solution to the test problem
// u_t(x, t) = u_xx(x, t), x in [0, 1], t in [0, T]
Expand All @@ -46,13 +93,13 @@ double exactSolution(double const x, double const t)

auto main() -> int
{
// Parameters (a user is supposed to change numNodesX, numTimeSteps)
const auto numNodesX = 10000;
const auto numTimeSteps = 200000;
// Parameters (a user is supposed to change extent, timeSteps)
const auto extent = 10000;
const auto timeSteps = 200000;
const auto tMax = 0.001;
// x in [0, 1], t in [0, tMax]
const auto dx = 1.0 / static_cast<double>(numNodesX - 1);
const auto dt = tMax / static_cast<double>(numTimeSteps - 1);
const auto dx = 1.0 / static_cast<double>(extent - 1);
const auto dt = tMax / static_cast<double>(timeSteps - 1);

const auto r = dt / (dx * dx);
if (r > 0.5)
Expand All @@ -61,46 +108,46 @@ auto main() -> int
return 1;
}

const auto mapping = llama::mapping::AoS{llama::ArrayDomain{numNodesX}, DatumDomain{}};
const auto mapping = llama::mapping::SoA{llama::ArrayDomain{extent}, double{}};
auto uNext = llama::allocView(mapping);
auto uCurr = llama::allocView(mapping);

// Apply initial conditions for the test problem
for (uint32_t i = 0; i < numNodesX; i++)
uCurr[i] = exactSolution(i * dx, 0.0);
auto run = [&](std::string_view updateName, auto update) {
// init
for (uint32_t i = 0; i < extent; i++)
uCurr[i] = exactSolution(i * dx, 0.0);

const auto start = std::chrono::high_resolution_clock::now();
HeatEquationKernel kernel;
for (int step = 0; step < numTimeSteps; step++)
{
for (auto i = 0; i < numNodesX; i++)
kernel(i, uCurr, uNext, numNodesX, dx, dt);
// run simulation
const auto start = std::chrono::high_resolution_clock::now();
for (int step = 0; step < timeSteps; step++)
{
update(uCurr, uNext, extent, dx, dt);
std::swap(uNext, uCurr);
}
const auto stop = std::chrono::high_resolution_clock::now();
std::cout << updateName << " took " << std::chrono::duration<double>(stop - start).count() << "s\t";

// We assume the boundary conditions are constant and so these values
// do not need to be updated.
std::swap(uNext, uCurr);
}
const auto end = std::chrono::high_resolution_clock::now();
std::cout << "Runtime: " << std::chrono::duration<double>(end - start).count() << "s\n";
// calculate error
double maxError = 0.0;
for (uint32_t i = 0; i < extent; i++)
{
const auto error = std::abs(uNext[i] - exactSolution(i * dx, tMax));
maxError = std::max(maxError, error);
}

// Calculate error
double maxError = 0.0;
for (uint32_t i = 0; i < numNodesX; i++)
{
const auto error = std::abs(uNext[i] - exactSolution(i * dx, tMax));
maxError = std::max(maxError, error);
}
const auto errorThreshold = 1e-5;
const auto resultCorrect = (maxError < errorThreshold);
if (resultCorrect)
std::cout << "Correct!\n";
else
std::cout << "Incorrect! error = " << maxError << " (the grid resolution may be too low)\n";
};

const auto errorThreshold = 1e-5;
const auto resultCorrect = (maxError < errorThreshold);
if (resultCorrect)
{
std::cout << "Execution results correct!\n";
return 0;
}
else
{
std::cout << "Execution results incorrect: error = " << maxError << " (the grid resolution may be too low)\n";
return 2;
}
run("update_scalar ", [](auto&... args) { update_scalar(args...); });
#if __has_include(<Vc/Vc>)
run("update_Vc ", [](auto&... args) { update_Vc(args...); });
run("update_Vc_peel", [](auto&... args) { update_Vc_peel(args...); });
#endif

return 0;
}

0 comments on commit 424df94

Please sign in to comment.