Merge pull request #118 from bernhardmgruber/heat

Improve heatequation demo and add Vc version
alpaka-group · Nov 5, 2020 · 424df94 · 424df94
2 parents c01befc + 0c8b0cf
commit 424df94
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 48 deletions.
diff --git a/examples/heatequation/CMakeLists.txt b/examples/heatequation/CMakeLists.txt
@@ -4,8 +4,25 @@ project(llama-heatequation)
 set (CMAKE_CXX_STANDARD 17)
 set (CMAKE_CXX_STANDARD_REQUIRED on)
 
+find_package(Vc QUIET)
 if (NOT TARGET llama::llama)
 	find_package(llama REQUIRED)
 endif()
+
 add_executable(${PROJECT_NAME} heatequation.cpp)
 target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
+if (Vc_FOUND)
+	target_link_libraries(${PROJECT_NAME} PRIVATE Vc::Vc)
+endif()
+
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+	target_compile_options(${PROJECT_NAME} PRIVATE
+		-march=native
+		-ffast-math
+	)
+elseif(MSVC)
+	target_compile_options(${PROJECT_NAME} PRIVATE
+		/arch:AVX2
+		/fp:fast
+	)
+endif()
diff --git a/examples/heatequation/heatequation.cpp b/examples/heatequation/heatequation.cpp
@@ -21,18 +21,65 @@
 #include <llama/llama.hpp>
 #include <utility>
 
-using DatumDomain = double;
+template <typename View>
+inline void kernel(uint32_t idx, const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+{
+    const auto r = dt / (dx * dx);
+    if (idx > 0 && idx < extent - 1u)
+        uNext[idx] = uCurr[idx] * (1.0 - 2.0 * r) + uCurr[idx - 1] * r + uCurr[idx + 1] * r;
+}
+
+template <typename View>
+void update_scalar(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+{
+    for (auto i = 0; i < extent; i++)
+        kernel(i, uCurr, uNext, extent, dx, dt);
+}
 
-struct HeatEquationKernel
+#if __has_include(<Vc/Vc>)
+#    include <Vc/Vc>
+
+template <typename View>
+inline void kernel_vec(uint32_t blockIdx, const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
 {
-    template <typename View>
-    void operator()(uint32_t idx, const View& uCurrBuf, View& uNextBuf, uint32_t extent, double dx, double dt) const
+    const auto r = dt / (dx * dx);
+
+    const auto baseIdx = static_cast<uint32_t>(blockIdx * Vc::double_v::size());
+    if (baseIdx > 0 && baseIdx + Vc::double_v::size() < extent)
+    {
+        const auto next = Vc::double_v{&uCurr[baseIdx]} * (1.0 - 2.0 * r) + Vc::double_v{&uCurr[baseIdx - 1]} * r
+            + Vc::double_v{&uCurr[baseIdx + 1]} * r;
+        next.store(&uNext[baseIdx]);
+    }
+    else
     {
-        const auto r = dt / (dx * dx);
-        if (idx > 0 && idx < extent - 1u)
-            uNextBuf[idx] = uCurrBuf[idx] * (1.0 - 2.0 * r) + uCurrBuf[idx - 1] * r + uCurrBuf[idx + 1] * r;
+        for (auto idx = baseIdx; idx <= baseIdx + Vc::double_v::size(); idx++)
+            if (idx > 0 && idx < extent - 1u)
+                uNext[idx] = uCurr[idx] * (1.0 - 2.0 * r) + uCurr[idx - 1] * r + uCurr[idx + 1] * r;
     }
-};
+}
+
+template <typename View>
+void update_Vc(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+{
+    constexpr auto L = Vc::double_v::size();
+    const auto blocks = (extent + L - 1) / L;
+    for (auto i = 0; i < blocks; i++)
+        kernel_vec(i, uCurr, uNext, extent, dx, dt);
+}
+
+template <typename View>
+void update_Vc_peel(const View& uCurr, View& uNext, uint32_t extent, double dx, double dt)
+{
+    constexpr auto L = Vc::double_v::size();
+    const auto blocks = (extent + L - 1) / L;
+    kernel_vec(0, uCurr, uNext, extent, dx, dt);
+    for (auto i = 1; i < blocks - 1; i++)
+        kernel_vec(i, uCurr, uNext, extent, dx, dt);
+    kernel_vec(blocks - 1, uCurr, uNext, extent, dx, dt);
+}
+
+#endif
 
 // Exact solution to the test problem
 // u_t(x, t) = u_xx(x, t), x in [0, 1], t in [0, T]
@@ -46,13 +93,13 @@ double exactSolution(double const x, double const t)
 
 auto main() -> int
 {
-    // Parameters (a user is supposed to change numNodesX, numTimeSteps)
-    const auto numNodesX = 10000;
-    const auto numTimeSteps = 200000;
+    // Parameters (a user is supposed to change extent, timeSteps)
+    const auto extent = 10000;
+    const auto timeSteps = 200000;
     const auto tMax = 0.001;
     // x in [0, 1], t in [0, tMax]
-    const auto dx = 1.0 / static_cast<double>(numNodesX - 1);
-    const auto dt = tMax / static_cast<double>(numTimeSteps - 1);
+    const auto dx = 1.0 / static_cast<double>(extent - 1);
+    const auto dt = tMax / static_cast<double>(timeSteps - 1);
 
     const auto r = dt / (dx * dx);
     if (r > 0.5)
@@ -61,46 +108,46 @@ auto main() -> int
         return 1;
     }
 
-    const auto mapping = llama::mapping::AoS{llama::ArrayDomain{numNodesX}, DatumDomain{}};
+    const auto mapping = llama::mapping::SoA{llama::ArrayDomain{extent}, double{}};
     auto uNext = llama::allocView(mapping);
     auto uCurr = llama::allocView(mapping);
 
-    // Apply initial conditions for the test problem
-    for (uint32_t i = 0; i < numNodesX; i++)
-        uCurr[i] = exactSolution(i * dx, 0.0);
+    auto run = [&](std::string_view updateName, auto update) {
+        // init
+        for (uint32_t i = 0; i < extent; i++)
+            uCurr[i] = exactSolution(i * dx, 0.0);
 
-    const auto start = std::chrono::high_resolution_clock::now();
-    HeatEquationKernel kernel;
-    for (int step = 0; step < numTimeSteps; step++)
-    {
-        for (auto i = 0; i < numNodesX; i++)
-            kernel(i, uCurr, uNext, numNodesX, dx, dt);
+        // run simulation
+        const auto start = std::chrono::high_resolution_clock::now();
+        for (int step = 0; step < timeSteps; step++)
+        {
+            update(uCurr, uNext, extent, dx, dt);
+            std::swap(uNext, uCurr);
+        }
+        const auto stop = std::chrono::high_resolution_clock::now();
+        std::cout << updateName << " took " << std::chrono::duration<double>(stop - start).count() << "s\t";
 
-        // We assume the boundary conditions are constant and so these values
-        // do not need to be updated.
-        std::swap(uNext, uCurr);
-    }
-    const auto end = std::chrono::high_resolution_clock::now();
-    std::cout << "Runtime: " << std::chrono::duration<double>(end - start).count() << "s\n";
+        // calculate error
+        double maxError = 0.0;
+        for (uint32_t i = 0; i < extent; i++)
+        {
+            const auto error = std::abs(uNext[i] - exactSolution(i * dx, tMax));
+            maxError = std::max(maxError, error);
+        }
 
-    // Calculate error
-    double maxError = 0.0;
-    for (uint32_t i = 0; i < numNodesX; i++)
-    {
-        const auto error = std::abs(uNext[i] - exactSolution(i * dx, tMax));
-        maxError = std::max(maxError, error);
-    }
+        const auto errorThreshold = 1e-5;
+        const auto resultCorrect = (maxError < errorThreshold);
+        if (resultCorrect)
+            std::cout << "Correct!\n";
+        else
+            std::cout << "Incorrect! error = " << maxError << " (the grid resolution may be too low)\n";
+    };
 
-    const auto errorThreshold = 1e-5;
-    const auto resultCorrect = (maxError < errorThreshold);
-    if (resultCorrect)
-    {
-        std::cout << "Execution results correct!\n";
-        return 0;
-    }
-    else
-    {
-        std::cout << "Execution results incorrect: error = " << maxError << " (the grid resolution may be too low)\n";
-        return 2;
-    }
+    run("update_scalar ", [](auto&... args) { update_scalar(args...); });
+#if __has_include(<Vc/Vc>)
+    run("update_Vc     ", [](auto&... args) { update_Vc(args...); });
+    run("update_Vc_peel", [](auto&... args) { update_Vc_peel(args...); });
+#endif
+
+    return 0;
 }