Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Experimental perfectly nested loop interface #479

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 56 additions & 46 deletions src/apps/MASS3DEA-Hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ __global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) {
}
}
}

}

template < size_t block_size >
Expand All @@ -65,6 +65,17 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) {

MASS3DEA_DATA_SETUP;

//Extra kernel launch ...
{
dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D);
constexpr size_t shmem = 0;

RPlaunchHipKernel( (Mass3DEA<block_size>),
NE, nthreads_per_block,
shmem, res.get_stream(),
B, D, M );
}

switch (vid) {

case Base_HIP: {
Expand All @@ -89,7 +100,7 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) {

constexpr bool async = true;

using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<async, MEA_D1D*MEA_D1D*MEA_D1D>>;
using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<async, block_size>>;

using outer_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct>;

Expand All @@ -99,72 +110,71 @@ void MASS3DEA::runHipVariantImpl(VariantID vid) {

using inner_z = RAJA::LoopPolicy<RAJA::hip_thread_size_z_loop<MEA_D1D>>;

using inner_zyx = RAJA::LoopPolicy<RAJA::hip_thread_zyx_loop>;

using inner_yxz = RAJA::LoopPolicy<RAJA::hip_thread_yxz_loop>;

startTimer();
for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

#if 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@artv3 do you plan to keep both options in the code? If so, I recommend using a descriptive macro definition name here instead of just '1' or '0' like we do here: https://github.com/LLNL/RAJAPerf/blob/develop/src/basic/NESTED_INIT-OMP.cpp#L43

RAJA::launch<launch_policy>( res,
RAJA::LaunchParams(RAJA::Teams(NE),
RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D)),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE),
[&](int e) {
[&](int e) {

MASS3DEA_0

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1),
[&](int ) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D),
[&](int d) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
[&](int q) {
MASS3DEA_1
}
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_z>

RAJA::expt::loop<inner_yxz>
(ctx, RAJA::RangeSegment(0, MEA_Q1D), RAJA::RangeSegment(0, MEA_D1D), RAJA::RangeSegment(0, 1),
[&](int q, int d, int ) {
MASS3DEA_1
}
); // RAJA::loop<inner_yxy>

MASS3DEA_2

RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
[&](int k1) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
[&](int k2) {
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
[&](int k3) {
MASS3DEA_3
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_z>
RAJA::expt::loop<inner_zyx>
(ctx, RAJA::RangeSegment(0, MEA_Q1D), RAJA::RangeSegment(0, MEA_Q1D), RAJA::RangeSegment(0, MEA_Q1D),
[&](int k3, int k2, int k1) {
MASS3DEA_3
}
); // RAJA::loop<inner_zyx>


ctx.teamSync();

RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D),
[&](int i1) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_D1D),
[&](int i2) {
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_D1D),
[&](int i3) {
MASS3DEA_4
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_z>

} // lambda (e)
); // RAJA::loop<outer_x>
RAJA::expt::loop<inner_zyx>
(ctx, RAJA::RangeSegment(0, MEA_D1D), RAJA::RangeSegment(0, MEA_D1D), RAJA::RangeSegment(0, MEA_D1D),
[&](int i3, int i2, int i1) {
MASS3DEA_4
}
); // RAJA::loop<inner_zyx>

} // lambda (e)
); // RAJA::loop<outer_x>

} // outer lambda (ctx)
); // RAJA::launch

#else

dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D);
constexpr size_t shmem = 0;

RPlaunchHipKernel( (Mass3DEA<block_size>),
NE, nthreads_per_block,
shmem, res.get_stream(),
B, D, M );



#endif


} // loop over kernel reps
stopTimer();

Expand Down
2 changes: 1 addition & 1 deletion tpl/RAJA
Submodule RAJA updated 45 files
+25 −0 .github/ISSUE_TEMPLATE/bug_report.md
+22 −0 .github/ISSUE_TEMPLATE/feature_request.md
+74 −0 .github/workflows/build.yml
+2 −1 CMakeLists.txt
+118 −74 Dockerfile
+4 −1 README.md
+14 −26 azure-pipelines.yml
+4 −0 benchmark/CMakeLists.txt
+346 −0 benchmark/benchmark-atomic.cpp
+2 −1 docs/conf.py
+16 −22 docs/sphinx/dev_guide/build_configurations.rst
+84 −37 docs/sphinx/dev_guide/ci.rst
+136 −88 docs/sphinx/dev_guide/ci_tasks.rst
+4 −4 docs/sphinx/dev_guide/contributing.rst
+1 −1 docs/sphinx/user_guide/feature/policies.rst
+9 −1 docs/sphinx/user_guide/getting_started.rst
+8 −0 docs/sphinx/user_guide/tutorial.rst
+0 −5 exercises/dot-product.cpp
+1 −6 exercises/dot-product_solution.cpp
+0 −5 exercises/vector-addition.cpp
+1 −6 exercises/vector-addition_solution.cpp
+14 −0 host-configs/lc-builds/toss4/clang_X_asan.cmake
+14 −0 host-configs/lc-builds/toss4/clang_X_ubsan.cmake
+19 −0 include/RAJA/pattern/launch/launch_core.hpp
+3 −2 include/RAJA/policy/cuda/launch.hpp
+3 −0 include/RAJA/policy/openmp/launch.hpp
+6 −10 include/RAJA/policy/sycl/MemUtils_SYCL.hpp
+6 −13 include/RAJA/policy/sycl/reduce.hpp
+1 −1 scripts/lc-builds/blueos_clang.sh
+1 −1 scripts/lc-builds/blueos_clang_omptarget.sh
+1 −1 scripts/lc-builds/blueos_clangcuda.sh
+1 −1 scripts/lc-builds/blueos_gcc.sh
+1 −1 scripts/lc-builds/blueos_nvcc_clang.sh
+1 −1 scripts/lc-builds/blueos_nvcc_gcc.sh
+1 −1 scripts/lc-builds/blueos_nvcc_xl.sh
+1 −1 scripts/lc-builds/blueos_pgi.sh
+1 −1 scripts/lc-builds/blueos_xl.sh
+1 −1 scripts/lc-builds/blueos_xl_omptarget.sh
+1 −1 scripts/lc-builds/corona_sycl.sh
+1 −1 scripts/lc-builds/toss4_amdclang.sh
+1 −1 scripts/lc-builds/toss4_amdclang_asan.sh
+57 −0 scripts/lc-builds/toss4_clang_san.sh
+1 −1 scripts/lc-builds/toss4_icpc-classic.sh
+2 −2 scripts/lc-builds/toss4_icpx.sh
+0 −14 src/MemUtils_SYCL.cpp
Loading