Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional REDUCE macros #297

Merged
merged 5 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/care/DefaultMacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _
// Terminator for a checked parallel loop: forwards CHECK to CARE_CHECKED_POLICY_LOOP_END.
#define CARE_CHECKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK)

// Opens a checked reduce loop by forwarding INDEX, START_INDEX, END_INDEX and CHECK to
// CARE_CHECKED_POLICY_LOOP_START, with a care reduce policy tag as the first argument.
// NOTE(review): two candidate bodies follow (care::parallel_reduce and care::gpu_reduce);
// only the line directly after the backslash belongs to the macro — confirm which is intended.
#define CARE_CHECKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHECK) \
CARE_CHECKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHECK)
CARE_CHECKED_POLICY_LOOP_START(care::gpu_reduce,INDEX, START_INDEX, END_INDEX, CHECK)

// Terminator for a checked reduce loop: forwards CHECK to CARE_CHECKED_POLICY_LOOP_END,
// i.e. the same expansion as CARE_CHECKED_PARALLEL_LOOP_END.
#define CARE_CHECKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_POLICY_LOOP_END(CHECK)

Expand Down Expand Up @@ -771,7 +771,7 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _
// Terminator for a checked, chunked parallel loop: forwards CHECK to
// CARE_CHECKED_CHUNKED_POLICY_LOOP_END.
#define CARE_CHECKED_CHUNKED_PARALLEL_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK)

// Opens a checked, chunked reduce loop by forwarding INDEX, START_INDEX, END_INDEX,
// CHUNK_SIZE and CHECK to CARE_CHECKED_CHUNKED_POLICY_LOOP_START, with a care reduce
// policy tag as the first argument.
// NOTE(review): two candidate bodies follow (care::parallel_reduce and care::gpu_reduce);
// only the line directly after the backslash belongs to the macro — confirm which is intended.
#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_START(INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK) \
CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::parallel_reduce,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK)
CARE_CHECKED_CHUNKED_POLICY_LOOP_START(care::gpu_reduce,INDEX, START_INDEX, END_INDEX, CHUNK_SIZE, CHECK)

// Terminator for a checked, chunked reduce loop: forwards CHECK to
// CARE_CHECKED_CHUNKED_POLICY_LOOP_END.
#define CARE_CHECKED_CHUNKED_REDUCE_LOOP_END(CHECK) CARE_CHECKED_CHUNKED_POLICY_LOOP_END(CHECK)

Expand Down Expand Up @@ -1278,6 +1278,9 @@ OMP_FOR_BEGIN for (auto INDEX = _care_openmp_for_loop_chunk_begin_ndx; INDEX < _
launch_2D_jagged(care::gpu{}, XSTART, XEND, XLENGTHS.data(chai::DEFAULT, true), YSTART, YLENGTH, __FILE__, __LINE__, [=] CARE_DEVICE (int XINDEX, int YINDEX)->void {
#define CARE_LOOP_2D_STREAM_JAGGED_END });

// Opens a 2D jagged loop dispatched with the care::gpu_reduce policy tag.
// Expands to a launch_2D_jagged call that receives XSTART, XEND, the pointer returned by
// XLENGTHS.data(chai::DEFAULT, true), YSTART, YLENGTH, __FILE__ and __LINE__, followed by
// the opening of a by-copy CARE_DEVICE lambda taking (int XINDEX, int YINDEX). The code
// written after this macro becomes the lambda body and must be closed with
// CARE_LOOP_2D_REDUCE_JAGGED_END.
// FLAT_INDEX is accepted but is not referenced anywhere in this expansion.
#define CARE_LOOP_2D_REDUCE_JAGGED(XINDEX, XSTART, XEND, XLENGTHS, YINDEX, YSTART, YLENGTH, FLAT_INDEX) \
launch_2D_jagged(care::gpu_reduce{}, XSTART, XEND, XLENGTHS.data(chai::DEFAULT, true), YSTART, YLENGTH, __FILE__, __LINE__, [=] CARE_DEVICE (int XINDEX, int YINDEX)->void {
// Closes the lambda body and the launch_2D_jagged call opened by CARE_LOOP_2D_REDUCE_JAGGED.
#define CARE_LOOP_2D_REDUCE_JAGGED_END });

#endif // !defined(_CARE_DEFAULT_MACROS_H_)

45 changes: 45 additions & 0 deletions src/care/LoopFuser.h
Original file line number Diff line number Diff line change
Expand Up @@ -1273,9 +1273,14 @@ void LoopFuser<REGISTER_COUNT, XARGS...>::registerAction(const char * fileName,
// Closes the scopes opened by FUSIBLE_LOOP_STREAM_R (its definition is not visible here)
// and then expands FUSIBLE_FLUSH_IF_NEEDED.
#define FUSIBLE_LOOP_STREAM_R_END \
} }); } FUSIBLE_FLUSH_IF_NEEDED

// Object-like aliases: the register-count reduce loop expands to the register-count stream
// loop, so the argument list written after FUSIBLE_REDUCE_LOOP_R is consumed by
// FUSIBLE_LOOP_STREAM_R.
#define FUSIBLE_REDUCE_LOOP_R FUSIBLE_LOOP_STREAM_R
#define FUSIBLE_REDUCE_LOOP_R_END FUSIBLE_LOOP_STREAM_R_END

// Convenience form of FUSIBLE_LOOP_STREAM_R that supplies
// CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT as the register count.
#define FUSIBLE_LOOP_STREAM(INDEX, START, END) FUSIBLE_LOOP_STREAM_R(INDEX, START, END, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT)
#define FUSIBLE_LOOP_STREAM_END FUSIBLE_LOOP_STREAM_R_END

// Object-like aliases: a fusible reduce loop expands to the fusible stream loop.
#define FUSIBLE_REDUCE_LOOP FUSIBLE_LOOP_STREAM
#define FUSIBLE_REDUCE_LOOP_END FUSIBLE_LOOP_STREAM_END

#define FUSIBLE_KERNEL_R(REGISTER_COUNT) { \
auto __fuser__ = LOOPFUSER(REGISTER_COUNT)::getInstance(); \
Expand Down Expand Up @@ -1304,9 +1309,14 @@ void LoopFuser<REGISTER_COUNT, XARGS...>::registerAction(const char * fileName,
} \
}); }}

// Object-like aliases: the register-count reduce phase loop expands to
// FUSIBLE_LOOP_PHASE_R / FUSIBLE_LOOP_PHASE_R_END (defined outside the visible lines).
#define FUSIBLE_REDUCE_LOOP_PHASE_R FUSIBLE_LOOP_PHASE_R
#define FUSIBLE_REDUCE_LOOP_PHASE_R_END FUSIBLE_LOOP_PHASE_R_END

// Convenience form of FUSIBLE_LOOP_PHASE_R that supplies
// CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT as the register count.
#define FUSIBLE_LOOP_PHASE(INDEX, START, END, PRIORITY) FUSIBLE_LOOP_PHASE_R(INDEX, START, END, PRIORITY, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT)
#define FUSIBLE_LOOP_PHASE_END FUSIBLE_LOOP_PHASE_R_END

// Object-like aliases: a reduce phase loop expands to the regular phase loop.
#define FUSIBLE_REDUCE_LOOP_PHASE FUSIBLE_LOOP_PHASE
#define FUSIBLE_REDUCE_LOOP_PHASE_END FUSIBLE_LOOP_PHASE_END

#define FUSIBLE_KERNEL_PHASE_R(PRIORITY, REGISTER_COUNT) { \
LOOPFUSER(REGISTER_COUNT) * __fuser__ = FusedActionsObserver::getActiveObserver()->getFusedActions<LOOPFUSER(REGISTER_COUNT)>(PRIORITY); \
Expand Down Expand Up @@ -1382,21 +1392,32 @@ void LoopFuser<REGISTER_COUNT, XARGS...>::registerAction(const char * fileName,
// Convenience form of FUSIBLE_LOOP_SCAN_R that supplies
// CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT as the register count.
#define FUSIBLE_LOOP_SCAN(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) \
FUSIBLE_LOOP_SCAN_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT)

// Object-like aliases: the reduce scan openers expand to the regular scan openers.
#define FUSIBLE_REDUCE_LOOP_SCAN_R FUSIBLE_LOOP_SCAN_R
#define FUSIBLE_REDUCE_LOOP_SCAN FUSIBLE_LOOP_SCAN

// Closes the scopes and the call opened by the scan start macro (not visible here), passing
// 1 and POS_STORE_DESTINATION as the trailing call arguments. LENGTH and POS are accepted
// but are not referenced in this expansion.
#define _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) } return 0; }, 1, POS_STORE_DESTINATION); }
// Same terminator, followed by FUSIBLE_FLUSH_IF_NEEDED.
#define FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) FUSIBLE_FLUSH_IF_NEEDED

// Terminator for FUSIBLE_LOOP_SCAN; identical to FUSIBLE_LOOP_SCAN_R_END.
#define FUSIBLE_LOOP_SCAN_END(LENGTH, POS, POS_STORE_DESTINATION) FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION)

// Object-like aliases: the reduce scan terminators expand to the regular scan terminators.
#define FUSIBLE_REDUCE_LOOP_SCAN_R_END FUSIBLE_LOOP_SCAN_R_END
#define FUSIBLE_REDUCE_LOOP_SCAN_END FUSIBLE_LOOP_SCAN_END

// Opens a scan loop registered with the fuser obtained from the active
// FusedActionsObserver for the given PRIORITY; the remaining arguments are forwarded to
// _FUSIBLE_LOOP_SCAN_R (defined outside the visible lines).
#define FUSIBLE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, REGISTER_COUNT) \
_FUSIBLE_LOOP_SCAN_R(FusedActionsObserver::getActiveObserver()->getFusedActions<LOOPFUSER(REGISTER_COUNT)>(PRIORITY), \
INDEX, START, END, POS, INIT_POS, BOOL_EXPR, REGISTER_COUNT)

// Convenience form of FUSIBLE_LOOP_SCAN_PHASE_R that supplies
// CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT as the register count.
#define FUSIBLE_LOOP_SCAN_PHASE(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY) \
FUSIBLE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, CARE_DEFAULT_LOOP_FUSER_REGISTER_COUNT)

// Object-like aliases: the reduce scan-phase openers expand to the regular scan-phase openers.
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R FUSIBLE_LOOP_SCAN_PHASE_R
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE FUSIBLE_LOOP_SCAN_PHASE

// Scan-phase terminators. Unlike FUSIBLE_LOOP_SCAN_R_END, these expand to
// _FUSIBLE_LOOP_SCAN_R_END only, without a trailing FUSIBLE_FLUSH_IF_NEEDED.
#define FUSIBLE_LOOP_SCAN_PHASE_END(LENGTH, POS, POS_STORE_DESTINATION) _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION)
#define FUSIBLE_LOOP_SCAN_PHASE_R_END(LENGTH, POS, POS_STORE_DESTINATION) _FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION)

// Object-like aliases: the reduce scan-phase terminators expand to the regular ones.
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R_END FUSIBLE_LOOP_SCAN_PHASE_R_END
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_END FUSIBLE_LOOP_SCAN_PHASE_END

// note - FUSED_SCANVAR will be nullptr if we are not recording, as there will be no need for an intermediate
// FUSED_SCANVAR, so we won't need to write to it in the action or store into it in the conditional
Expand Down Expand Up @@ -1466,13 +1487,21 @@ void LoopFuser<REGISTER_COUNT, XARGS...>::registerAction(const char * fileName,

// Direct-CARE definitions of the fusible loop openers: each expands straight to a CARE loop
// macro and ignores REGISTER_COUNT.
// NOTE(review): presumably the branch used when loop fusion is disabled; the guarding
// preprocessor conditional is not visible here — confirm.
#define FUSIBLE_LOOP_STREAM_R(INDEX, START, END, REGISTER_COUNT) CARE_STREAM_LOOP(INDEX, START, END)
#define FUSIBLE_LOOP_STREAM(INDEX, START, END) CARE_STREAM_LOOP(INDEX, START, END)
// Reduce variants expand to CARE_REDUCE_LOOP instead of CARE_STREAM_LOOP.
#define FUSIBLE_REDUCE_LOOP_R(INDEX, START, END, REGISTER_COUNT) CARE_REDUCE_LOOP(INDEX, START, END)
#define FUSIBLE_REDUCE_LOOP(INDEX, START, END) CARE_REDUCE_LOOP(INDEX, START, END)

// Phase variants: PRIORITY and REGISTER_COUNT are accepted but not used in the expansion.
#define FUSIBLE_LOOP_PHASE_R(INDEX, START, END, PRIORITY, REGISTER_COUNT) CARE_STREAM_LOOP(INDEX, START, END)
#define FUSIBLE_LOOP_PHASE(INDEX, START, END, PRIORITY) CARE_STREAM_LOOP(INDEX, START, END)

// Terminators matching the CARE_STREAM_LOOP openers above.
#define FUSIBLE_LOOP_PHASE_END CARE_STREAM_LOOP_END
#define FUSIBLE_LOOP_PHASE_R_END CARE_STREAM_LOOP_END

// Reduce flavours of the phase loops for the direct-CARE set of definitions.
// PRIORITY and REGISTER_COUNT are accepted for signature compatibility but are not used in
// the expansion. The openers expand to CARE_REDUCE_LOOP (not CARE_STREAM_LOOP) so that they
// pair with the CARE_REDUCE_LOOP_END terminators below and behave like FUSIBLE_REDUCE_LOOP
// and FUSIBLE_REDUCE_LOOP_R.
#define FUSIBLE_REDUCE_LOOP_PHASE_R(INDEX, START, END, PRIORITY, REGISTER_COUNT) CARE_REDUCE_LOOP(INDEX, START, END)
#define FUSIBLE_REDUCE_LOOP_PHASE(INDEX, START, END, PRIORITY) CARE_REDUCE_LOOP(INDEX, START, END)

// Terminators for the reduce phase loops opened above.
#define FUSIBLE_REDUCE_LOOP_PHASE_END CARE_REDUCE_LOOP_END
#define FUSIBLE_REDUCE_LOOP_PHASE_R_END CARE_REDUCE_LOOP_END

// In this set of definitions both macros expand to nothing.
#define FUSIBLE_FLUSH_IF_NEEDED
#define FUSIBLE_PHASE_RESET

Expand All @@ -1485,6 +1514,9 @@ void LoopFuser<REGISTER_COUNT, XARGS...>::registerAction(const char * fileName,
// Terminators for the stream loop openers: both expand to CARE_STREAM_LOOP_END.
#define FUSIBLE_LOOP_STREAM_R_END CARE_STREAM_LOOP_END
#define FUSIBLE_LOOP_STREAM_END CARE_STREAM_LOOP_END

// Terminators for the reduce loop openers: both expand to CARE_REDUCE_LOOP_END.
#define FUSIBLE_REDUCE_LOOP_R_END CARE_REDUCE_LOOP_END
#define FUSIBLE_REDUCE_LOOP_END CARE_REDUCE_LOOP_END

// Terminators for the fusible kernel openers: all expand to CARE_PARALLEL_KERNEL_END.
#define FUSIBLE_KERNEL_PHASE_R_END CARE_PARALLEL_KERNEL_END
#define FUSIBLE_KERNEL_R_END CARE_PARALLEL_KERNEL_END
#define FUSIBLE_KERNEL_PHASE_END CARE_PARALLEL_KERNEL_END
Expand All @@ -1501,16 +1533,29 @@ void LoopFuser<REGISTER_COUNT, XARGS...>::registerAction(const char * fileName,
// Direct-CARE scan openers: expand to SCAN_LOOP; REGISTER_COUNT is ignored.
#define FUSIBLE_LOOP_SCAN_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, REGISTER_COUNT) SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)
#define FUSIBLE_LOOP_SCAN(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)

// Reduce scan openers: expand to SCAN_REDUCE_LOOP; REGISTER_COUNT is ignored.
#define FUSIBLE_REDUCE_LOOP_SCAN_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, REGISTER_COUNT) SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)
#define FUSIBLE_REDUCE_LOOP_SCAN(INDEX, START, END, POS, INIT_POS, BOOL_EXPR) SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)

// Terminators for the SCAN_LOOP openers: forward all three arguments to SCAN_LOOP_END.
#define FUSIBLE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)
#define FUSIBLE_LOOP_SCAN_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)

// Terminators for the SCAN_REDUCE_LOOP openers: forward all three arguments to
// SCAN_REDUCE_LOOP_END.
#define FUSIBLE_REDUCE_LOOP_SCAN_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)
#define FUSIBLE_REDUCE_LOOP_SCAN_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)

// Direct-CARE scan-phase openers: expand to SCAN_LOOP; PRIORITY and REGISTER_COUNT are
// accepted but not used in the expansion.
#define FUSIBLE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, REGISTER_COUNT) \
SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)
#define FUSIBLE_LOOP_SCAN_PHASE(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY) SCAN_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)

// Reduce scan-phase openers: expand to SCAN_REDUCE_LOOP; PRIORITY and REGISTER_COUNT are
// accepted but not used in the expansion.
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY, REGISTER_COUNT) \
SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE(INDEX, START, END, POS, INIT_POS, BOOL_EXPR, PRIORITY) SCAN_REDUCE_LOOP(INDEX, START, END, POS, INIT_POS, BOOL_EXPR)

// Terminators for the SCAN_LOOP scan-phase openers.
#define FUSIBLE_LOOP_SCAN_PHASE_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)
#define FUSIBLE_LOOP_SCAN_PHASE_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)

// Terminators for the SCAN_REDUCE_LOOP scan-phase openers.
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_R_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)
#define FUSIBLE_REDUCE_LOOP_SCAN_PHASE_END(LENGTH, POS, POS_STORE_DESTINATION) SCAN_REDUCE_LOOP_END(LENGTH, POS, POS_STORE_DESTINATION)

// Calls A.free() directly. The expansion has no trailing semicolon, so the call site
// supplies it.
#define FUSIBLE_FREE(A) A.free()
// Forwards A, DEST and ELEM to care::wrappedFreeDeviceMemory. Unlike FUSIBLE_FREE, this
// expansion already ends with a semicolon.
#define FUSIBLE_FREE_DEVICE(A,DEST,ELEM) care::wrappedFreeDeviceMemory(A, DEST, ELEM);

Expand Down
9 changes: 7 additions & 2 deletions src/care/forall.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ namespace care {
///
/// @brief Execute using the care::RAJAReductionExec policy
///
/// @arg[in] parallel_reduce Used to choose this overload of forall
/// @arg[in] gpu_reduce Used to choose this overload of forall
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] start The starting index (inclusive)
Expand All @@ -275,7 +275,7 @@ namespace care {
///
////////////////////////////////////////////////////////////////////////////////
template <typename LB>
void forall(parallel_reduce, const char * fileName, const int lineNumber,
void forall(gpu_reduce, const char * fileName, const int lineNumber,
const int start, const int end, const int batch_size, LB&& body) {
#if CARE_ENABLE_PARALLEL_LOOP_BACKWARDS
s_reverseLoopOrder = true;
Expand Down Expand Up @@ -627,6 +627,11 @@ namespace care {
arrayManager->setExecutionSpace(chai::ExecutionSpace::NONE);
}
}

////////////////////////////////////////////////////////////////////////////////
///
/// @brief Overload of launch_2D_jagged selected by the care::gpu_reduce policy tag.
///        It delegates to the care::gpu overload with the same arguments, so in
///        this implementation the two policies launch identically.
///
/// @arg[in] gpu_reduce Used to choose this overload of launch_2D_jagged
/// @arg[in] xstart Forwarded unchanged to the care::gpu overload
/// @arg[in] xend Forwarded unchanged to the care::gpu overload
/// @arg[in] gpu_lengths Forwarded unchanged to the care::gpu overload
/// @arg[in] ystart Forwarded unchanged to the care::gpu overload
/// @arg[in] ylength Forwarded unchanged to the care::gpu overload
/// @arg[in] fileName The name of the file where this function is called
/// @arg[in] lineNumber The line number in the file where this function is called
/// @arg[in] body The loop body; passed on as an lvalue (it is not std::forward-ed)
///
////////////////////////////////////////////////////////////////////////////////
template <typename LB>
void launch_2D_jagged(care::gpu_reduce, int xstart, int xend, int const * gpu_lengths, int ystart, int ylength, const char * fileName, int lineNumber , LB && body) {
launch_2D_jagged(care::gpu{}, xstart, xend, gpu_lengths, ystart, ylength, fileName, lineNumber, body) ;
}
#endif
} // namespace care

Expand Down
4 changes: 2 additions & 2 deletions src/care/policies.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ namespace care {
// Empty tag types. The ones used in the visible code (gpu, gpu_reduce, parallel_reduce)
// are passed as the first argument of forall / launch_2D_jagged to select an overload.
struct sequential {};
struct openmp {};
struct gpu {};
// Tag selecting the reduce-specific forall and launch_2D_jagged overloads.
struct gpu_reduce {};
struct parallel {};
// NOTE(review): the visible forall signature appears with both parallel_reduce and
// gpu_reduce; confirm whether this tag is still needed alongside gpu_reduce.
struct parallel_reduce {};
struct raja_fusible {};
struct raja_fusible_seq {};
struct managed_ptr_read {};
Expand All @@ -27,8 +27,8 @@ namespace care {
sequential,
openmp,
gpu,
gpu_reduce,
parallel,
parallel_reduce,
managed_ptr_read,
managed_ptr_write
};
Expand Down
Loading
Loading