Skip to content

Commit

Permalink
Merge back 6.2 hotfixes (#607)
Browse files Browse the repository at this point in the history
* Update dependency names for static builds (#557)

This also removes the line setting `BUILD_SHARED_LIBS` to `ON`, which was previously required to get correctly named packages when not specifically compiling for a static build. Updates to ROCmCMakeBuildTools (rocm-cmake) should mean this is no longer necessary.

* Fix BUILD_SHARED_LIBS for packaging (#558)

* Fix the dependencies of the static packages (#563)

* cmake: don't set CMAKE_C_COMPILER, as rocPRIM is a CXX project (#567)

* Add developer guidelines (#555) (#574)

Co-authored-by: Nol Moonen <nol@streamhpc.com>

* Update Read the Docs config to Python 3.10 and latest rocm-docs-core (#564) (#579)

* Cherry-pick: Optimize block_reduce_warp_reduce when block size is the same as warp size (#599)

* Optimize block_reduce_warp_reduce when block size == warp size

* Make conditional constexpr

* Fix conflict in concepts.rst

---------

Co-authored-by: Lauren Wrubleski <Lauren.Wrubleski@amd.com>
Co-authored-by: Steve Leung <Steve.Leung@amd.com>
Co-authored-by: randyh62 <42045079+randyh62@users.noreply.github.com>
Co-authored-by: Nol Moonen <nol@streamhpc.com>
Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com>
  • Loading branch information
6 people authored Sep 17, 2024
1 parent 45b1942 commit 4dca5b7
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 37 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@ Documentation for rocPRIM is available at

* `rocprim::thread_load` and `rocprim::thread_store`, use dereference instead. Not all of those functions are available on every device architecture, and their usage can hurt performance, because inline assembly inhibits optimizations.

## Unreleased rocPRIM-3.2.0 for ROCm 6.2.0
## rocPRIM-3.2.1 for ROCm 6.2.1

### Optimizations
* Improved performance of `block_reduce_warp_reduce` when warp size == block size.

## rocPRIM-3.2.0 for ROCm 6.2.0

### Additions

Expand Down
15 changes: 13 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

if(DEFINED BUILD_SHARED_LIBS)
set(PKG_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
else()
set(PKG_BUILD_SHARED_LIBS ON)
endif()
set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared
if(NOT USE_HIP_CPU)
# Get dependencies (required here to get rocm-cmake)
Expand Down Expand Up @@ -172,16 +177,22 @@ if(BUILD_DOCS AND NOT ONLY_INSTALL)
add_subdirectory(docs)
endif()

# set BUILD_SHARED_LIBS for packaging
set(BUILD_SHARED_LIBS ${PKG_BUILD_SHARED_LIBS})
# Package
if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
set(BUILD_SHARED_LIBS ON) # Build as though shared library for naming
# add dependency on HIP runtime
set(HIP_RUNTIME_MINIMUM 4.5.0)
if(BUILD_ADDRESS_SANITIZER)
set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
else()
set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
endif()

rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0")
rocm_package_add_dependencies(SHARED_DEPENDS "${DEPENDS_HIP_RUNTIME} >= ${HIP_RUNTIME_MINIMUM}")
rocm_package_add_deb_dependencies(STATIC_DEPENDS "hip-static-dev >= ${HIP_RUNTIME_MINIMUM}")
rocm_package_add_rpm_dependencies(STATIC_DEPENDS "hip-static-devel >= ${HIP_RUNTIME_MINIMUM}")

set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")

Expand Down
10 changes: 5 additions & 5 deletions docs/sphinx/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ docutils==0.21.2
# myst-parser
# pydata-sphinx-theme
# sphinx
fastjsonschema==2.19.1
fastjsonschema==2.20.0
# via rocm-docs-core
gitdb==4.0.11
# via gitpython
Expand All @@ -62,13 +62,13 @@ mdurl==0.1.2
# via markdown-it-py
myst-parser==3.0.1
# via rocm-docs-core
packaging==24.0
packaging==24.1
# via
# pydata-sphinx-theme
# sphinx
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.15.3
pydata-sphinx-theme==0.15.4
# via
# rocm-docs-core
# sphinx-book-theme
Expand Down Expand Up @@ -111,7 +111,7 @@ sphinx==7.3.7
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
sphinx-book-theme==1.1.2
sphinx-book-theme==1.1.3
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
Expand All @@ -135,7 +135,7 @@ sphinxcontrib-serializinghtml==1.1.10
# via sphinx
tomli==2.0.1
# via sphinx
typing-extensions==4.12.0
typing-extensions==4.12.2
# via
# pydata-sphinx-theme
# pygithub
Expand Down
66 changes: 37 additions & 29 deletions rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,21 +180,25 @@ class block_reduce_warp_reduce
input, output, num_valid, reduce_op
);

// i-th warp will have its partial stored in storage_.warp_partials[i-1]
if(lane_id == 0)
// Final reduction across warps is only required if there is more than 1 warp
if ROCPRIM_IF_CONSTEXPR (warps_no_ > 1)
{
storage_.warp_partials[warp_id] = output;
}
::rocprim::syncthreads();

if(flat_tid < warps_no_)
{
// Use warp partial to calculate the final reduce results for every thread
auto warp_partial = storage_.warp_partials[lane_id];

warp_reduce<!warps_no_is_pow_of_two_, warp_reduce_output_type>(
warp_partial, output, warps_no_, reduce_op
);
// i-th warp will have its partial stored in storage_.warp_partials[i-1]
if(lane_id == 0)
{
storage_.warp_partials[warp_id] = output;
}
::rocprim::syncthreads();

if(flat_tid < warps_no_)
{
// Use warp partial to calculate the final reduce results for every thread
auto warp_partial = storage_.warp_partials[lane_id];

warp_reduce<!warps_no_is_pow_of_two_, warp_reduce_output_type>(
warp_partial, output, warps_no_, reduce_op
);
}
}
}

Expand Down Expand Up @@ -246,22 +250,26 @@ class block_reduce_warp_reduce
input, output, num_valid, reduce_op
);

// i-th warp will have its partial stored in storage_.warp_partials[i-1]
if(lane_id == 0)
// Final reduction across warps is only required if there is more than 1 warp
if ROCPRIM_IF_CONSTEXPR (warps_no_ > 1)
{
storage_.warp_partials[warp_id] = output;
}
::rocprim::syncthreads();

if(flat_tid < warps_no_)
{
// Use warp partial to calculate the final reduce results for every thread
auto warp_partial = storage_.warp_partials[lane_id];

unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_;
warp_reduce_output_type().reduce(
warp_partial, output, valid_warps_no, reduce_op
);
// i-th warp will have its partial stored in storage_.warp_partials[i-1]
if(lane_id == 0)
{
storage_.warp_partials[warp_id] = output;
}
::rocprim::syncthreads();

if(flat_tid < warps_no_)
{
// Use warp partial to calculate the final reduce results for every thread
auto warp_partial = storage_.warp_partials[lane_id];

unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_;
warp_reduce_output_type().reduce(
warp_partial, output, valid_warps_no, reduce_op
);
}
}
}
};
Expand Down

0 comments on commit 4dca5b7

Please sign in to comment.