From fb3643e509487f4522fafb3ba519f14e914aebd3 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 7 Nov 2024 22:29:15 +0100 Subject: [PATCH 1/7] Make internals inbounds --- src/util.jl | 20 +++++++++++++++----- src/vector_of_vectors.jl | 3 ++- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/util.jl b/src/util.jl index c44e6ff..02b3e58 100644 --- a/src/util.jl +++ b/src/util.jl @@ -1,6 +1,9 @@ # Return the `i`-th column of the array `A` as an `SVector`. @inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS} - return SVector(ntuple(@inline(dim->A[dim, i]), NDIMS)) + # Inlining this makes the WCSPH benchmark ~8% faster on an H100 GPU. + # Even when adding an explicit bounds check before, it is still ~4% faster. + # However, inlining makes it ~25% slower on an RTX 3090. + return SVector(ntuple(@inline(dim -> A[dim, i]), NDIMS)) end # When particles end up with coordinates so big that the cell coordinates @@ -13,13 +16,20 @@ end # If we threw an error here, we would prevent the time integration method from # retrying with a smaller time step, and we would thus crash perfectly fine simulations. @inline function floor_to_int(i) - if isnan(i) || i > typemax(Int) + # `Base.floor(Int, i)` is defined as `trunc(Int, round(x, RoundDown))` + rounded = round(i, RoundDown) + + # `Base.trunc(Int, x)` throws an `InexactError` in these cases, and otherwise + # returns `unsafe_trunc(Int, rounded)`. + if isnan(rounded) || rounded >= typemax(Int) return typemax(Int) - elseif i < typemin(Int) + elseif rounded <= typemin(Int) return typemin(Int) end - return floor(Int, i) + # After making sure that `rounded` is in the range of `Int`, + # we can safely call `unsafe_trunc`. + return unsafe_trunc(Int, rounded) end abstract type AbstractThreadingBackend end @@ -134,7 +144,7 @@ end # Call the generic kernel that is defined below, which only calls a function with # the global GPU index. generic_kernel(backend)(ndrange = ndrange) do i - @inline f(iterator[indices[i]]) + @inbounds @inline f(iterator[indices[i]]) end KernelAbstractions.synchronize(backend) diff --git a/src/vector_of_vectors.jl b/src/vector_of_vectors.jl index afab672..63a9311 100644 --- a/src/vector_of_vectors.jl +++ b/src/vector_of_vectors.jl @@ -26,9 +26,10 @@ end @inline function Base.getindex(vov::DynamicVectorOfVectors, i) (; backend, lengths) = vov + # This is slightly faster than without explicit boundscheck and `@inbounds` below @boundscheck checkbounds(vov, i) - return view(backend, 1:lengths[i], i) + return @inbounds view(backend, 1:lengths[i], i) end @inline function Base.push!(vov::DynamicVectorOfVectors, vector::AbstractVector) From 0275d6173fb6f1670e5a34ac03f9bfa520095d82 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 8 Nov 2024 11:06:08 +0100 Subject: [PATCH 2/7] Add inbounds to extract_svector --- src/util.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/util.jl b/src/util.jl index 02b3e58..4a5df79 100644 --- a/src/util.jl +++ b/src/util.jl @@ -1,9 +1,10 @@ # Return the `i`-th column of the array `A` as an `SVector`. @inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS} - # Inlining this makes the WCSPH benchmark ~8% faster on an H100 GPU. - # Even when adding an explicit bounds check before, it is still ~4% faster. - # However, inlining makes it ~25% slower on an RTX 3090. - return SVector(ntuple(@inline(dim -> A[dim, i]), NDIMS)) + # Explicit bounds check, which can be removed by calling this function with `@inbounds` + @boundscheck checkbounds(A, NDIMS, i) + + # Assume inbounds access now + return SVector(ntuple(@inline(dim -> @inbounds A[dim, i]), NDIMS)) end # When particles end up with coordinates so big that the cell coordinates From a27a8fb687fb7250cf178d847abb23d7e7a3aac3 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:02:21 +0100 Subject: [PATCH 3/7] Remove bounds check for particle coords --- src/PointNeighbors.jl | 1 + src/neighborhood_search.jl | 21 ++++++++++++++++++--- src/nhs_grid.jl | 24 ++++++++++++++++++++---- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index ec32a65..22ef389 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -4,6 +4,7 @@ using Reexport: @reexport using Adapt: Adapt using Atomix: Atomix +using Base: @propagate_inbounds using GPUArraysCore: AbstractGPUArray using KernelAbstractions: KernelAbstractions, @kernel, @index using LinearAlgebra: dot diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index 4f1ba6b..650de86 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -164,8 +164,13 @@ end @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, parallel::Val{true}) + # Explicit bounds check before the hot loop (or GPU kernel) + @boundscheck checkbounds(system_coords, ndims(neighborhood_search)) + @threaded system_coords for point in points - foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) + # Now we can assume that `point` is inbounds + @inbounds foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, point) end return nothing @@ -175,8 +180,13 @@ end @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, backend::ParallelizationBackend) + # Explicit bounds check before the hot loop (or GPU kernel) + @boundscheck checkbounds(system_coords, ndims(neighborhood_search)) + @threaded backend for point in points - foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) + # Now we can assume that `point` is inbounds + @inbounds foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, point) end return nothing @@ -184,8 +194,13 @@ end @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, parallel::Val{false}) + # Explicit bounds check before the hot loop + @boundscheck checkbounds(system_coords, ndims(neighborhood_search)) + for point in points - foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) + # Now we can assume that `point` is inbounds + @inbounds foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, point) end return nothing diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index fef556b..c14b97e 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -337,18 +337,34 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle return neighborhood_search end -@inline function foreach_neighbor(f, system_coords, neighbor_system_coords, - neighborhood_search::GridNeighborhoodSearch, point; - search_radius = search_radius(neighborhood_search)) +@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords, + neighborhood_search::GridNeighborhoodSearch, + point; + search_radius = search_radius(neighborhood_search)) + # Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove + # a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg. + # We have to use `@propagate_inbounds`, which will also remove boundschecks + # in the neighbor loop, which is not safe (see comment below). + # To avoid this, we have to use a functio barrier to disable the `@inbounds` again. + point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) + + __foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search, + point, point_coords, search_radius) +end + +@inline function __foreach_neighbor(f, system_coords, neighbor_system_coords, + neighborhood_search::GridNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box) = neighborhood_search - point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) cell = cell_coords(point_coords, neighborhood_search) for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) neighbor_cell = Tuple(neighbor_cell_) for neighbor in points_in_cell(neighbor_cell, neighborhood_search) + # Making the following `@inbounds` yields a ~3% speedup on an NVIDIA H100. + # But we don't know if `neighbor` (extracted from the cell list) is in bounds. neighbor_coords = extract_svector(neighbor_system_coords, Val(ndims(neighborhood_search)), neighbor) From 64ba1ceef3c4ab4168bbdc6aa33251191df527d4 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:02:54 +0100 Subject: [PATCH 4/7] Avoid one additional boundscheck --- src/cell_lists/full_grid.jl | 4 ++-- src/nhs_grid.jl | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/cell_lists/full_grid.jl b/src/cell_lists/full_grid.jl index 4e8603d..e6424f9 100644 --- a/src/cell_lists/full_grid.jl +++ b/src/cell_lists/full_grid.jl @@ -156,7 +156,7 @@ function each_cell_index(cell_list::FullGridCellList{Nothing}) error("`search_radius` is not defined for this cell list") end -@inline function cell_index(cell_list::FullGridCellList, cell::Tuple) +@propagate_inbounds function cell_index(cell_list::FullGridCellList, cell::Tuple) (; linear_indices) = cell_list return linear_indices[cell...] @@ -164,7 +164,7 @@ end @inline cell_index(::FullGridCellList, cell::Integer) = cell -@inline function Base.getindex(cell_list::FullGridCellList, cell) +@propagate_inbounds function Base.getindex(cell_list::FullGridCellList, cell) (; cells) = cell_list return cells[cell_index(cell_list, cell)] diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index c14b97e..3e7a27f 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -361,8 +361,11 @@ end for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) neighbor_cell = Tuple(neighbor_cell_) + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] - for neighbor in points_in_cell(neighbor_cell, neighborhood_search) # Making the following `@inbounds` yields a ~3% speedup on an NVIDIA H100. # But we don't know if `neighbor` (extracted from the cell list) is in bounds. neighbor_coords = extract_svector(neighbor_system_coords, @@ -402,7 +405,7 @@ end for cell in neighboring_cells(cell, neighborhood_search)) end -@inline function points_in_cell(cell_index, neighborhood_search) +@propagate_inbounds function points_in_cell(cell_index, neighborhood_search) (; cell_list) = neighborhood_search return cell_list[periodic_cell_index(cell_index, neighborhood_search)] From 4f938d33e02afde4612e1a6d991eb6077bb2c31b Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:39:06 +0100 Subject: [PATCH 5/7] Update expected speedup for inbounds --- src/nhs_grid.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 3e7a27f..4c7278d 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -366,7 +366,7 @@ end for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] - # Making the following `@inbounds` yields a ~3% speedup on an NVIDIA H100. + # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. # But we don't know if `neighbor` (extracted from the cell list) is in bounds. neighbor_coords = extract_svector(neighbor_system_coords, Val(ndims(neighborhood_search)), neighbor) From 339dae22f284227c785f5cc5b9dd082487a528b9 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:45:28 +0100 Subject: [PATCH 6/7] Fix typo --- src/nhs_grid.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 4c7278d..e8a2bb3 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -345,7 +345,7 @@ end # a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg. # We have to use `@propagate_inbounds`, which will also remove boundschecks # in the neighbor loop, which is not safe (see comment below). - # To avoid this, we have to use a functio barrier to disable the `@inbounds` again. + # To avoid this, we have to use a function barrier to disable the `@inbounds` again. point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) __foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search, From f90b24e903dad74cba7e9f78733aeb38a509f881 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:46:43 +0100 Subject: [PATCH 7/7] Reformat code --- src/util.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.jl b/src/util.jl index 4a5df79..7cfbfed 100644 --- a/src/util.jl +++ b/src/util.jl @@ -4,7 +4,7 @@ @boundscheck checkbounds(A, NDIMS, i) # Assume inbounds access now - return SVector(ntuple(@inline(dim -> @inbounds A[dim, i]), NDIMS)) + return SVector(ntuple(@inline(dim->@inbounds A[dim, i]), NDIMS)) end # When particles end up with coordinates so big that the cell coordinates