diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index ec32a65..22ef389 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -4,6 +4,7 @@ using Reexport: @reexport using Adapt: Adapt using Atomix: Atomix +using Base: @propagate_inbounds using GPUArraysCore: AbstractGPUArray using KernelAbstractions: KernelAbstractions, @kernel, @index using LinearAlgebra: dot diff --git a/src/cell_lists/full_grid.jl b/src/cell_lists/full_grid.jl index fea8a89..32cde5d 100644 --- a/src/cell_lists/full_grid.jl +++ b/src/cell_lists/full_grid.jl @@ -156,7 +156,7 @@ function each_cell_index(cell_list::FullGridCellList{Nothing}) error("`search_radius` is not defined for this cell list") end -@inline function cell_index(cell_list::FullGridCellList, cell::Tuple) +@propagate_inbounds function cell_index(cell_list::FullGridCellList, cell::Tuple) (; linear_indices) = cell_list return linear_indices[cell...] @@ -164,7 +164,7 @@ end @inline cell_index(::FullGridCellList, cell::Integer) = cell -@inline function Base.getindex(cell_list::FullGridCellList, cell) +@propagate_inbounds function Base.getindex(cell_list::FullGridCellList, cell) (; cells) = cell_list return cells[cell_index(cell_list, cell)] diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index 4f1ba6b..650de86 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -164,8 +164,13 @@ end @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, parallel::Val{true}) + # Explicit bounds check before the hot loop (or GPU kernel) + @boundscheck checkbounds(system_coords, ndims(neighborhood_search)) + @threaded system_coords for point in points - foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) + # Now we can assume that `point` is inbounds + @inbounds foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, point) end return nothing @@ -175,8 +180,13 @@ end @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, backend::ParallelizationBackend) + # Explicit bounds check before the hot loop (or GPU kernel) + @boundscheck checkbounds(system_coords, ndims(neighborhood_search)) + @threaded backend for point in points - foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) + # Now we can assume that `point` is inbounds + @inbounds foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, point) end return nothing @@ -184,8 +194,13 @@ end @inline function foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, parallel::Val{false}) + # Explicit bounds check before the hot loop + @boundscheck checkbounds(system_coords, ndims(neighborhood_search)) + for point in points - foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) + # Now we can assume that `point` is inbounds + @inbounds foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, point) end return nothing diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 8bb1440..3b5dd26 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -343,18 +343,37 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, Paralle return neighborhood_search end -@inline function foreach_neighbor(f, system_coords, neighbor_system_coords, - neighborhood_search::GridNeighborhoodSearch, point; - search_radius = search_radius(neighborhood_search)) +@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords, + neighborhood_search::GridNeighborhoodSearch, + point; + search_radius = search_radius(neighborhood_search)) + # Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove + # a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg. + # We have to use `@propagate_inbounds`, which will also remove boundschecks + # in the neighbor loop, which is not safe (see comment below). + # To avoid this, we have to use a function barrier to disable the `@inbounds` again. + point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) + + __foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search, + point, point_coords, search_radius) +end + +@inline function __foreach_neighbor(f, system_coords, neighbor_system_coords, + neighborhood_search::GridNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box) = neighborhood_search - point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) cell = cell_coords(point_coords, neighborhood_search) for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) neighbor_cell = Tuple(neighbor_cell_) + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] - for neighbor in points_in_cell(neighbor_cell, neighborhood_search) + # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. + # But we don't know if `neighbor` (extracted from the cell list) is in bounds. neighbor_coords = extract_svector(neighbor_system_coords, Val(ndims(neighborhood_search)), neighbor) @@ -392,7 +411,7 @@ end for cell in neighboring_cells(cell, neighborhood_search)) end -@inline function points_in_cell(cell_index, neighborhood_search) +@propagate_inbounds function points_in_cell(cell_index, neighborhood_search) (; cell_list) = neighborhood_search return cell_list[periodic_cell_index(cell_index, neighborhood_search)] diff --git a/src/util.jl b/src/util.jl index c44e6ff..7cfbfed 100644 --- a/src/util.jl +++ b/src/util.jl @@ -1,6 +1,10 @@ # Return the `i`-th column of the array `A` as an `SVector`. @inline function extract_svector(A, ::Val{NDIMS}, i) where {NDIMS} - return SVector(ntuple(@inline(dim->A[dim, i]), NDIMS)) + # Explicit bounds check, which can be removed by calling this function with `@inbounds` + @boundscheck checkbounds(A, NDIMS, i) + + # Assume inbounds access now + return SVector(ntuple(@inline(dim->@inbounds A[dim, i]), NDIMS)) end # When particles end up with coordinates so big that the cell coordinates @@ -13,13 +17,20 @@ end # If we threw an error here, we would prevent the time integration method from # retrying with a smaller time step, and we would thus crash perfectly fine simulations. @inline function floor_to_int(i) - if isnan(i) || i > typemax(Int) + # `Base.floor(Int, i)` is defined as `trunc(Int, round(x, RoundDown))` + rounded = round(i, RoundDown) + + # `Base.trunc(Int, x)` throws an `InexactError` in these cases, and otherwise + # returns `unsafe_trunc(Int, rounded)`. + if isnan(rounded) || rounded >= typemax(Int) return typemax(Int) - elseif i < typemin(Int) + elseif rounded <= typemin(Int) return typemin(Int) end - return floor(Int, i) + # After making sure that `rounded` is in the range of `Int`, + # we can safely call `unsafe_trunc`. + return unsafe_trunc(Int, rounded) end abstract type AbstractThreadingBackend end @@ -134,7 +145,7 @@ end # Call the generic kernel that is defined below, which only calls a function with # the global GPU index. generic_kernel(backend)(ndrange = ndrange) do i - @inline f(iterator[indices[i]]) + @inbounds @inline f(iterator[indices[i]]) end KernelAbstractions.synchronize(backend) diff --git a/src/vector_of_vectors.jl b/src/vector_of_vectors.jl index afab672..63a9311 100644 --- a/src/vector_of_vectors.jl +++ b/src/vector_of_vectors.jl @@ -26,9 +26,10 @@ end @inline function Base.getindex(vov::DynamicVectorOfVectors, i) (; backend, lengths) = vov + # This is slightly faster than without explicit boundscheck and `@inbounds` below @boundscheck checkbounds(vov, i) - return view(backend, 1:lengths[i], i) + return @inbounds view(backend, 1:lengths[i], i) end @inline function Base.push!(vov::DynamicVectorOfVectors, vector::AbstractVector)