diff --git a/Project.toml b/Project.toml
index 62f5f0a..646c778 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,13 @@ version = "0.1.2"
 
 [compat]
 julia = "1.10"
+StaticArrays = "1"
+
+[weakdeps]
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[extensions]
+UnrolledUtilitiesStaticArraysExt = "StaticArrays"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/docs/src/index.md b/docs/src/index.md
index 8aaec38..10ac6bc 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,16 +1,26 @@
+```@meta
+CurrentModule = UnrolledUtilities
+```
+
 # UnrolledUtilities.jl
 
-A collection of generated functions in which all loops are unrolled and inlined:
+## Unrolled Functions
+
+This package exports the following functions, in which all loops are unrolled
+and inlined:
 - `unrolled_any(f, itr)`: similar to `any`
 - `unrolled_all(f, itr)`: similar to `all`
 - `unrolled_foreach(f, itrs...)`: similar to `foreach`
 - `unrolled_map(f, itrs...)`: similar to `map`
 - `unrolled_reduce(op, itr; [init])`: similar to `reduce`
 - `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce`
-- `unrolled_zip(itrs...)`: similar to `zip`
-- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
-  handle multiple iterators
+- `unrolled_accumulate(op, itr; [init], [transform])`: similar to `accumulate`,
+  but with an optional `transform` function applied to every accumulated value
+- `unrolled_mapaccumulate(f, op, itrs...; [init], [transform])`: a combination
+  of `unrolled_map` and `unrolled_accumulate`, analogous to `unrolled_mapreduce`
 - `unrolled_in(item, itr)`: similar to `in`
+- `unrolled_push(itr, item)`: similar to `push!`, but non-mutating
+- `unrolled_append(itr1, itr2)`: similar to `append!`, but non-mutating
 - `unrolled_unique(itr)`: similar to `unique`
 - `unrolled_filter(f, itr)`: similar to `filter`
 - `unrolled_split(f, itr)`: similar to `(filter(f, itr), filter(!f, itr))`, but
@@ -42,33 +52,77 @@ iterators have singleton element types (and when the result of calling `f`
 and/or `op` on these elements is inferrable). However, they can also be much
 more expensive to compile than their counterparts from `Base` and
 `Base.Iterators`, in which case they should not be used unless there is a clear
-performance benefit. Some notable exceptions to this are `unrolled_zip`,
-`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than
-`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation.
+performance benefit. Two notable exceptions to this are `unrolled_take` and
+`unrolled_drop`, which are faster to compile than `Iterators.take` and
+`Iterators.drop`.
+
+## Interface
+
+These functions can be used to unroll loops over all iterators with statically
+inferrable lengths.
+Compatibility with any such iterator type can be added through the following
+interface:
+
+```@docs
+length_from_type
+target_output_type
+target_output_type_for_promotion
+output_promote_rule
+eltype_restriction
+output_constructor
+empty_output
+```
+
+## Lazy and Low-Storage Iterators
+
+The interface above is used to provide built-in compatibility with `Tuple`s and
+`SVector`s (from [StaticArrays.jl](https://github.com/JuliaArrays/StaticArrays.jl)),
+and also with subtypes of `StaticSequence`:
+
+```@docs
+StaticSequence
+LazyMap
+LazySequence
+BitSequence
+```
+
+This package also exports several "lazy" functions that generate `LazyMap`s:
+- `lazy_map(f, itrs...)`: similar to `map`
+- `lazy_zip(itrs...)`: similar to `zip`
+- `lazy_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
+  handle multiple iterators
+
+When used in conjunction with a `LazySequence` or `BitSequence`, these functions
+can result in significantly lower register pressure than `unrolled_map` or
+similarly unrolled versions of `zip` and `enumerate`.
+
+## When to Unroll
 
 For a more precise indication of whether you should use `UnrolledUtilities`,
 please consult the autogenerated [Comparison Table](@ref). This table contains a
-comprehensive set of potential use cases, each with a measurement of performance
-optimization, the time required for compilation, and the memory usage during
-compilation. Most cases involve simple functions `f` and/or `op`, but the last
-few demonstrate the benefits of unrolling with non-trivial recursive functions.
+comprehensive set of potential use cases, along with a few measurements that
+summarize their performance, compilation, and allocations:
+- overall level of optimization (type stability, constant propagation, etc.)
+- run time (best of several trial measurements)
+- compilation time (as reported by the compiler)
+- memory usage during compilation and first run (as reported by the garbage
+  collector and, when possible, the Julia process's resident set size estimator)
 
 The rows of the table are highlighted as follows:
-- green indicates an improvement in performance and either no change in
-  compilation or easier compilation (i.e., either similar or smaller values of
-  compilation time and memory usage)
-- dark blue indicates an improvement in performance and harder compilation
-  (i.e., larger values of compilation time and/or memory usage)
-- light blue indicates no change in performance and easier compilation
-- yellow indicates no change in performance and no change in compilation
-- magenta indicates no change in performance, an increase in compilation time,
-  and a decrease in compilation memory usage
-- red indicates no change in performance and harder compilation
+- green indicates an improvement in performance and either an improvement or
+  no change in compilation and allocations
+- dark blue indicates an improvement in performance and either slower
+  compilation or more allocations
+- light blue indicates no change in performance and either faster compilation or
+  fewer allocations
+- magenta indicates no change in performance and either faster compilation with
+  more allocations or slower compilation with fewer allocations
+- yellow indicates no change in performance, compilation, or allocations
+- red indicates a deterioration in performance, or no change in
+  performance and either slower compilation or more allocations
 
 Rows highlighted in green and blue present a clear advantage for unrolling,
-whereas those highlighted in yellow, magenta, and red either have no clear
-advantage, or they have a clear disadvantage. It is recommended that you only
-unroll when your use case is similar to a row in the first category.
+whereas those highlighted in magenta, yellow, and red either have no clear
+advantage or have a clear disadvantage. It is recommended that you only unroll
+when your use case is similar to a row in the first category.
 
 The table is also printed out by this package's unit tests, so these
 measurements can be compared across different operating systems by checking the
diff --git a/ext/UnrolledUtilitiesStaticArraysExt.jl b/ext/UnrolledUtilitiesStaticArraysExt.jl
new file mode 100644
index 0000000..09d2bf3
--- /dev/null
+++ b/ext/UnrolledUtilitiesStaticArraysExt.jl
@@ -0,0 +1,10 @@
+module UnrolledUtilitiesStaticArraysExt
+
+import UnrolledUtilities
+import StaticArrays: SVector
+
+UnrolledUtilities.length_from_type(::Type{<:SVector{N}}) where {N} = N
+UnrolledUtilities.target_output_type(::SVector) = SVector
+UnrolledUtilities.output_constructor(::Type{SVector}) = SVector
+
+end
diff --git a/src/BitSequence.jl b/src/BitSequence.jl
new file mode 100644
index 0000000..77076dd
--- /dev/null
+++ b/src/BitSequence.jl
@@ -0,0 +1,150 @@
+"""
+    BitSequence{N, [U]}(f)
+    BitSequence{N, [U]}([bit])
+
+A statically-sized analogue of `BitVector` with `Unsigned` chunks of type `U`,
+which can be constructed using either a function `f(n)` or a constant `bit`. By
+default, `U` is set to `UInt8` and `bit` is set to `false`.
+
+Efficient methods are provided for `unrolled_map`, `unrolled_accumulate`,
+`unrolled_take`, and `unrolled_drop`, though the methods for `unrolled_map` and
+`unrolled_accumulate` only apply when their outputs consist of `Bool`s. All
+other unrolled functions that need to construct non-empty iterators convert
+`BitSequence`s into `Tuple`s.
+"""
+struct BitSequence{N, U <: Unsigned, I <: NTuple{<:Any, U}} <: StaticSequence{N}
+    ints::I
+end
+BitSequence{N, U}(ints::I) where {N, U <: Unsigned, I <: NTuple{<:Any, U}} =
+    BitSequence{N, U, I}(ints)
+BitSequence{N}(args...) where {N} = BitSequence{N, UInt8}(args...)
+
+function BitSequence{N, U}(bit::Bool = false) where {N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    int = bit ? ~zero(U) : zero(U)
+    ints = ntuple(_ -> int, Val(n_ints))
+    return BitSequence{N, U}(ints)
+end
+
+function BitSequence{N, U}(f) where {N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = ntuple(Val(n_ints)) do int_index
+        @inline
+        first_index = n_bits_per_int * (int_index - 1) + 1
+        unrolled_reduce(
+            LazySequence{min(n_bits_per_int, N - first_index + 1)}(0);
+            init = zero(U),
+        ) do int, bit_offset
+            int | U(f(first_index + bit_offset)::Bool) << bit_offset
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+target_output_type(::BitSequence{<:Any, U}) where {U} = BitSequence{<:Any, U}
+
+output_promote_rule(::Type{B}, ::Type{O}) where {B <: BitSequence, O} = O
+output_promote_rule(::Type{B}, ::Type{Tuple}) where {B <: BitSequence} = Tuple
+output_promote_rule(::Type{B}, ::Type{LazySequence}) where {B <: BitSequence} =
+    B
+
+eltype_restriction(::Type{<:BitSequence}) = Bool
+
+empty_output(::Type{BitSequence{<:Any, U}}) where {U} = BitSequence{0, U}()
+
+@inline function unrolled_map_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    f,
+    itrs...,
+) where {U}
+    lazy_itr = lazy_map(f, itrs...)
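+    # Each mapped Bool is computed on demand via Base.Fix1(getindex, lazy_itr)
+    # while the integer chunks are filled in, so no intermediate Tuple of Bools
+    # needs to be materialized.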
+    N = inferred_length(lazy_itr)
+    return BitSequence{N, U}(Base.Fix1(getindex, lazy_itr))
+end
+
+@inline function unrolled_accumulate_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    op,
+    itr,
+    init,
+    transform,
+) where {U}
+    N = inferred_length(itr)
+    (N == 0 && init isa NoInit) &&
+        error("unrolled_accumulate requires an init value for empty iterators")
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = unrolled_accumulate_into_tuple(
+        LazySequence{n_ints}();
+        init = (nothing, init),
+        transform = first,
+    ) do (_, init_value_for_new_int), int_index
+        @inline
+        first_index = n_bits_per_int * (int_index - 1) + 1
+        unrolled_reduce(
+            LazySequence{min(n_bits_per_int, N - first_index + 1)}(0);
+            init = (zero(U), init_value_for_new_int),
+        ) do (int, prev_value), bit_offset
+            item = itr[first_index + bit_offset]
+            new_value =
+                first_index + bit_offset == 1 && prev_value isa NoInit ?
+                item : op(prev_value, item)
+            (int | U(transform(new_value)::Bool) << bit_offset, new_value)
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+@inline function unrolled_take(
+    itr::BitSequence{<:Any, U},
+    ::Val{N},
+) where {N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = unrolled_take(itr.ints, Val(n_ints))
+    return BitSequence{N, U}(ints)
+end
+
+@inline function unrolled_drop(
+    itr::BitSequence{N_old, U},
+    ::Val{N},
+) where {N_old, N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N_old - N, n_bits_per_int)
+    n_dropped_ints = length(itr.ints) - n_ints
+    bit_offset = N - n_bits_per_int * n_dropped_ints
+    ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints))
+    ints = if bit_offset == 0
+        ints_without_offset
+    else
+        cur_ints = ints_without_offset
+        next_ints = unrolled_push(unrolled_drop(cur_ints, Val(1)), nothing)
+        unrolled_map_into_tuple(cur_ints, next_ints) do cur_int, next_int
+            isnothing(next_int) ? cur_int >> bit_offset :
+            cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset)
+        end
+    end
+    return BitSequence{N_old - N, U}(ints)
+end
+
+@inline function int_index_and_bit_offset(itr, n)
+    int_offset, bit_offset = divrem(n - 1, 8 * sizeof(eltype(itr.ints)))
+    return (int_offset + 1, bit_offset)
+end
+
+@inline function Base.getindex(itr::BitSequence, n::Integer)
+    int_index, bit_offset = int_index_and_bit_offset(itr, n)
+    int = itr.ints[int_index]
+    return Bool(int >> bit_offset & one(int))
+end
+
+@inline function Base.setindex(itr::BitSequence, bit::Bool, n::Integer)
+    int_index, bit_offset = int_index_and_bit_offset(itr, n)
+    int = itr.ints[int_index]
+    int′ = int & ~(one(int) << bit_offset) | typeof(int)(bit) << bit_offset
+    return typeof(itr)(Base.setindex(itr.ints, int′, int_index))
+end
+
+@inline Base.eltype(::BitSequence) = Bool
diff --git a/src/LazyMap.jl b/src/LazyMap.jl
new file mode 100644
index 0000000..83a82f4
--- /dev/null
+++ b/src/LazyMap.jl
@@ -0,0 +1,56 @@
+"""
+    LazyMap{N}(f, itrs...)
+
+A lazy and statically-sized analogue of a `Base.AbstractBroadcasted` object
+whose elements and `target_output_type` are consistent with
+`unrolled_map(f, itrs...)`.
+
+Efficient methods are provided for `unrolled_take` and `unrolled_drop`. All
+other unrolled functions that need to construct non-empty iterators convert
+`LazyMap`s into their `output_type`s.
+"""
+struct LazyMap{N, F, I} <: StaticSequence{N}
+    f::F
+    itrs::I
+end
+LazyMap{N}(f, itrs...) where {N} = LazyMap{N, typeof(f), typeof(itrs)}(f, itrs)
+
+target_output_type(itr::LazyMap) = output_type_of_map(itr.f, itr.itrs...)
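+
+# For example (a hypothetical session, not part of the package's tests), a
+# LazyMap computes its elements on demand instead of storing them:
+#     lazy_map(abs2, (1, 2, 3))[3] == 9
+#     lazy_zip((1, 2), (3, 4))[1] == (1, 3)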
+
+# Ignore eltype restrictions during the promotion process, until the final step.
+target_output_type_for_promotion(itr::LazyMap) =
+    promoted_target_output_type(itr.itrs)
+
+@inline unrolled_fix2(f, arg, itrs) =
+    unrolled_map_into_tuple((@inline itr -> f(itr, arg)), itrs)
+
+@inline unrolled_take(itr::LazyMap, ::Val{N}) where {N} =
+    LazyMap{N}(itr.f, unrolled_fix2(unrolled_take, Val(N), itr.itrs)...)
+
+@inline unrolled_drop(itr::LazyMap{N_old}, ::Val{N}) where {N_old, N} =
+    LazyMap{N_old - N}(itr.f, unrolled_fix2(unrolled_drop, Val(N), itr.itrs)...)
+
+# Work around the recursion limit for getindex to handle chains of LazyMaps.
+@inline Base.getindex(itr::LazyMap, n::Integer) = lazy_map_getindex(itr, n)
+@inline lazy_map_getindex(itr, n) = getindex(itr, n)
+@inline lazy_map_getindex(itr::LazyMap, n) =
+    itr.f(unrolled_fix2(lazy_map_getindex, n, itr.itrs)...)
+@static if hasfield(Method, :recursion_relation)
+    for method in methods(lazy_map_getindex)
+        method.recursion_relation = (_...) -> true
+    end
+end
+
+@inline Base.eltype(itr::LazyMap) =
+    Base.promote_op(itr.f, unrolled_map_into_tuple(eltype, itr.itrs)...)
+
+################################################################################
+
+@inline lazy_map(f, itr) = LazyMap{inferred_length(itr)}(f, itr)
+@inline lazy_map(f, itrs...) = LazyMap{minimum_length(itrs)}(f, itrs...)
+# The first method avoids the recursion lazy_map → minimum_length → lazy_map.
+
+@inline lazy_zip(itrs...) = lazy_map(tuple, itrs...)
+
+@inline lazy_enumerate(itrs...) =
+    lazy_zip(LazySequence{minimum_length(itrs)}(), itrs...)
diff --git a/src/LazySequence.jl b/src/LazySequence.jl
new file mode 100644
index 0000000..9fa03cb
--- /dev/null
+++ b/src/LazySequence.jl
@@ -0,0 +1,34 @@
+"""
+    LazySequence{N}(f)
+    LazySequence{N}([start])
+
+A lazy analogue of `ntuple(f, Val(N))`, or a lazy and statically-sized analogue
+of `start:(start - 1 + N)`. By default, `start` is set to 1.
+
+Efficient methods are provided for `unrolled_take` and `unrolled_drop`. All
+other unrolled functions that need to construct non-empty iterators convert
+`LazySequence`s into `Tuple`s.
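+
+For example, `LazySequence{3}(10)` represents the sequence `(10, 11, 12)`
+without storing it, so `LazySequence{3}(10)[2] == 11`.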
+""" +struct LazySequence{N, F} <: StaticSequence{N} + f::F +end +LazySequence{N}(f = identity) where {N} = LazySequence{N, typeof(f)}(f) +LazySequence{N}(start::Number) where {N} = + LazySequence{N}(Base.Fix1(+, start - one(start))) + +target_output_type(::LazySequence) = LazySequence + +output_promote_rule(::Type{LazySequence}, ::Type{O}) where {O} = O +output_promote_rule(::Type{LazySequence}, ::Type{Tuple}) = Tuple + +empty_output(::Type{LazySequence}) = LazySequence{0}() + +@inline unrolled_take(itr::LazySequence, ::Val{N}) where {N} = + LazySequence{N}(itr.f) + +@inline unrolled_drop(itr::LazySequence{N_old}, ::Val{N}) where {N_old, N} = + LazySequence{N_old - N}(n -> itr.f(n + N)) + +@inline Base.getindex(itr::LazySequence, n::Integer) = itr.f(n) + +@inline Base.eltype(itr::LazySequence) = Base.promote_op(itr.f, Int) diff --git a/src/UnrolledUtilities.jl b/src/UnrolledUtilities.jl index dc69559..70d3c78 100644 --- a/src/UnrolledUtilities.jl +++ b/src/UnrolledUtilities.jl @@ -6,9 +6,11 @@ export unrolled_any, unrolled_map, unrolled_reduce, unrolled_mapreduce, - unrolled_zip, - unrolled_enumerate, + unrolled_accumulate, + unrolled_mapaccumulate, unrolled_in, + unrolled_push, + unrolled_append, unrolled_unique, unrolled_filter, unrolled_split, @@ -17,107 +19,140 @@ export unrolled_any, unrolled_product, unrolled_applyat, unrolled_take, - unrolled_drop + unrolled_drop, + lazy_map, + lazy_zip, + lazy_enumerate, + LazySequence, + BitSequence -inferred_length(::Type{<:NTuple{N, Any}}) where {N} = N -# We could also add support for statically-sized iterators that are not Tuples. +struct NoInit end # Analogue of Base._InitialValue for reduction/accumulation. -f_exprs(itr_type) = (:(f(itr[$n])) for n in 1:inferred_length(itr_type)) -@inline @generated unrolled_any(f, itr) = Expr(:||, f_exprs(itr)...) -@inline @generated unrolled_all(f, itr) = Expr(:&&, f_exprs(itr)...) +include("generated_functions.jl") +include("unrollable_interface.jl") -function zipped_f_exprs(itr_types) - L = length(itr_types) - L == 0 && error("unrolled functions need at least one iterator as input") - N = minimum(inferred_length, itr_types) - return (:(f($((:(itrs[$l][$n]) for l in 1:L)...))) for n in 1:N) -end -@inline @generated unrolled_foreach(f, itrs...) = - Expr(:block, zipped_f_exprs(itrs)..., nothing) -@inline @generated unrolled_map(f, itrs...) = - Expr(:tuple, zipped_f_exprs(itrs)...) - -function nested_op_expr(itr_type) - N = inferred_length(itr_type) - N == 0 && error("unrolled_reduce needs an `init` value for empty iterators") - item_exprs = (:(itr[$n]) for n in 1:N) - return reduce((expr1, expr2) -> :(op($expr1, $expr2)), item_exprs) -end -@inline @generated unrolled_reduce_without_init(op, itr) = nested_op_expr(itr) +@inline unrolled_map_into_target(output_type, f, itrs...) = + output_constructor(output_type)(unrolled_map_into_tuple(f, itrs...)) +@inline unrolled_map(f, itrs...) = + unrolled_map_into_target(output_type_of_map(f, itrs...), f, itrs...) -struct NoInit end @inline unrolled_reduce(op, itr; init = NoInit()) = - unrolled_reduce_without_init(op, init isa NoInit ? itr : (init, itr...)) - + unrolled_reduce(op, itr, init) @inline unrolled_mapreduce(f, op, itrs...; init = NoInit()) = - unrolled_reduce(op, unrolled_map(f, itrs...); init) + unrolled_reduce(op, lazy_map(f, itrs...), init) -@inline unrolled_zip(itrs...) = unrolled_map(tuple, itrs...) - -@inline unrolled_enumerate(itrs...) = - unrolled_zip(ntuple(identity, Val(length(itrs[1]))), itrs...) 
+@inline unrolled_accumulate_into_target(output_type, op, itr, init, transform) =
+    output_constructor(output_type)(
+        unrolled_accumulate_into_tuple(op, itr, init, transform),
+    )
+@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) =
+    unrolled_accumulate_into_target(
+        output_type_of_accumulate(op, itr, init, transform),
+        op,
+        itr,
+        init,
+        transform,
+    )
+@inline unrolled_mapaccumulate(f, op, itrs...; kwargs...) =
+    unrolled_accumulate(op, lazy_map(f, itrs...); kwargs...)
 
 @inline unrolled_in(item, itr) = unrolled_any(Base.Fix1(===, item), itr)
 # Using === instead of == or isequal improves type stability for singletons.
 
+@inline unrolled_push(itr, item) = inferred_constructor(itr)((itr..., item))
+
+@inline unrolled_append(itr1, itr2) =
+    promoted_constructor((itr1, itr2))((itr1..., itr2...))
+
 @inline unrolled_unique(itr) =
-    unrolled_reduce(itr; init = ()) do unique_items, item
+    unrolled_reduce(itr; init = inferred_empty(itr)) do unique_items, item
         @inline
-        unrolled_in(item, unique_items) ? unique_items : (unique_items..., item)
+        unrolled_in(item, unique_items) ? unique_items :
+        unrolled_push(unique_items, item)
     end
 
 @inline unrolled_filter(f, itr) =
-    unrolled_reduce(itr; init = ()) do filtered_items, item
+    unrolled_reduce(itr; init = inferred_empty(itr)) do filtered_items, item
         @inline
-        f(item) ? (filtered_items..., item) : filtered_items
+        f(item) ? unrolled_push(filtered_items, item) : filtered_items
     end
 
-@inline unrolled_split(f, itr) =
-    unrolled_reduce(itr; init = ((), ())) do (f_items, not_f_items), item
+@inline function unrolled_split(f, itr)
+    init = (inferred_empty(itr), inferred_empty(itr))
+    return unrolled_reduce(itr; init) do (f_items, not_f_items), item
         @inline
-        f(item) ? ((f_items..., item), not_f_items) :
-        (f_items, (not_f_items..., item))
+        f(item) ? (unrolled_push(f_items, item), not_f_items) :
+        (f_items, unrolled_push(not_f_items, item))
     end
+end
 
 @inline unrolled_flatten(itr) =
-    unrolled_reduce((item1, item2) -> (item1..., item2...), itr; init = ())
+    unrolled_reduce(unrolled_append, itr; init = promoted_empty(itr))
 
-@inline unrolled_flatmap(f, itrs...) =
-    unrolled_flatten(unrolled_map(f, itrs...))
+@inline unrolled_flatmap(f, itrs...) = unrolled_flatten(lazy_map(f, itrs...))
 
 @inline unrolled_product(itrs...) =
-    unrolled_reduce(itrs; init = ((),)) do product_itr, itr
+    unrolled_reduce(itrs; init = (promoted_empty(itrs),)) do product_itr, itr
         @inline
         unrolled_flatmap(itr) do item
             @inline
-            unrolled_map(product_tuple -> (product_tuple..., item), product_itr)
+            unrolled_map_into_tuple(Base.Fix2(unrolled_push, item), product_itr)
         end
     end
 
 @inline unrolled_applyat(f, n, itrs...) = unrolled_foreach(
-    (i, items...) -> i == n && f(items...),
-    unrolled_enumerate(itrs...),
+    (n′, items...) -> n′ == n && f(items...),
+    lazy_enumerate(itrs...),
 )
 
-@inline unrolled_take(itr, ::Val{N}) where {N} = ntuple(i -> itr[i], Val(N))
-@inline unrolled_drop(itr, ::Val{N}) where {N} =
-    ntuple(i -> itr[N + i], Val(length(itr) - N))
-# When its second argument is a Val, ntuple is unrolled via Base.@ntuple.
+@inline unrolled_take(itr, ::Val{N}) where {N} =
+    inferred_constructor(itr)(ntuple(Base.Fix1(getindex, itr), Val(N)))
+
+@inline unrolled_drop(itr, ::Val{N}) where {N} = inferred_constructor(itr)(
+    ntuple(n -> itr[N + n], Val(inferred_length(itr) - N)),
+)
+
+"""
+    StaticSequence{N}
+
+A statically-sized iterator of length `N` for which some unrolled functions have
+lazy or low-storage implementations.
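+
+Subtypes are expected to provide `Base.getindex`; `length_from_type`, `length`,
+`firstindex`, `lastindex`, and `iterate` are defined for all `StaticSequence`s
+below.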
+""" +abstract type StaticSequence{N} end + +length_from_type(::Type{<:StaticSequence{N}}) where {N} = N + +Base.firstindex(itr::StaticSequence) = 1 +Base.lastindex(itr::StaticSequence) = inferred_length(itr) +Base.length(itr::StaticSequence) = inferred_length(itr) + +@inline Base.iterate(itr::StaticSequence, index = 1) = + index > inferred_length(itr) ? nothing : (itr[index], index + 1) + +include("LazyMap.jl") +include("LazySequence.jl") +include("BitSequence.jl") @static if hasfield(Method, :recursion_relation) # Remove recursion limits for functions whose arguments are also functions. for func in ( + unrolled_map_into_tuple, + unrolled_map_into_target, + unrolled_accumulate_into_tuple, + unrolled_accumulate_into_target, unrolled_any, unrolled_all, unrolled_foreach, unrolled_map, - unrolled_reduce_without_init, unrolled_reduce, unrolled_mapreduce, + unrolled_accumulate, + unrolled_mapaccumulate, unrolled_filter, unrolled_split, unrolled_flatmap, unrolled_applyat, + lazy_map, ) for method in methods(func) method.recursion_relation = (_...) -> true diff --git a/src/generated_functions.jl b/src/generated_functions.jl new file mode 100644 index 0000000..acd71c6 --- /dev/null +++ b/src/generated_functions.jl @@ -0,0 +1,41 @@ +f_exprs(itr_type) = (:(f(itr[$n])) for n in 1:length_from_type(itr_type)) +@inline @generated unrolled_any(f, itr) = Expr(:||, f_exprs(itr)...) +@inline @generated unrolled_all(f, itr) = Expr(:&&, f_exprs(itr)...) + +@inline unrolled_any(itr) = unrolled_any(identity, itr) +@inline unrolled_all(itr) = unrolled_all(identity, itr) + +function zipped_f_exprs(itr_types) + L = length(itr_types) + N = L == 0 ? 0 : minimum(length_from_type, itr_types) # match minimum_length + return (:(f($((:(itrs[$l][$n]) for l in 1:L)...))) for n in 1:N) +end +@inline @generated unrolled_foreach(f, itrs...) = + Expr(:block, zipped_f_exprs(itrs)..., nothing) +@inline @generated unrolled_map_into_tuple(f, itrs...) = + Expr(:tuple, zipped_f_exprs(itrs)...) + +function nested_op_expr(itr_type, init_type) + N = length_from_type(itr_type) + (N == 0 && init_type == NoInit) && + error("unrolled_reduce requires an init value for empty iterators") + init_expr = init_type == NoInit ? :(itr[1]) : :init + n_range = init_type == NoInit ? (2:N) : (1:N) + return foldl((expr, n) -> :(op($expr, itr[$n])), n_range; init = init_expr) +end +@inline @generated unrolled_reduce(op, itr, init) = nested_op_expr(itr, init) + +function transformed_sequential_op_exprs(itr_type, init_type) + N = length_from_type(itr_type) + (N == 0 && init_type == NoInit) && + error("unrolled_accumulate requires an init value for empty iterators") + init_op_expr = init_type == NoInit ? :(itr[1]) : :(op(init, itr[1])) + transformed_exprs_and_next_op_exprs = + accumulate(1:N; init = (nothing, init_op_expr)) do (_, op_expr), n + var = gensym() + (:($var = $op_expr; transform($var)), :(op($var, itr[$(n + 1)]))) + end + return map(first, transformed_exprs_and_next_op_exprs) +end +@inline @generated unrolled_accumulate_into_tuple(op, itr, init, transform) = + Expr(:tuple, transformed_sequential_op_exprs(itr, init)...) 
diff --git a/src/unrollable_interface.jl b/src/unrollable_interface.jl
new file mode 100644
index 0000000..b0a319c
--- /dev/null
+++ b/src/unrollable_interface.jl
@@ -0,0 +1,158 @@
+#=
+To use an unrollable iterator of type T with this interface, follow these steps:
+- Add a method for length_from_type(T)
+- If every unrolled function that needs to construct an iterator when given an
+  iterator of type T can return a Tuple, stop here; otherwise, to return a
+  non-Tuple iterator when possible, follow these steps:
+    - Add a method for target_output_type(::T) = O
+    - If the output type used for promotion should be some other type O′, add a
+      method for target_output_type_for_promotion(::T) = O′
+    - If an output of type O can be used together with an output of type O′, add
+      a method for output_promote_rule(O, O′)
+    - If an output of type O can only store elements of a certain type, and if
+      it should be replaced with a Tuple when given elements of any other type,
+      add a method for eltype_restriction(O)
+    - If an output of type O can be efficiently constructed from a Tuple, add a
+      method for output_constructor(O)
+    - If an output of type O cannot be efficiently constructed from a Tuple,
+      follow these steps:
+        - Add a method for empty_output(O)
+        - Add a method for every unrolled function that can be optimized to
+          handle outputs of type O
+        - Check that every other unrolled function that needs to construct an
+          iterator can return a Tuple instead of an output of type O
+=#
+
+"""
+    length_from_type(itr_type)
+
+The length of an iterator of type `itr_type`.
+"""
+length_from_type(::Type{<:NTuple{N, Any}}) where {N} = N
+length_from_type(::Type{T}) where {T} = error(
+    "UnrolledUtilities.length_from_type must be defined for the following \
+     type before iterators of this type can be unrolled: $T",
+)
+
+@inline inferred_length(itr) = length_from_type(typeof(itr))
+
+@inline minimum_length(itrs) =
+    length(itrs) == 0 ? 0 : unrolled_mapreduce(inferred_length, min, itrs)
+
+"""
+    target_output_type(itr)
+
+The type of output that unrolled functions should try to generate for the input
+iterator `itr`. By default, this is set to `Tuple`.
+"""
+target_output_type(_) = Tuple
+
+"""
+    target_output_type_for_promotion(itr)
+
+An alternative to `target_output_type(itr)` used within the promotion process.
+This type of output might not be able to store all of the elements in `itr`
+(i.e., it might have an `eltype_restriction` that makes it incompatible with
+`itr`), but lazy wrappers around `itr` could still use it as their output type.
+"""
+target_output_type_for_promotion(itr) = target_output_type(itr)
+
+"""
+    output_promote_rule(output_type1, output_type2)
+
+The type of output that should be generated when two iterators have different
+target output types, or `Union{}` if these output types should not be combined.
+Only one method of `output_promote_rule` needs to be defined for any given pair
+of output types.
+
+The built-in promotion rules are
+[`LazySequence`](@ref UnrolledUtilities.LazySequence) <
+[`BitSequence`](@ref UnrolledUtilities.BitSequence) < `Tuple` < `Any`. By
+default, the result for any other pair of distinct output types is `Union{}`.
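+
+For example, the method defined in BitSequence.jl that lets `BitSequence`
+outputs be replaced by any other output type is
+`output_promote_rule(::Type{B}, ::Type{O}) where {B <: BitSequence, O} = O`.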
+""" +output_promote_rule(::Type, ::Type) = Union{} +output_promote_rule(::Type{O}, ::Type{O}) where {O} = O +output_promote_rule(::Type{O}, ::Type{Tuple}) where {O} = O + +@inline function output_promote_result(O1, O2) + O12 = output_promote_rule(O1, O2) + O21 = output_promote_rule(O2, O1) + O12 == O21 == Union{} && + error("output_promote_rule is undefined for types $O1 and $O2") + (O12 == O21 || O21 == Union{}) && return O12 + O12 == Union{} && return O21 + error("output_promote_rule yields inconsistent results for types $O1 \ + and $O2: $O12 for ($O1, $O2) and $O21 for ($O2, $O1)") +end + +@inline promoted_target_output_type(itrs) = + length(itrs) == 0 ? Tuple : # Make an empty Tuple when given 0 iterators. + unrolled_mapreduce( + target_output_type_for_promotion, + output_promote_result, + itrs, + ) + +""" + eltype_restriction(output_type) + +The most general element type that can be stored in an output of type +`output_type`. By default, this is assumed to be `Any`. + +If `output_promote_rule` specifies that a particular type of output should be +constructed, but that type cannot store all of the elements that need to be in +the output, a `Tuple` will be generated instead of the promoted output type. +""" +eltype_restriction(::Type) = Any + +@inline function output_type_of_map(f, itrs...) + output_type = promoted_target_output_type(itrs) + allowed_eltype = eltype_restriction(output_type) + allowed_eltype == Any && return output_type + result_eltype = Base.promote_op(f, unrolled_map_into_tuple(eltype, itrs)...) + return result_eltype <: allowed_eltype ? output_type : Tuple +end + +@inline function output_type_of_accumulate(op, itr, init, transform) + output_type = target_output_type_for_promotion(itr) + allowed_eltype = eltype_restriction(output_type) + allowed_eltype == Any && return output_type + first_value_type = init isa NoInit ? eltype(itr) : typeof(init) + untransformed_type = Base.promote_op(op, first_value_type, eltype(itr)) + result_eltype = Base.promote_op(transform, untransformed_type) + return result_eltype <: allowed_eltype ? output_type : Tuple +end + +""" + output_constructor(output_type) + +A function that can be used to efficiently construct an output of type +`output_type` from a `Tuple`, or `identity` if such an output should not be +constructed from a `Tuple`. By default, this is set to `identity`, which also +handles the case where `output_type` is already `Tuple`. + +Many unrollable iterators (e.g., `SVector`s) are essentially wrappers for +`Tuple`s, and their constructors for `Tuple`s can be reduced to no-ops. The main +exceptions are [`StaticSequence`](@ref UnrolledUtilities.StaticSequence)s, which +do not provide constructors for `Tuple`s because there is no performance benefit +to converting a high-storage data structure into a low-storage data structure +after it has already been constructed. +""" +output_constructor(::Type) = identity + +@inline inferred_constructor(itr) = output_constructor(target_output_type(itr)) + +@inline promoted_constructor(itrs) = + output_constructor(promoted_target_output_type(itrs)) + +""" + empty_output(output_type) + +An empty output of type `output_type`. By default, this applies the +`output_constructor` for `output_type` to an empty `Tuple`. 
+""" +@inline empty_output(output_type) = output_constructor(output_type)(()) + +@inline inferred_empty(itr) = empty_output(target_output_type(itr)) + +@inline promoted_empty(itrs) = empty_output(promoted_target_output_type(itrs)) diff --git a/test/aqua.jl b/test/aqua.jl index d7becf1..ff1edd1 100644 --- a/test/aqua.jl +++ b/test/aqua.jl @@ -1,3 +1,4 @@ +using Test import Aqua, UnrolledUtilities # This is separate from all the other tests because Aqua.test_all checks for diff --git a/test/test_and_analyze.jl b/test/test_and_analyze.jl index 70415e3..5006e21 100644 --- a/test/test_and_analyze.jl +++ b/test/test_and_analyze.jl @@ -18,30 +18,43 @@ function print_comparison_table(io = stdout, generate_html = false) generate_html ? HtmlHighlighter(f, HtmlDecoration(; color)) : Highlighter(f, Crayon(; foreground = Symbol(color))) - better_performance_but_harder_to_compile = - highlighter(generate_html ? "royalblue" : "blue") do data, i, j - data[i, 4] != data[i, 5] && - (endswith(data[i, 6], "slower") || endswith(data[i, 7], "more")) + better_performance_but_worse_compilation_or_allocations = + highlighter(generate_html ? "royalblue" : "blue") do data, i, _ + ( + contains(data[i, 4], "better") && + !contains(data[i, 5], "more") || contains(data[i, 5], "less") + ) && (contains(data[i, 6], "more") || contains(data[i, 7], "more")) end better_performance = - highlighter(generate_html ? "mediumseagreen" : "green") do data, i, j - data[i, 4] != data[i, 5] + highlighter(generate_html ? "mediumseagreen" : "green") do data, i, _ + contains(data[i, 4], "better") && !contains(data[i, 5], "more") || + contains(data[i, 5], "less") end - mixed_compilation = - highlighter(generate_html ? "mediumorchid" : "magenta") do data, i, j - (endswith(data[i, 6], "slower") && endswith(data[i, 7], "less")) || - (endswith(data[i, 6], "faster") && endswith(data[i, 7], "more")) + similar_performance_but_mixed_compilation_and_allocations = + highlighter(generate_html ? "mediumorchid" : "magenta") do data, i, _ + contains(data[i, 5], "similar") && ( + contains(data[i, 6], "more") && contains(data[i, 7], "less") || + contains(data[i, 6], "less") && contains(data[i, 7], "more") + ) + end + worse_performance_or_compilation_or_allocations = + highlighter(generate_html ? "indianred" : "red") do data, i, _ + contains(data[i, 5], "more") || + contains(data[i, 6], "more") || + contains(data[i, 7], "more") end - harder_to_compile = - highlighter(generate_html ? "indianred" : "red") do data, i, j - endswith(data[i, 6], "slower") || endswith(data[i, 7], "more") + better_compilation_or_allocations = + highlighter(generate_html ? "darkturquoise" : "cyan") do data, i, _ + contains(data[i, 6], "less") || contains(data[i, 7], "less") end - easier_to_compile = - highlighter(generate_html ? "darkturquoise" : "cyan") do data, i, j - endswith(data[i, 6], "faster") || endswith(data[i, 7], "less") + no_measurable_difference = + highlighter(generate_html ? "khaki" : "yellow") do data, i, _ + @assert contains(data[i, 4], "similar") && + contains(data[i, 5], "similar") && + contains(data[i, 6], "similar") && + contains(data[i, 7], "similar") + true end - no_difference = - highlighter((data, i, j) -> true, generate_html ? "khaki" : "yellow") other_kwargs = generate_html ? 
@@ -54,7 +67,7 @@ function print_comparison_table(io = stdout, generate_html = false)
         ) :
         (;
             title_same_width_as_table = true,
-            columns_width = [45, 45, 0, 0, 0, 0, 0],
+            columns_width = [45, 45, 30, 25, 20, 20, 30],
             linebreaks = true,
            autowrap = true,
             crop = :none,
@@ -70,23 +83,66 @@ function print_comparison_table(io = stdout, generate_html = false)
             "Unrolled Expression",
             "Reference Expression",
             "Iterator Contents",
-            "Unrolled Performance",
-            "Reference Performance",
-            "Unrolled Compilation Time",
-            "Unrolled Compilation Memory",
+            "Optimization",
+            "Run Time",
+            "Compilation Time",
+            any(contains('['), table_data[:, 7]) ?
+            "Total GC [and RSS] Allocations" : "Total Allocations",
         ],
         highlighters = (
-            better_performance_but_harder_to_compile,
+            better_performance_but_worse_compilation_or_allocations,
             better_performance,
-            mixed_compilation,
-            harder_to_compile,
-            easier_to_compile,
-            no_difference,
+            similar_performance_but_mixed_compilation_and_allocations,
+            worse_performance_or_compilation_or_allocations,
+            better_compilation_or_allocations,
+            no_measurable_difference,
        ),
         other_kwargs...,
     )
 end
 
+function time_string(nanoseconds)
+    nanoseconds == 0 && return "$nanoseconds ns"
+    n_decimal_digits = floor(Int, log10(nanoseconds) + 1)
+    return if n_decimal_digits <= 3
+        "$nanoseconds ns"
+    elseif n_decimal_digits <= 6
+        "$(nanoseconds ÷ 10^3) μs"
+    elseif n_decimal_digits <= 9
+        "$(nanoseconds ÷ 10^6) ms"
+    else
+        "$(nanoseconds ÷ 10^9) s"
+    end
+end
+
+function memory_string(bytes)
+    bytes == 0 && return "$bytes B"
+    n_binary_digits = floor(Int, log2(bytes) + 1)
+    return if n_binary_digits <= 10
+        "$bytes B"
+    elseif n_binary_digits <= 20
+        "$(bytes ÷ 2^10) kB"
+    elseif n_binary_digits <= 30
+        "$(bytes ÷ 2^20) MB"
+    else
+        "$(bytes ÷ 2^30) GB"
+    end
+end
+
+function comparison_string(value1, value2, to_string, to_number = identity)
+    ratio = to_number(value1) / to_number(value2)
+    summary_str = if ratio >= 1.5
+        rounded_ratio = ratio == Inf ? Inf : round(Int, ratio)
+        "$rounded_ratio times more"
+    elseif inv(ratio) >= 1.5
+        rounded_inv_ratio = ratio == 0 ? Inf : round(Int, inv(ratio))
+        "$rounded_inv_ratio times less"
+    else
+        "similar"
+    end
+    return "$summary_str ($(to_string(value1)) vs. $(to_string(value2)))"
+end
+
 function drop_line_numbers(expr)
     expr isa Expr || return expr
     new_args = map(drop_line_numbers, expr.args)
@@ -118,7 +174,35 @@ function code_instance(f, args...)
     end
 end
 
-macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str)
+macro benchmark(expression)
+    return quote
+        prev_time = time_ns()
+        $(esc(expression))
+        new_time = time_ns()
+        best_time = new_time - prev_time
+
+        # Benchmark for at most 0.1 s (10^8 ns), ignoring the first call above.
+        n_trials = 0
+        start_time = new_time
+        while n_trials < 10^4 && new_time - start_time < 10^8
+            prev_time = time_ns()
+            $(esc(expression))
+            new_time = time_ns()
+            best_time = min(best_time, new_time - prev_time)
+            n_trials += 1
+        end
+
+        best_time
+    end
+end
+
+macro test_unrolled(
+    args_expr,
+    unrolled_expr,
+    reference_expr,
+    contents_info_str,
+    skip_allocations_test = false,
+)
     @assert Meta.isexpr(args_expr, :tuple)
     arg_names = args_expr.args
     @assert all(arg_name -> arg_name isa Symbol, arg_names)
@@ -146,26 +230,27 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str)
             reference_func_and_nothing($(args...))
 
             # Test for allocations.
-            @test (@allocated unrolled_func_and_nothing($(args...))) == 0
+            is_unrolled_non_allocating =
+                (@allocated unrolled_func_and_nothing($(args...))) == 0
             is_reference_non_allocating =
                 (@allocated reference_func_and_nothing($(args...))) == 0
+            $(esc(skip_allocations_test)) || @test is_unrolled_non_allocating
 
             # Test for type-stability.
             @test_opt unrolled_func($(args...))
             is_reference_stable =
                 isempty(JET.get_reports(@report_opt reference_func($(args...))))
 
-            unrolled_instance = code_instance(unrolled_func, $(args...))
-            reference_instance = code_instance(reference_func, $(args...))
-
             # Test for constant propagation.
-            is_unrolled_const = isdefined(unrolled_instance, :rettype_const)
+            is_unrolled_const =
+                isdefined(code_instance(unrolled_func, $(args...)), :rettype_const)
             Base.issingletontype(typeof(($(args...),))) && @test is_unrolled_const
-            is_reference_const = isdefined(reference_instance, :rettype_const)
+            is_reference_const =
+                isdefined(code_instance(reference_func, $(args...)), :rettype_const)
 
             buffer = IOBuffer()
 
-            # Check whether the functions are fully optimized out.
+            # Determine whether the functions are fully optimized out.
             args_type = Tuple{map(typeof, ($(args...),))...}
             code_llvm(buffer, unrolled_func, args_type; debuginfo = :none)
             is_unrolled_optimized_out =
@@ -174,86 +259,105 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str)
             is_reference_optimized_out =
                 length(split(String(take!(buffer)), '\n')) == 5
 
+            # Test the overall level of optimization.
+            unrolled_opt_str, unrolled_opt_score = if !is_unrolled_non_allocating
+                "allocating", 1
+            elseif !is_unrolled_const && !is_unrolled_optimized_out
+                "type-stable", 3
+            elseif !is_unrolled_optimized_out
+                "constant", 4
+            else
+                "optimized out", 5
+            end
+            reference_opt_str, reference_opt_score = if !is_reference_non_allocating
+                "allocating", 1
+            elseif !is_reference_stable
+                "type-unstable", 2
+            elseif !is_reference_const && !is_reference_optimized_out
+                "type-stable", 3
+            elseif !is_reference_optimized_out
+                "constant", 4
+            else
+                "optimized out", 5
+            end
+            @test unrolled_opt_score >= reference_opt_score
+
+            # Measure the run times.
+            unrolled_run_time = @benchmark unrolled_func($(args...))
+            reference_run_time = @benchmark reference_func($(args...))
+
+            # Measure the compilation times and memory usages in separate processes
+            # to ensure that they are not under-counted.
             arg_name_strs = ($(map(string, arg_names)...),)
             arg_names_str = join(arg_name_strs, ", ")
             arg_definition_strs =
                 map((name, value) -> "$name = $value", arg_name_strs, ($(args...),))
             arg_definitions_str = join(arg_definition_strs, '\n')
-            unrolled_command_str = """
+            command_str(func_str) = """
                 using UnrolledUtilities
-                unrolled_func($arg_names_str) = $($(string(unrolled_expr)))
+                func_and_nothing($arg_names_str) = ($func_str; nothing)
                 $arg_definitions_str
-                stats1 = @timed unrolled_func($arg_names_str)
-                stats2 = @timed unrolled_func($arg_names_str)
-                print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes)
-                """
-            reference_command_str = """
-                reference_func($arg_names_str) = $($(string(reference_expr)))
-                $arg_definitions_str
-                stats1 = @timed reference_func($arg_names_str)
-                stats2 = @timed reference_func($arg_names_str)
-                print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes)
+                begin # The following block is based on @time from "base/timing.jl".
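+                    # Compile timing is enabled around the first (forced) call,
+                    # so the deltas below isolate compilation in this process.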
+                    Base.Experimental.@force_compile
+                    Base.cumulative_compile_timing(true)
+                    time_ns_1 = Base.cumulative_compile_time_ns()[1]
+                    rss_bytes_1 = Sys.maxrss()
+                    Δgc_bytes = @allocated func_and_nothing($arg_names_str)
+                    rss_bytes_2 = Sys.maxrss()
+                    time_ns_2 = Base.cumulative_compile_time_ns()[1]
+                    Base.cumulative_compile_timing(false)
+                end
+                Δtime_ns = time_ns_2 - time_ns_1
+                Δrss_bytes = rss_bytes_2 - rss_bytes_1
+                print(Δtime_ns, ", ", Δgc_bytes, ", ", Δrss_bytes)
                 """
-
-            # Get the unrolled function's time-to-first-run and its memory usage.
+            unrolled_command_str = command_str($(string(unrolled_expr)))
             run(pipeline(`julia --project -e $unrolled_command_str`, buffer))
-            unrolled_time, unrolled_memory =
-                parse.((Float64, Int), split(String(take!(buffer)), ','))
+            unrolled_compile_time, unrolled_memory_gc, unrolled_memory_rss =
+                parse.((Int, Int, Int), split(String(take!(buffer)), ','))
 
             # Make a new buffer to avoid a potential data race:
             # https://discourse.julialang.org/t/iobuffer-becomes-not-writable-after-run/92323/3
             close(buffer)
             buffer = IOBuffer()
-
-            # Get the reference function's time-to-first-run and its memory usage.
+            reference_command_str = command_str($(string(reference_expr)))
             run(pipeline(`julia --project -e $reference_command_str`, buffer))
-            reference_time, reference_memory =
-                parse.((Float64, Int), split(String(take!(buffer)), ','))
+            reference_compile_time, reference_memory_gc, reference_memory_rss =
+                parse.((Int, Int, Int), split(String(take!(buffer)), ','))
             close(buffer)
 
-            # Record all relevant information in comparison_table_dict.
-            unrolled_performance_str = if !is_unrolled_const
-                "type-stable"
-            elseif !is_unrolled_optimized_out
-                "const return value"
-            else
-                "fully optimized out"
-            end
-            reference_performance_str = if !is_reference_non_allocating
-                "allocating"
-            elseif !is_reference_stable
-                "type-unstable"
-            elseif !is_reference_const
-                "type-stable"
-            elseif !is_reference_optimized_out
-                "const return value"
-            else
-                "fully optimized out"
-            end
-            time_ratio = unrolled_time / reference_time
-            time_ratio_str = if time_ratio >= 1.5
-                "$(round(Int, time_ratio)) times slower"
-            elseif inv(time_ratio) >= 1.5
-                "$(round(Int, inv(time_ratio))) times faster"
-            else
-                "similar"
-            end
-            memory_ratio = unrolled_memory / reference_memory
-            memory_ratio_str = if memory_ratio >= 1.5
-                "$(round(Int, memory_ratio)) times more"
-            elseif inv(memory_ratio) >= 1.5
-                "$(round(Int, inv(memory_ratio))) times less"
-            else
-                "similar"
-            end
+            optimization_str =
+                unrolled_opt_str == reference_opt_str ?
+                "similar ($unrolled_opt_str)" :
+                "better ($unrolled_opt_str vs. $reference_opt_str)"
+            run_time_str = comparison_string(
+                unrolled_run_time,
+                reference_run_time,
+                time_string,
+            )
+            compile_time_str = comparison_string(
+                unrolled_compile_time,
+                reference_compile_time,
+                time_string,
+            )
+            memory_str = comparison_string(
+                (unrolled_memory_gc, unrolled_memory_rss),
+                (reference_memory_gc, reference_memory_rss),
+                ((gc_bytes, rss_bytes),) ->
+                    rss_bytes == 0 ? memory_string(gc_bytes) :
+                    "$(memory_string(gc_bytes)) [$(memory_string(rss_bytes))]",
+                first, # Use GC value for comparison since RSS might be unavailable.
+            )
+            # TODO: Why does Sys.maxrss() seem to always return 0 on Ubuntu systems?
+
             dict_key = ($unrolled_expr_str, $reference_expr_str)
             dict_entry = (
                 $(esc(contents_info_str)),
-                unrolled_performance_str,
-                reference_performance_str,
-                time_ratio_str,
-                memory_ratio_str,
+                optimization_str,
+                run_time_str,
+                compile_time_str,
+                memory_str,
             )
             if dict_key in keys(comparison_table_dict)
                 push!(comparison_table_dict[dict_key], dict_entry)
@@ -342,10 +446,6 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false))
         str,
     )
 
-    @test_unrolled (itr,) unrolled_zip(itr) Tuple(zip(itr)) str
-
-    @test_unrolled (itr,) unrolled_enumerate(itr) Tuple(enumerate(itr)) str
-
     @test_unrolled (itr,) unrolled_in(nothing, itr) (nothing in itr) str
     @test_unrolled (itr,) unrolled_in(itr[1], itr) (itr[1] in itr) str
     @test_unrolled (itr,) unrolled_in(itr[end], itr) (itr[end] in itr) str
@@ -479,19 +579,6 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false))
         str23,
     )
 
-    @test_unrolled(
-        (itr1, itr2),
-        unrolled_zip(itr1, itr2),
-        Tuple(zip(itr1, itr2)),
-        str12,
-    )
-    @test_unrolled(
-        (itr1, itr2, itr3),
-        unrolled_zip(itr1, itr2, itr3),
-        Tuple(zip(itr1, itr2, itr3)),
-        str123,
-    )
-
     # unrolled_product can take several minutes to compile when n is large
     if n <= 33
         @test_unrolled(
@@ -561,3 +648,112 @@ for n in (8, 32, 128)
         end
     end
 end
+
+@testset "Tuple vs. BitSequence for nested reductions" begin
+    for (itr, skip_allocations_test) in (
+        (ntuple(_ -> true, Val(32)), false),
+        (ntuple(_ -> true, Val(33)), true),
+        (BitSequence{256}(true), false),
+        (BitSequence{257}(true), true),
+    )
+        n = length(itr)
+        indices = LazySequence{n}()
+        @test_unrolled(
+            (itr, indices),
+            unrolled_reduce(
+                (itr′, i) -> Base.setindex(itr′, !itr′[i], i),
+                indices;
+                init = itr,
+            ),
+            reduce(
+                (itr′, i) -> Base.setindex(itr′, !itr′[i], i),
+                indices;
+                init = itr,
+            ),
+            "$n bits$(itr isa BitSequence ? " stored in a BitSequence" : "")",
+            skip_allocations_test,
+        )
+        @test_unrolled(
+            (itr, indices),
+            unrolled_reduce(
+                (itr′, i) -> unrolled_reduce(
+                    (itr′′, j) ->
+                        Base.setindex(itr′′, !itr′′[min(i, j)], j),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            reduce(
+                (itr′, i) -> reduce(
+                    (itr′′, j) ->
+                        Base.setindex(itr′′, !itr′′[min(i, j)], j),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            "$n bits$(itr isa BitSequence ? " stored in a BitSequence" : "")",
+            skip_allocations_test,
+        )
+        length(itr) == 257 || @test_unrolled(
+            (itr, indices),
+            unrolled_reduce(
+                (itr′, i) -> unrolled_reduce(
+                    (itr′′, j) -> unrolled_reduce(
+                        (itr′′′, k) -> Base.setindex(
+                            itr′′′,
+                            !itr′′′[min(i, j, k)],
+                            k,
+                        ),
+                        indices;
+                        init = itr′′,
+                    ),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            reduce(
+                (itr′, i) -> reduce(
+                    (itr′′, j) -> reduce(
+                        (itr′′′, k) -> Base.setindex(
+                            itr′′′,
+                            !itr′′′[min(i, j, k)],
+                            k,
+                        ),
+                        indices;
+                        init = itr′′,
+                    ),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            "$n bits$(itr isa BitSequence ? " stored in a BitSequence" : "")",
+            skip_allocations_test,
+        ) # Skip this test for BitSequence{257} because it allocates over 2 GB.
+    end
+end
+
+# We cannot generate an unrolled function with more than 8187 calls to getindex.
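+# (Past this limit, code generation fails with an error about running out of
+# "gc handles", which the tests below confirm.)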
+@testset "maximum unrollable N" begin
+    itr = LazySequence{8187}()
+    @test_unrolled(
+        (itr,),
+        unrolled_mapreduce(abs2, +, itr),
+        mapreduce(abs2, +, itr),
+        "8187 integers stored in a LazySequence",
+    )
+
+    @test_throws "gc handles" unrolled_mapreduce(abs2, +, LazySequence{8188}())
+    @test_throws "gc handles" unrolled_mapreduce(
+        abs2,
+        +,
+        ntuple(identity, Val(8188)),
+    )
+end