diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 919b89c..2f1832d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,12 +16,18 @@ jobs: matrix: version: - '1.10' + - '~1.11.0-0' os: - ubuntu-latest - macOS-latest - windows-latest arch: - x64 + exclude: + - version: '~1.11.0-0' + os: macOS-latest # JET crashes on one unit test and hangs on another + - version: '~1.11.0-0' + os: windows-latest # JET crashes on 3 unit tests steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v1 diff --git a/Project.toml b/Project.toml index 62f5f0a..2dd7065 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,17 @@ name = "UnrolledUtilities" uuid = "0fe1646c-419e-43be-ac14-22321958931b" authors = ["CliMA Contributors "] -version = "0.1.2" +version = "0.1.3" [compat] julia = "1.10" +StaticArrays = "1" + +[weakdeps] +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + +[extensions] +UnrolledUtilitiesStaticArraysExt = "StaticArrays" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/README.md b/README.md index 29346ae..8f49b68 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- + Shows the logo of UnrolledUtilities.jl diff --git a/docs/Project.toml b/docs/Project.toml index 5ea11ab..5daf06b 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -5,3 +5,4 @@ JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +UnrolledUtilities = "0fe1646c-419e-43be-ac14-22321958931b" diff --git a/docs/make.jl b/docs/make.jl index ee728d5..e702add 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,28 +2,37 @@ using Documenter include(joinpath("..", "test", "test_and_analyze.jl")) -comparison_table_file = joinpath("docs", "src", "comparison_table.md") - -open(comparison_table_file, "w") do io - println(io, "# Comparison Table\n```@raw html") - println(io, "
") # use 80% of viewport - print_comparison_table(io, true) - println(io, "
") - println(io, "```") +comparison_tables_file = joinpath("docs", "src", "comparison_tables.md") +preamble_file = joinpath("docs", "src", "comparison_tables_preamble.md") +cp(preamble_file, comparison_tables_file; force = true) +open(comparison_tables_file, "a") do io + for (title, comparison_table_dict) in comparison_table_dicts + print_comparison_table(title, comparison_table_dict, io) + end end makedocs(; sitename = "UnrolledUtilities.jl", modules = [UnrolledUtilities], - pages = ["Home" => "index.md", "Comparison Table" => "comparison_table.md"], + pages = [ + "Home" => "index.md", + "Introduction" => "introduction.md", + "User Guide" => "user_guide.md", + "Developer Guide" => "developer_guide.md", + "Comparison Tables" => basename(comparison_tables_file), + ], format = Documenter.HTML( prettyurls = get(ENV, "CI", nothing) == "true", - size_threshold_ignore = ["comparison_table.md"], + sidebar_sitename = false, + size_threshold_ignore = [ + "introduction.md", + basename(comparison_tables_file), + ], ), clean = true, ) -rm(comparison_table_file) +rm(comparison_tables_file) deploydocs( repo = "github.com/CliMA/UnrolledUtilities.jl.git", diff --git a/logo-white.svg b/docs/src/assets/logo-dark.svg similarity index 99% rename from logo-white.svg rename to docs/src/assets/logo-dark.svg index 2daf34e..c8c43ac 100644 --- a/logo-white.svg +++ b/docs/src/assets/logo-dark.svg @@ -7,7 +7,7 @@ width="1567.9242" height="279.37802" viewBox="0 0 1567.9242 279.37802" - sodipodi:docname="logo-white.svg" + sodipodi:docname="logo-dark.svg" inkscape:version="1.2.2 (b0a8486541, 2022-12-01)" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" diff --git a/docs/src/assets/logo.svg b/docs/src/assets/logo.svg new file mode 100644 index 0000000..f02f78e --- /dev/null +++ b/docs/src/assets/logo.svg @@ -0,0 +1,131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + UnrolledUtilities.jl + + diff --git 
a/docs/src/comparison_tables_preamble.md b/docs/src/comparison_tables_preamble.md new file mode 100644 index 0000000..bd34f5d --- /dev/null +++ b/docs/src/comparison_tables_preamble.md @@ -0,0 +1,61 @@ +The following autogenerated tables contain a representative set of potential use +cases for this package, along with measurements that summarize each case's +performance, compilation, and memory usage: +- run time (best of several trial measurements) +- compilation time (as reported by the compiler) +- overall level of optimization (type stability, constant propagation, etc.) and + allocations during run time (as reported by the garbage collector) +- total allocations during compilation and first run (as reported by the garbage + collector and, when possible, the Julia process's resident set size estimator) + +The rows of the tables are highlighted as follows: +- ```@raw html + light blue + ``` + indicates better optimization and either an improvement or no change in run + time, compilation time, and total allocations +- ```@raw html + dark blue + ``` + indicates better optimization and either slower run time, slower compilation, + or more total allocations +- ```@raw html + green + ``` + indicates similar optimization, either faster run time or fewer allocations + during run time, and either an improvement or no change in compilation time + and total allocations +- ```@raw html + yellow + ``` + indicates similar optimization, either faster run time or fewer allocations + during run time, and either slower compilation or more total allocations +- ```@raw html + magenta + ``` + indicates no change in performance and either an improvement or no change in + compilation time and total allocations +- ```@raw html + light gray + ``` + indicates no change in performance and either faster compilation with more + total allocations or slower compilation with fewer total allocations +- ```@raw html + dark gray + ``` + indicates no change in performance, compilation time, or 
total allocations +- ```@raw html + red + ``` + indicates a deterioration in performance, or no change in performance and + either slower compilation or more total allocations + +Rows highlighted in gray present no clear advantage to loop unrolling, while +those highlighted in red present a clear disadvantage. It is recommended that +you only call unrolled functions when your use case is similar to a row in one +of the remaining categories, each of which demonstrates some advantage to loop +unrolling. + +The tables are also printed out by this package's test suite, so they can be +compared across different operating systems by consulting the +[CI pipeline](https://github.com/CliMA/UnrolledUtilities.jl/actions/workflows/ci.yml). diff --git a/docs/src/developer_guide.md b/docs/src/developer_guide.md new file mode 100644 index 0000000..9ec99cd --- /dev/null +++ b/docs/src/developer_guide.md @@ -0,0 +1,96 @@ +```@meta +CurrentModule = UnrolledUtilities +``` + +## How to Unroll + +There are two general ways to implement loop unrolling in Julia—recursively +splatting iterator contents and manually generating unrolled expressions. For +example, the recursively unrolled version of `foreach` is + +```julia +unrolled_foreach(f, itr) = _unrolled_foreach(f, itr...) +_unrolled_foreach(f) = nothing +_unrolled_foreach(f, item, items...) = (f(item); _unrolled_foreach(f, items...)) +``` + +In contrast, the generatively unrolled version of `foreach` is + +```julia +unrolled_foreach(f, itr) = _unrolled_foreach(Val(length(itr)), f, itr) +@generated _unrolled_foreach(::Val{N}, f, itr) where {N} = + Expr(:block, (:(f(generic_getindex(itr, $n))) for n in 1:N)..., nothing) +``` + +To switch between recursive and generative unrolling, this package defines the +following function: + +```@docs +rec_unroll +``` + +!!! 
tip "Tip" + Recursive loop unrolling can be disabled globally with the following + function redefinition: + + ```julia + rec_unroll(itr) = false + ``` + +The cutoff length of 16 for switching to generative unrolling is motivated by +the benchmarks for [Generative vs. Recursive Unrolling](@ref). + +## Interface API + +The functions exported by this package can be used with any statically sized +iterators, as long as those iterators make appropriate use of the following +interface: + +```@docs +generic_getindex +output_type_for_promotion +AmbiguousOutputType +NoOutputType +ConditionalOutputType +output_promote_rule +constructor_from_tuple +empty_output +``` + +## How to Use the Interface + +To unroll over a statically sized iterator of some user-defined type `T`, follow +these steps: +- To enable recursive unrolling, add a method for `iterate(::T, [state])` +- To enable generative unrolling, add a method for `getindex(::T, n)` (or for + `generic_getindex(::T, n)` if `getindex` should not be defined for iterators + of type `T`) +- If every unrolled function that needs to construct an iterator when given an + iterator of type `T` can return a `Tuple` instead, stop here +- Otherwise, to return a non-`Tuple` iterator whenever it is efficient to do so, + follow these steps: + - Add a method for `output_type_for_promotion(::T) = O`, where `O` can be + `T`, a supertype of `T`, some other `Type`, or an `AmbiguousOutputType` + - If an iterator whose output type is `O` can be used together with an + iterator whose output type is `O′`, add a method for + `output_promote_rule(O, O′)` + - If `O` is a `NoOutputType`, stop here + - Otherwise, to handle the unambiguous output type `U` that underlies `O` + (where `U` is equivalent to `O` unless `O` is a `ConditionalOutputType`), + follow these steps: + - If an iterator of type `U` can be efficiently constructed from a + `Tuple`, add a method for `constructor_from_tuple(U)` + - Otherwise, for each of the following functions, add a 
method if it can + be implemented to construct an iterator of type `U` without first + storing the iterator's contents in a `Tuple`: + - `empty_output(U)` + - `unrolled_map_into(U, f, itr)` + - `unrolled_accumulate_into(U, op, itr, init, transform)` + - `unrolled_push_into(U, itr, item)` + - `unrolled_append_into(U, itr1, itr2)` + - `unrolled_take_into(U, itr, val_N)` + - `unrolled_drop_into(U, itr, val_N)` + +!!! note "Note" + When a relevant method for the interface is not defined, unrolled functions + will typically fall back to using `Tuple`s instead of other iterator types. diff --git a/docs/src/index.md b/docs/src/index.md index 8aaec38..c317460 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,75 +1,87 @@ +```@setup inference_test +using UnrolledUtilities +``` + # UnrolledUtilities.jl -A collection of generated functions in which all loops are unrolled and inlined: -- `unrolled_any(f, itr)`: similar to `any` -- `unrolled_all(f, itr)`: similar to `all` -- `unrolled_foreach(f, itrs...)`: similar to `foreach` -- `unrolled_map(f, itrs...)`: similar to `map` -- `unrolled_reduce(op, itr; [init])`: similar to `reduce` -- `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce` -- `unrolled_zip(itrs...)`: similar to `zip` -- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to - handle multiple iterators -- `unrolled_in(item, itr)`: similar to `in` -- `unrolled_unique(itr)`: similar to `unique` -- `unrolled_filter(f, itr)`: similar to `filter` -- `unrolled_split(f, itr)`: similar to `(filter(f, itr), filter(!f, itr))`, but - without duplicate calls to `f` -- `unrolled_flatten(itr)`: similar to `Iterators.flatten` -- `unrolled_flatmap(f, itrs...)`: similar to `Iterators.flatmap` -- `unrolled_product(itrs...)`: similar to `Iterators.product` -- `unrolled_applyat(f, n, itrs...)`: similar to `f(map(itr -> itr[n], itrs)...)` -- `unrolled_take(itr, ::Val{N})`: similar to `itr[1:N]` (and to - `Iterators.take`), but with `N` 
wrapped in a `Val` -- `unrolled_drop(itr, ::Val{N})`: similar to `itr[(N + 1):end]` (and to -  `Iterators.drop`), but with `N` wrapped in a `Val` +A toolkit for low-level optimization of Julia code in which iterator sizes are +known during compilation. + +This package can be used with all *statically sized* iterators (`Tuple`s, +`NamedTuple`s, [`StaticArray`s](https://github.com/JuliaArrays/StaticArrays.jl), +etc.), including ones that are very long or ones that have elements of different +types, both of which are cases that Julia's standard library often handles +inefficiently. For example, the standard library function `in` performs worse +than this package's `unrolled_in` for `Tuple`s with elements of different types: + +```@repl inference_test +@allocated () in ((1, 2), (1, 2, 3)) +@allocated unrolled_in((), ((1, 2), (1, 2, 3))) +``` -These functions are guaranteed to be type-stable whenever they are given -iterators with inferrable lengths and element types, including when -- the iterators have many elements (e.g., more than 32, which is when `map`, -  `reduce`, and `mapreduce` tend to stop getting compiled efficiently) -- the iterators have nonuniform element types (most functions from `Base` and -  `Base.Iterators` tend to encounter type-instabilities and allocations when -  this is the case, especially when there are more than 32 elements) -- `f` and/or `op` recursively call the function to which they are passed, up to -  an arbitrarily large recursion depth (e.g., if `f` calls `map(f, itrs)`, it -  will be type-unstable when the recursion depth exceeds 2, but this will not be -  the case with `unrolled_map`) +The [loop unrolling](https://en.wikipedia.org/wiki/Loop_unrolling) automatically +performed by this package offers the following benefits for statically sized +iterators: +- better support for *static compilation* +  - compilation of [executables](https://github.com/tshort/StaticCompiler.jl) +  - compilation of [GPU kernels](https://github.com/JuliaGPU/CUDA.jl) 
+- better performance (usually) + - reduced run times + - reduced memory footprints while code is running +- better compilation efficiency (occasionally) + - reduced compilation times + - reduced memory footprints while code is compiling -In addition, these functions have been written in a way that makes them very -likely to get fully optimized out through constant propagation when the -iterators have singleton element types (and when the result of calling `f` -and/or `op` on these elements is inferrable). However, they can also be much -more expensive to compile than their counterparts from `Base` and -`Base.Iterators`, in which case they should not be used unless there is a clear -performance benefit. Some notable exceptions to this are `unrolled_zip`, -`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than -`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation. +To find out more about loop unrolling and when it is useful, see the +[Introduction](introduction.md). 
+ +## Package Features + +This package exports a number of analogues to functions from `Base` and +`Base.Iterators`, each of which has been optimized for statically sized +iterators (in terms of both performance and compilation time): +- `unrolled_any(f, itr)`—similar to `any` +- `unrolled_all(f, itr)`—similar to `all` +- `unrolled_foreach(f, itrs...)`—similar to `foreach` +- `unrolled_map(f, itrs...)`—similar to `map` +- `unrolled_reduce(op, itr; [init])`—similar to `reduce` +- `unrolled_mapreduce(f, op, itrs...; [init])`—similar to `mapreduce` +- `unrolled_accumulate(op, itr; [init], [transform])`—similar to `accumulate`, + but with a `transform` that can be applied to every value in the output +- `unrolled_push(itr, item)`—similar to `push!`, but non-mutating +- `unrolled_append(itr1, itr2)`—similar to `append!`, but non-mutating +- `unrolled_take(itr, ::Val{N})`—similar to `Iterators.take` (i.e., `itr[1:N]`), + but with `N` wrapped in a `Val` +- `unrolled_drop(itr, ::Val{N})`—similar to `Iterators.drop` (i.e., + `itr[(N + 1):end]`), but with `N` wrapped in a `Val` +- `unrolled_in(item, itr)`—similar to `in` +- `unrolled_unique(itr)`—similar to `unique` +- `unrolled_filter(f, itr)`—similar to `filter` +- `unrolled_flatten(itr)`—similar to `Iterators.flatten` +- `unrolled_flatmap(f, itrs...)`—similar to `Iterators.flatmap` +- `unrolled_product(itrs...)`—similar to `Iterators.product` + +In addition, this package exports two functions that do not have public +analogues in `Base` or `Base.Iterators`: +- `unrolled_applyat(f, n, itrs...)`—similar to `f(itrs[1][n], itrs[2][n], ...)`, + but with a `Core.Const` index in every call to `getindex` +- `unrolled_split(f, itr)`—similar to `(filter(f, itr), filter(!f, itr))`, but + without duplicate calls to `f` -For a more precise indication of whether you should use `UnrolledUtilities`, -please consult the autogenerated [Comparison Table](@ref). 
This table contains a -comprehensive set of potential use cases, each with a measurement of performance -optimization, the time required for compilation, and the memory usage during -compilation. Most cases involve simple functions `f` and/or `op`, but the last -few demonstrate the benefits of unrolling with non-trivial recursive functions. +These unrolled functions are compatible with the following types of iterators: +- statically sized iterators from `Base` (e.g., `Tuple` and `NamedTuple`) +- statically sized iterators from `StaticArrays` (e.g., `SVector` and `MVector`) +- lazy iterators from `Base` (e.g., the results of `enumerate`, `zip`, + `Iterators.map`, and generator expressions) that are used as wrappers for + statically sized iterators -The rows of the table are highlighted as follows: -- green indicates an improvement in performance and either no change in - compilation or easier compilation (i.e., either similar or smaller values of - compilation time and memory usage) -- dark blue indicates an improvement in performance and harder compilation - (i.e., larger values of compilation time and/or memory usage) -- light blue indicates no change in performance and easier compilation -- yellow indicates no change in performance and no change in compilation -- magenta indicates no change in performance, an increase in compilation time, - and a decrease in compilation memory usage -- red indicates no change in performance and harder compilation +They are also compatible with two new types of statically sized iterators +exported by this package: +- `StaticOneTo`—similar to `Base.OneTo` +- `StaticBitVector`—similar to `BitVector` -Rows highlighted in green and blue present a clear advantage for unrolling, -whereas those highlighted in yellow, magenta, and red either have no clear -advantage, or they have a clear disadvantage. It is recommended that you only -unroll when your use case is similar to a row in the first category. 
+See the [User Guide](@ref "When to Use StaticOneTo and StaticBitVector") for +additional information about these new types of iterators. -The table is also printed out by this package's unit tests, so these -measurements can be compared across different operating systems by checking the -[CI pipeline](https://github.com/CliMA/UnrolledUtilities.jl/actions/workflows/ci.yml). +See the [Developer Guide](@ref "How to Use the Interface") to learn how +user-defined iterator types can be made compatible with unrolled functions. diff --git a/docs/src/introduction.md b/docs/src/introduction.md new file mode 100644 index 0000000..9db926a --- /dev/null +++ b/docs/src/introduction.md @@ -0,0 +1,277 @@ +```@setup inference_test +using UnrolledUtilities, InteractiveUtils, Test +``` + +```@setup fake_inference_test +macro code_warntype(_...) nothing end +macro code_llvm(_...) nothing end +``` + +```@raw html + +``` + +## Motivation for Loop Unrolling + +Although the iteration utilities in `Base` and `Base.Iterators` are sufficiently +performant for most common use cases, those who choose to dive into the world of +low-level optimization will often discover +[type instabilities](https://docs.julialang.org/en/v1/manual/faq/#man-type-stability) +in unexpected situations. Here is a particularly simple example: + +```@repl inference_test +Test.@inferred map(one, Tuple(1:31)); +Test.@inferred map(one, Tuple(1:32)); +``` + +This type instability is present in all `map`s over iterators with lengths +greater than 31, regardless of whether they are statically sized. As with most +type instabilities in Julia, this leads to memory allocations every time `map` +is called with sufficiently long iterators. 
+ +[`Test.@inferred`](https://docs.julialang.org/en/v1/stdlib/Test/#Test.@inferred) +is helpful for checking whether the return type of a function call is stable, +but looking directly at the generated [LLVM](https://llvm.org/docs/LangRef.html) +code reveals just how different the two function calls above are: + +```@repl inference_test +@code_llvm debuginfo=:none map(one, Tuple(1:31)) +``` +```@raw html +
+``` +```@repl fake_inference_test +@code_llvm debuginfo=:none map(one, Tuple(1:32)) +``` +```@raw html + +``` +```@repl inference_test +@code_llvm debuginfo=:none map(one, Tuple(1:32)) # hide +``` +```@raw html +

+``` + +The type instability (and all of the resulting LLVM code complexity) in the +second function call can be eliminated by replacing `map` with `unrolled_map`: + +```@repl inference_test +Test.@inferred unrolled_map(one, Tuple(1:32)); +@code_llvm debuginfo=:none unrolled_map(one, Tuple(1:32)) +``` + +The minimum iterator length for type instability is not always 32; for instance, +it can also be 14: + +```@repl inference_test +first_11(itr) = itr[1:11] +Test.@inferred first_11(Tuple(1:13)); +Test.@inferred first_11(Tuple(1:14)); +``` + +!!! note "Note" + ##### *Why is the function definition needed in this example?* + + On the first line of the example above, `[1:11]` is enclosed in a function + so that it does not get evaluated in global scope. This turns the range + `1:11` into a `Core.Const`, which the compiler can propagate into the call + to `getindex` in order to infer the length of the result: + + ```@setup first_11_code_warntype + using InteractiveUtils + first_11(itr) = itr[1:11] + ``` + + ```@repl first_11_code_warntype + @code_warntype first_11(Tuple(1:13)) + ``` + + In contrast, running `Test.@inferred Tuple(1:13)[1:11]` would amount to + checking whether the compiler can compute the result type of `getindex` + given only the argument types `NTuple{13, Int64}` and `UnitRange{Int64}`, + which it cannot do: + + ```@raw html +
+ ``` + ```@repl fake_inference_test + @code_warntype Tuple(1:13)[1:11] + ``` + ```@raw html + + ``` + ```@repl inference_test + @code_warntype Tuple(1:13)[1:11] # hide + ``` + ```@raw html +

+ ``` + +Although `itr[1:10]` is always inferrable when `itr` is a `Tuple`, `itr[1:11]` +has a type instability whenever `itr` contains more than 13 items. More +generally, `itr[1:N]` seems to be unstable for all `N > 10` whenever `itr` +contains more than `N + 2` items. This type instability can be fixed by +replacing `getindex` with `unrolled_take`: + +```@repl inference_test +unrolled_first_11(itr) = unrolled_take(itr, Val(11)) +Test.@inferred unrolled_first_11(Tuple(1:14)); +``` + +Even when the final result of a function is inferred, there can be intermediate +steps in the function with type instabilities that trigger allocations: + +```@repl inference_test +function add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += length(itr[n]) + end +end +Test.@inferred add_lengths(((1, 2), (1, 2, 3))) +@allocated add_lengths(((1, 2), (1, 2, 3))) +@code_warntype add_lengths(((1, 2), (1, 2, 3))) +``` + +The output of `@code_warntype` is quite cluttered, but the most important detail +here is that the call to `getindex` does not get inferred because it can result +in either a `Tuple` of length 2 or a `Tuple` of length 3. This type instability +can be fixed by replacing `getindex` with `unrolled_applyat`: + +```@repl inference_test +function unrolled_add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += unrolled_applyat(length, n, itr) + end +end +unrolled_add_lengths(((1, 2), (1, 2, 3))) # hide +@allocated unrolled_add_lengths(((1, 2), (1, 2, 3))) +@code_warntype unrolled_add_lengths(((1, 2), (1, 2, 3))) +``` + +For a detailed breakdown of when the tools provided by this package can improve +performance, see the [User Guide](user_guide.md). + +## What Does Loop Unrolling Do + +When a loop over `N` indices is unrolled, it gets compiled into `N` lines of +LLVM code, where each line has a constant (`Core.Const`) index. 
For example, an +unrolled loop that prints every integer from 1 to 33 is compiled into the +following: + +```@raw html +
+``` +```@repl fake_inference_test +@code_llvm debuginfo=:none unrolled_foreach(println, Tuple(1:33)) +``` +```@raw html + +``` +```@repl inference_test +@code_llvm debuginfo=:none unrolled_foreach(println, Tuple(1:33)) # hide +``` +```@raw html +

+``` + +This LLVM code consists of 33 `getelementptr` instructions (each of which +extracts a value from a `Tuple` at a particular index), 33 `load` instructions, +and 33 `call` instructions (each of which switches execution to `println`). +Every `getelementptr` instruction has a constant index between 0 and 32; in more +complex examples where the `call` instructions get inlined, this constant index +can be propagated into the LLVM code of the function being called. On the other +hand, here is the LLVM code for the non-unrolled version of this loop: + +```@repl inference_test +@code_llvm debuginfo=:none foreach(println, Tuple(1:33)) +``` + +Although the first `getelementptr` instruction here has the constant index 0, +the other `getelementptr` instruction has a non-constant integer index. Also, +this LLVM code has conditional jump instructions for checking whether the last +index of the `Tuple` has been reached after each `getelementptr` instruction. + +## Downsides of Loop Unrolling + +```@setup tuple_of_tuples_test +using UnrolledUtilities, Test +tup32 = ntuple(Returns((1, 2)), 32) +``` + +Given the performance benefits of loop unrolling, it might seem at first that +the standard library needs more of it. However, the standard library is not just +meant for writing high-performance code with statically sized iterators—many of +its use cases involve code that is only executed once or several times. In such +cases, most of the execution time is required for compilation, and minimizing +run time makes no practical difference. 
Although unrolled functions can +occasionally be faster to compile than non-unrolled functions, they are +typically slower to compile, which means that using them instead of standard +library functions can often increase total execution time: + +```@repl tuple_of_tuples_test +tup32 = ntuple(Returns((1, 2)), 32); +@elapsed map(first, tup32) +@elapsed unrolled_map(first, tup32) +``` + +The increase in compilation time is usually no more than a factor of 5 for small +iterators, but it grows as iterator length increases: + +```@repl tuple_of_tuples_test +tup320 = ntuple(Returns((1, 2)), 320); +@elapsed map(first, tup320) +@elapsed unrolled_map(first, tup320) +``` + +Moreover, loop unrolling can sometimes increase the run time of a function in +addition to its compilation time: + +```@repl tuple_of_tuples_test +@elapsed Tuple(Iterators.product(tup32, tup32)) # compilation time + run time +@elapsed Tuple(Iterators.product(tup32, tup32)) # only run time +@elapsed unrolled_product(tup32, tup32) # compilation time + run time +@elapsed unrolled_product(tup32, tup32) # only run time +``` + +This increase in run time is most likely due to the larger size of unrolled +code, which makes it take longer to load. Nevertheless, loop unrolling still +offers the benefit of eliminating the unstable return type in this example: + +```@repl tuple_of_tuples_test +Test.@inferred Tuple(Iterators.product(tup32, tup32)); +Test.@inferred unrolled_product(tup32, tup32); +``` + +So, when type instabilities and memory allocations need to be removed +([as is required for static compilation](https://github.com/brenhinkeller/StaticTools.jl#limitations)) +and the cost to total execution time is more or less irrelevant, using unrolled +functions is probably worthwhile. Otherwise, if a significant increase in +compilation time (and potentially also run time) needs to be avoided, using +standard library functions might be a better option. 
+ +It is usually a good idea to compare the performance of unrolled code against +non-unrolled code before settling on a particular design. Many examples of such +comparisons can be found in the [tables of benchmarks](comparison_tables.md) +that are automatically generated for this package. diff --git a/docs/src/user_guide.md b/docs/src/user_guide.md new file mode 100644 index 0000000..676de77 --- /dev/null +++ b/docs/src/user_guide.md @@ -0,0 +1,257 @@ +```@meta +CurrentModule = UnrolledUtilities +``` + +```@setup inference_test +using UnrolledUtilities, InteractiveUtils, Test +``` + +# When to Use UnrolledUtilities + +The functions and types exported by this package tend to perform better than +their counterparts from `Base` and `Base.Iterators` in the scenarios listed +below. Additional examples and more precise measurements can be found in the +automatically generated [tables of benchmarks](comparison_tables.md). + +##### Outline: + +```@contents +Pages = ["user_guide.md"] +Depth = 2:3 +``` + +## When to Use Unrolled Functions + +### Long iterators + +- `map` has an unstable return type for iterators with lengths greater than 32: + + ```@repl inference_test + Test.@inferred map(one, Tuple(1:31)); + Test.@inferred map(one, Tuple(1:32)); + Test.@inferred unrolled_map(one, Tuple(1:32)); + ``` + +- `getindex` has an unstable return type for `Core.Const` slices of length + `N > 10` from iterators with lengths greater than `N + 2`: + + ```@repl inference_test + first_11(itr) = itr[1:11] + Test.@inferred first_11(Tuple(1:13)); + Test.@inferred first_11(Tuple(1:14)); + unrolled_first_11(itr) = unrolled_take(itr, Val(11)) + Test.@inferred unrolled_first_11(Tuple(1:14)); + ``` + +- For benchmarks that indicate performance improvements when using unrolled + functions with long iterators, see [Isolated Unrolled Functions](@ref) + +### Iterators with elements of different types + +- `in` has an intermediate type instability that triggers allocations for + nonuniform 
iterators: + + ```@repl inference_test + @allocated () in ((1, 2), (1, 2, 3)) + @allocated unrolled_in((), ((1, 2), (1, 2, 3))) + ``` + +- `any`, `all`, and `foreach` have intermediate type instabilities that trigger + allocations for nonuniform iterators with lengths greater than 32: + + ```@repl inference_test + const nonuniform_itr_of_length_32 = (ntuple(Returns((1, 2)), 31)..., (1, 2, 3)); + const nonuniform_itr_of_length_33 = (ntuple(Returns((1, 2)), 32)..., (1, 2, 3)); + @allocated any(isempty, nonuniform_itr_of_length_32) + @allocated any(isempty, nonuniform_itr_of_length_33) + @allocated unrolled_any(isempty, nonuniform_itr_of_length_33) + ``` + +- `getindex` has an unstable return type for nonuniform iterators when given + non-constant (i.e., not `Core.Const`) indices, which can lead to intermediate + type instabilities that trigger allocations: + + ```@repl inference_test + function add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += length(itr[n]) + end + end + add_lengths(((1, 2), (1, 2, 3))) # hide + @allocated add_lengths(((1, 2), (1, 2, 3))) + function unrolled_add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += unrolled_applyat(length, n, itr) + end + end + unrolled_add_lengths(((1, 2), (1, 2, 3))) # hide + @allocated unrolled_add_lengths(((1, 2), (1, 2, 3))) + ``` + + !!! note "Note" + ##### *How can `unrolled_applyat` be stable if `n` isn't a `Core.Const`?* + + For the example of `add_lengths`, the compiler must infer the return + type of `itr[::Int64]` before it can compile the call to `length`. + Since this return type depends on the index `n`, the compiler needs to + insert a runtime lookup into the method table that determines which + method of `length` to call, `length(::Tuple{Int64, Int64})` or + `length(::Tuple{Int64, Int64, Int64})`, and this triggers allocations. 
+ + For the example of `unrolled_add_lengths`, the compiler instead infers + the return types of `itr[::Core.Const(1)]`, `itr[::Core.Const(2)]`, + and so on for every index into `itr`. Then, it compiles a call to + `length` for each of these return types, and it inserts a runtime + [switch instruction](https://llvm.org/docs/LangRef.html#switch-instruction) + that determines which result of `length` to return for a particular + value of `n`. As long as `length` itself only returns one type (in this + case, `Int64`), this ensures that `unrolled_add_lengths` has no + intermediate type instabilities. + + In other words, `unrolled_applyat` combines multiple methods for `length` + and `getindex` into a single method, replacing the inefficient method + table lookup that switches between them with a simpler switch instruction. + + !!! tip "Tip" + ##### *When should `getindex` be replaced with `unrolled_applyat`?* + + The specific example above could be simplified by using `mapreduce`, + instead of using a `for`-loop in conjunction with `unrolled_applyat`: + + ```@repl + @allocated mapreduce(length, +, ((1, 2), (1, 2, 3))) + ``` + + However, there are often situations in which it is not possible to + replace loops with function calls, like when those loops are parallelized + over CPU or GPU threads. Moreover, CUDA is unable to compile any kernels + with type instabilities that trigger allocations, so `unrolled_applyat` is + *required* in order to parallelize over nonuniform iterators on GPUs. 
+ +- For benchmarks that indicate performance improvements when using unrolled + functions with nonuniform iterators, see [Isolated Unrolled Functions](@ref) + and [Nested Unrolled Functions](@ref) + +### Reduction operations with non-constant return types + +- `reduce` and `accumulate` have unstable return types when the return type of + `op` is not constant, but only for iterator lengths greater than 32: + + ```@repl inference_test + Test.@inferred reduce(tuple, Tuple(1:32)); + Test.@inferred reduce(tuple, Tuple(1:33)); + Test.@inferred unrolled_reduce(tuple, Tuple(1:33)); + ``` + +- For benchmarks that indicate performance improvements when using unrolled + functions with nonuniform reductions, see [Isolated Unrolled Functions](@ref) + +### Operations with more than 2 levels of recursion + +- All functions in Julia have a default "recursion limit" of 2; unless this + limit is modified, it forces any function that recursively calls itself 2 or + more times to have an unstable return type: + + ```@repl inference_test + recursive_length(itr) = + eltype(itr) <: Tuple ? mapreduce(recursive_length, +, itr) : length(itr) + Test.@inferred recursive_length(((1, 2), (1, 2, 3))); + Test.@inferred recursive_length((((1,), (2,)), (1, 2, 3))); + unrolled_recursive_length(itr) = + eltype(itr) <: Tuple ? + unrolled_mapreduce(unrolled_recursive_length, +, itr) : length(itr) + Test.@inferred unrolled_recursive_length((((1,), (2,)), (1, 2, 3))); + ``` + + !!! note "Note" + ##### *Is there any other way to avoid the default recursion limit?* + + The default recursion limit applies to all functions defined in `Base` and + `Base.Iterators`, so those functions will have unstable return types for + more than 2 levels of recursion, even when all user-defined functions + passed to them have had their recursion limits disabled. It is also + impossible to modify the recursion limits of functions defined in `Base` + from external packages. 
This means that the only way to avoid the default + recursion limit is to not use certain functions from `Base`, and instead + to define alternatives without any recursion limits. + +- For benchmarks that indicate performance improvements when using unrolled + functions with recursive operations, see [Recursive Unrolled Functions](@ref) + +## When to Use `StaticOneTo` and `StaticBitVector` + +### Iterators of `Int`s from 1 to `N` + +```@docs +StaticOneTo +``` + +If an iterator only contains the integers from 1 to `N ≥ 0`, it is possible to +provide the compiler with the values in the iterator in addition to their types +by using a `StaticOneTo`, as opposed to a `Tuple` or something similar. This +can allow the compiler to fully optimize out code that depends on those values, +essentially moving the code's execution from run time to compilation time: + +```@repl inference_test +@code_llvm debuginfo=:none reduce(+, (1, 2, 3)) +@code_llvm debuginfo=:none reduce(+, StaticOneTo(3)) +``` + +Standard library functions can sometimes take advantage of this optimization, +but for most non-trivial operations it is necessary to use unrolled functions: + +```@repl inference_test +@code_llvm debuginfo=:none mapreduce(abs2, +, StaticOneTo(3)) +@code_llvm debuginfo=:none mapreduce(log, +, StaticOneTo(3)) +@code_llvm debuginfo=:none unrolled_mapreduce(log, +, StaticOneTo(3)) +``` + +For benchmarks that indicate performance improvements when using `StaticOneTo`s, +see [Very Long Iterators](@ref). + +!!! note "Note" + ##### *Can the compiler infer iterator values in other scenarios?* + + The compiler can usually infer the values of iterators that only contain + [singletons](https://docs.julialang.org/en/v1/manual/types/#man-singleton-types) + when they are accessed using `Core.Const` indices, but this is not possible + for non-singletons (e.g., integers) unless some special type of iterator is + used (e.g., a `StaticOneTo`). 
+ +### Long iterators of `Bool`s that get modified across loop iterations + +```@docs +StaticBitVector +``` + +Loops in Julia often allocate memory when a value larger than 32 bytes in size +is modified across loop iterations (regardless of whether the loops are unrolled +or not). Since `Bool`s are represented by bytes, this limits certain types of +loops to modifying [bitmasks](https://en.wikipedia.org/wiki/Mask_(computing)) of +no more than 32 `Bool`s in order to avoid allocations. Unlike an iterator of +`Bool`s, though, a `StaticBitVector` stores 8 bits in every byte, which makes it +possible to modify up to 256 bits at a time in loops without any allocations: + +```@repl inference_test +random_bit_flips(itr) = reduce( + (itr′, i) -> Base.setindex(itr′, !itr′[rand(1:i)], i), + 1:length(itr); + init = itr, +) +@allocated random_bit_flips(ntuple(Returns(true), Val(32))) # hide +@allocated random_bit_flips(ntuple(Returns(true), Val(32))) +@allocated random_bit_flips(ntuple(Returns(true), Val(33))) # hide +@allocated random_bit_flips(ntuple(Returns(true), Val(33))) +@allocated random_bit_flips(StaticBitVector{256}(true)) # hide +@allocated random_bit_flips(StaticBitVector{256}(true)) +``` + +As with `StaticOneTo`s, standard library functions can occasionally take +advantage of the optimization allowed by `StaticBitVector`s, but most complex +use cases require unrolled functions. + +For benchmarks that indicate performance improvements when using long +`StaticBitVector`s that get modified across loop iterations, see +[Nested Unrolled Closures](@ref). 
diff --git a/ext/UnrolledUtilitiesStaticArraysExt.jl b/ext/UnrolledUtilitiesStaticArraysExt.jl new file mode 100644 index 0000000..67058a7 --- /dev/null +++ b/ext/UnrolledUtilitiesStaticArraysExt.jl @@ -0,0 +1,12 @@ +module UnrolledUtilitiesStaticArraysExt + +import UnrolledUtilities +import StaticArrays: SVector, MVector + +@inline UnrolledUtilities.output_type_for_promotion(::SVector) = SVector +@inline UnrolledUtilities.constructor_from_tuple(::Type{SVector}) = SVector + +@inline UnrolledUtilities.output_type_for_promotion(::MVector) = MVector +@inline UnrolledUtilities.constructor_from_tuple(::Type{MVector}) = MVector + +end diff --git a/logo-white.png b/logo-dark.png similarity index 100% rename from logo-white.png rename to logo-dark.png diff --git a/logo-dark.svg b/logo-dark.svg new file mode 100644 index 0000000..c8c43ac --- /dev/null +++ b/logo-dark.svg @@ -0,0 +1,131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + UnrolledUtilities.jl + + diff --git a/src/StaticBitVector.jl b/src/StaticBitVector.jl new file mode 100644 index 0000000..d41bffa --- /dev/null +++ b/src/StaticBitVector.jl @@ -0,0 +1,194 @@ +""" + StaticBitVector{N, [U]}(f) + StaticBitVector{N, [U]}([bit]) + +A statically sized analogue of `BitVector` with `Unsigned` chunks of type `U`, +which can be constructed using either a function `f(n)` or a constant `bit`. By +default, `U` is set to `UInt8` and `bit` is set to `false`. + +This iterator can only store `Bool`s, so its `output_type_for_promotion` is a +`ConditionalOutputType`. Efficient implementations are provided for all unrolled +functions, though the methods for `unrolled_map` and `unrolled_accumulate` only +apply when the first item in the output is a `Bool`. +""" +struct StaticBitVector{N, U <: Unsigned, I <: NTuple{<:Any, U}} <: + StaticSequence{N} + ints::I +end +@inline StaticBitVector{N, U}(ints) where {N, U} = + StaticBitVector{N, U, typeof(ints)}(ints) +@inline StaticBitVector{N}(args...) 
where {N} = + StaticBitVector{N, UInt8}(args...) + +@inline function StaticBitVector{N, U}(bit::Bool = false) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = ntuple(Returns(bit ? ~zero(U) : zero(U)), Val(n_ints)) + return StaticBitVector{N, U}(ints) +end + +@inline function StaticBitVector{N, U}(f::Function) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = ntuple(Val(n_ints)) do int_index + @inline + first_index = n_bits_per_int * (int_index - 1) + 1 + unrolled_reduce( + StaticOneTo(min(n_bits_per_int, N - first_index + 1)); + init = zero(U), + ) do int, bit_index + @inline + bit_offset = bit_index - 1 + int | U(f(first_index + bit_offset)::Bool) << bit_offset + end + end + return StaticBitVector{N, U}(ints) +end + +@inline function int_index_and_bit_offset(::Type{U}, n) where {U} + int_offset, bit_offset = divrem(n - 1, 8 * sizeof(U)) + return (int_offset + 1, bit_offset) +end + +@inline function generic_getindex( + itr::StaticBitVector{<:Any, U}, + n::Integer, +) where {U} + int_index, bit_offset = int_index_and_bit_offset(U, n) + int = itr.ints[int_index] + return Bool(int >> bit_offset & one(int)) +end + +@inline function Base.setindex( + itr::StaticBitVector{N, U}, + bit::Bool, + n::Integer, +) where {N, U} + int_index, bit_offset = int_index_and_bit_offset(U, n) + int = itr.ints[int_index] + new_int = int & ~(one(U) << bit_offset) | U(bit) << bit_offset + ints = Base.setindex(itr.ints, new_int, int_index) + return StaticBitVector{N, U}(ints) +end + +@inline output_type_for_promotion(::StaticBitVector{<:Any, U}) where {U} = + ConditionalOutputType(Bool, StaticBitVector{<:Any, U}) + +@inline empty_output(::Type{StaticBitVector{<:Any, U}}) where {U} = + StaticBitVector{0, U}() + +@inline unrolled_map_into(::Type{StaticBitVector{<:Any, U}}, f, itr) where {U} = + StaticBitVector{length(itr), U}( + Base.Fix1(generic_getindex, Iterators.map(f, itr)), + ) + +@inline function 
unrolled_accumulate_into( + ::Type{StaticBitVector{<:Any, U}}, + op, + itr, + init, + transform, +) where {U} + N = length(itr) + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = unrolled_accumulate( + StaticOneTo(n_ints); + init = (nothing, init), + transform = first, + ) do (_, init_value_for_new_int), int_index + @inline + first_index = n_bits_per_int * (int_index - 1) + 1 + unrolled_reduce( + StaticOneTo(min(n_bits_per_int, N - first_index + 1)); + init = (zero(U), init_value_for_new_int), + ) do (int, prev_value), bit_index + @inline + bit_offset = bit_index - 1 + item = generic_getindex(itr, first_index + bit_offset) + new_value = + first_index + bit_offset == 1 && prev_value isa NoInit ? + item : op(prev_value, item) + (int | U(transform(new_value)::Bool) << bit_offset, new_value) + end + end + return StaticBitVector{N, U}(ints) +end + +@inline function unrolled_push_into( + ::Type{StaticBitVector{<:Any, U}}, + itr, + bit, +) where {U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(length(itr), n_bits_per_int) + bit_offset = length(itr) % n_bits_per_int + ints = if bit_offset == 0 + (itr.ints..., U(bit)) + else + last_int = itr.ints[n_ints] + new_last_int = + last_int & ~(one(U) << bit_offset) | U(bit) << bit_offset + (unrolled_take(itr.ints, Val(n_ints - 1))..., new_last_int) + end + return StaticBitVector{length(itr) + 1, U}(ints) +end + +@inline function unrolled_append_into( + ::Type{StaticBitVector{<:Any, U}}, + itr1, + itr2, +) where {U} + n_bits_per_int = 8 * sizeof(U) + n_ints1 = cld(length(itr1), n_bits_per_int) + bit_offset = length(itr1) % n_bits_per_int + ints = if bit_offset == 0 || length(itr2) == 0 + (itr1.ints..., itr2.ints...) + else + mid_int1 = itr1.ints[n_ints1] + mid_int2 = itr2.ints[1] + mid_int = + mid_int1 & ~(~zero(U) << bit_offset) | mid_int2 << bit_offset + final_ints = + length(itr2) + bit_offset <= n_bits_per_int ? 
() : + unrolled_drop(itr2, Val(n_bits_per_int - bit_offset)).ints + (unrolled_take(itr1.ints, Val(n_ints1 - 1))..., mid_int, final_ints...) + end + return StaticBitVector{length(itr1) + length(itr2), U}(ints) +end + +@inline function unrolled_take_into( + ::Type{StaticBitVector{<:Any, U}}, + itr, + ::Val{N}, +) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = unrolled_take(itr.ints, Val(n_ints)) + return StaticBitVector{N, U}(ints) +end + +@inline function unrolled_drop_into( + ::Type{StaticBitVector{<:Any, U}}, + itr, + ::Val{N}, +) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(length(itr) - N, n_bits_per_int) + n_dropped_ints = fld(N, n_bits_per_int) + bit_offset = N - n_bits_per_int * n_dropped_ints + ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints)) + ints = if bit_offset == 0 || length(itr) <= N + ints_without_offset + else + next_ints = + length(ints_without_offset) == 1 ? (nothing,) : + (unrolled_drop(ints_without_offset, Val(1))..., nothing) + unrolled_map(ints_without_offset, next_ints) do cur_int, next_int + @inline + isnothing(next_int) ? cur_int >> bit_offset : + cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset) + end + end + return StaticBitVector{length(itr) - N, U}(ints) +end diff --git a/src/StaticOneTo.jl b/src/StaticOneTo.jl new file mode 100644 index 0000000..3a80b0d --- /dev/null +++ b/src/StaticOneTo.jl @@ -0,0 +1,18 @@ +""" + StaticOneTo(N) + +A lazy and statically sized analogue of `Base.OneTo(N)`. + +This iterator can only store the integers from 1 to `N`, so its +`output_type_for_promotion` is `NoOutputType()`. An efficient method is provided +for `unrolled_take`, but no other unrolled functions can use `StaticOneTo`s as +output types. 
+""" +struct StaticOneTo{N} <: StaticSequence{N} end +@inline StaticOneTo(N) = StaticOneTo{N}() + +@inline generic_getindex(::StaticOneTo, n) = n + +@inline output_type_for_promotion(::StaticOneTo) = NoOutputType() + +@inline unrolled_take(::StaticOneTo, ::Val{N}) where {N} = StaticOneTo(N) diff --git a/src/UnrolledUtilities.jl b/src/UnrolledUtilities.jl index dc69559..da4d860 100644 --- a/src/UnrolledUtilities.jl +++ b/src/UnrolledUtilities.jl @@ -4,10 +4,14 @@ export unrolled_any, unrolled_all, unrolled_foreach, unrolled_map, + unrolled_applyat, unrolled_reduce, unrolled_mapreduce, - unrolled_zip, - unrolled_enumerate, + unrolled_accumulate, + unrolled_push, + unrolled_append, + unrolled_take, + unrolled_drop, unrolled_in, unrolled_unique, unrolled_filter, @@ -15,114 +19,161 @@ export unrolled_any, unrolled_flatten, unrolled_flatmap, unrolled_product, - unrolled_applyat, - unrolled_take, - unrolled_drop - -inferred_length(::Type{<:NTuple{N, Any}}) where {N} = N -# We could also add support for statically-sized iterators that are not Tuples. - -f_exprs(itr_type) = (:(f(itr[$n])) for n in 1:inferred_length(itr_type)) -@inline @generated unrolled_any(f, itr) = Expr(:||, f_exprs(itr)...) -@inline @generated unrolled_all(f, itr) = Expr(:&&, f_exprs(itr)...) - -function zipped_f_exprs(itr_types) - L = length(itr_types) - L == 0 && error("unrolled functions need at least one iterator as input") - N = minimum(inferred_length, itr_types) - return (:(f($((:(itrs[$l][$n]) for l in 1:L)...))) for n in 1:N) -end -@inline @generated unrolled_foreach(f, itrs...) = - Expr(:block, zipped_f_exprs(itrs)..., nothing) -@inline @generated unrolled_map(f, itrs...) = - Expr(:tuple, zipped_f_exprs(itrs)...) 
- -function nested_op_expr(itr_type) - N = inferred_length(itr_type) - N == 0 && error("unrolled_reduce needs an `init` value for empty iterators") - item_exprs = (:(itr[$n]) for n in 1:N) - return reduce((expr1, expr2) -> :(op($expr1, $expr2)), item_exprs) -end -@inline @generated unrolled_reduce_without_init(op, itr) = nested_op_expr(itr) - -struct NoInit end + StaticOneTo, + StaticBitVector + +include("unrollable_iterator_interface.jl") +include("recursively_unrolled_functions.jl") +include("generatively_unrolled_functions.jl") + +struct NoInit end # Analogue of Base._InitialValue for reduction/accumulation. + +@inline unrolled_any(f, itr) = + (rec_unroll(itr) ? rec_unrolled_any : gen_unrolled_any)(f, itr) +@inline unrolled_any(itr) = unrolled_any(identity, itr) + +@inline unrolled_all(f, itr) = + (rec_unroll(itr) ? rec_unrolled_all : gen_unrolled_all)(f, itr) +@inline unrolled_all(itr) = unrolled_all(identity, itr) + +@inline unrolled_foreach(f, itr) = + (rec_unroll(itr) ? rec_unrolled_foreach : gen_unrolled_foreach)(f, itr) +@inline unrolled_foreach(f, itrs...) = unrolled_foreach(splat(f), zip(itrs...)) + +@inline unrolled_map_into_tuple(f, itr) = + (rec_unroll(itr) ? rec_unrolled_map : gen_unrolled_map)(f, itr) +@inline unrolled_map_into(output_type, f, itr) = + constructor_from_tuple(output_type)(unrolled_map_into_tuple(f, itr)) +@inline unrolled_map(f, itr) = + unrolled_map_into(inferred_output_type(Iterators.map(f, itr)), f, itr) +@inline unrolled_map(f, itrs...) = unrolled_map(splat(f), zip(itrs...)) + +@inline unrolled_applyat(f, n, itr) = + (rec_unroll(itr) ? rec_unrolled_applyat : gen_unrolled_applyat)(f, n, itr) +@inline unrolled_applyat(f, n, itrs...) = + unrolled_applyat(splat(f), n, zip(itrs...)) +@inline unrolled_applyat_bounds_error() = + error("unrolled_applyat has detected an out-of-bounds index") + +@inline unrolled_reduce(op, itr, init) = + (rec_unroll(itr) ? 
rec_unrolled_reduce : gen_unrolled_reduce)(op, itr, init) @inline unrolled_reduce(op, itr; init = NoInit()) = - unrolled_reduce_without_init(op, init isa NoInit ? itr : (init, itr...)) + isempty(itr) && init isa NoInit ? + error("unrolled_reduce requires an init value for empty iterators") : + unrolled_reduce(op, itr, init) + +# TODO: Figure out why unrolled_reduce(op, Val(N), init) compiles faster than +# unrolled_reduce(op, StaticOneTo(N), init) for the non-orographic gravity wave +# parametrization test in ClimaAtmos, to the point where the StaticOneTo version +# appears to completely hang while the Val version compiles in under a second. +@inline unrolled_reduce(op, val_N::Val, init) = + val_unrolled_reduce(op, val_N, init) +@inline unrolled_reduce(op, val_N::Val; init = NoInit()) = + val_N isa Val{0} && init isa NoInit ? + error("unrolled_reduce requires an init value for empty iterators") : + unrolled_reduce(op, val_N, init) @inline unrolled_mapreduce(f, op, itrs...; init = NoInit()) = - unrolled_reduce(op, unrolled_map(f, itrs...); init) + unrolled_reduce(op, Iterators.map(f, itrs...), init) + +@inline unrolled_accumulate_into_tuple(op, itr, init, transform) = + (rec_unroll(itr) ? 
rec_unrolled_accumulate : gen_unrolled_accumulate)( + op, + itr, + init, + transform, + ) +@inline unrolled_accumulate_into(output_type, op, itr, init, transform) = + constructor_from_tuple(output_type)( + unrolled_accumulate_into_tuple(op, itr, init, transform), + ) +@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) = + unrolled_accumulate_into( + accumulate_output_type(op, itr, init, transform), + op, + itr, + init, + transform, + ) + +@inline unrolled_push_into(output_type, itr, item) = + constructor_from_tuple(output_type)((itr..., item)) +@inline unrolled_push(itr, item) = + unrolled_push_into(inferred_output_type(itr), itr, item) + +@inline unrolled_append_into(output_type, itr1, itr2) = + constructor_from_tuple(output_type)((itr1..., itr2...)) +@inline unrolled_append(itr1, itr2) = + unrolled_append_into(promoted_output_type((itr1, itr2)), itr1, itr2) -@inline unrolled_zip(itrs...) = unrolled_map(tuple, itrs...) +@inline unrolled_take_into(output_type, itr, ::Val{N}) where {N} = + constructor_from_tuple(output_type)( + ntuple(Base.Fix1(generic_getindex, itr), Val(N)), + ) +@inline unrolled_take(itr, val_N) = + unrolled_take_into(inferred_output_type(itr), itr, val_N) -@inline unrolled_enumerate(itrs...) = - unrolled_zip(ntuple(identity, Val(length(itrs[1]))), itrs...) +@inline unrolled_drop_into(output_type, itr, ::Val{N}) where {N} = + constructor_from_tuple(output_type)( + ntuple(n -> generic_getindex(itr, N + n), Val(length(itr) - N)), + ) +@inline unrolled_drop(itr, val_N) = + unrolled_drop_into(inferred_output_type(itr), itr, val_N) @inline unrolled_in(item, itr) = unrolled_any(Base.Fix1(===, item), itr) # Using === instead of == or isequal improves type stability for singletons. @inline unrolled_unique(itr) = - unrolled_reduce(itr; init = ()) do unique_items, item + unrolled_reduce(itr; init = inferred_empty(itr)) do unique_items, item @inline - unrolled_in(item, unique_items) ? 
unique_items : (unique_items..., item) + unrolled_in(item, unique_items) ? unique_items : + unrolled_push(unique_items, item) end @inline unrolled_filter(f, itr) = - unrolled_reduce(itr; init = ()) do filtered_items, item + unrolled_reduce(itr; init = inferred_empty(itr)) do items_with_true_f, item @inline - f(item) ? (filtered_items..., item) : filtered_items + f(item) ? unrolled_push(items_with_true_f, item) : items_with_true_f end @inline unrolled_split(f, itr) = - unrolled_reduce(itr; init = ((), ())) do (f_items, not_f_items), item + unrolled_reduce( + itr; + init = (inferred_empty(itr), inferred_empty(itr)), + ) do (items_with_true_f, items_with_false_f), item @inline - f(item) ? ((f_items..., item), not_f_items) : - (f_items, (not_f_items..., item)) + f(item) ? (unrolled_push(items_with_true_f, item), items_with_false_f) : + (items_with_true_f, unrolled_push(items_with_false_f, item)) end @inline unrolled_flatten(itr) = - unrolled_reduce((item1, item2) -> (item1..., item2...), itr; init = ()) + unrolled_reduce(unrolled_append, itr; init = promoted_empty(itr)) @inline unrolled_flatmap(f, itrs...) = - unrolled_flatten(unrolled_map(f, itrs...)) + unrolled_flatten(Iterators.map(f, itrs...)) @inline unrolled_product(itrs...) = - unrolled_reduce(itrs; init = ((),)) do product_itr, itr + unrolled_reduce(itrs; init = (promoted_empty(itrs),)) do product_itr, itr @inline unrolled_flatmap(itr) do item @inline - unrolled_map(product_tuple -> (product_tuple..., item), product_itr) + unrolled_map_into_tuple(Base.Fix2(unrolled_push, item), product_itr) end end -@inline unrolled_applyat(f, n, itrs...) = unrolled_foreach( - (i, items...) -> i == n && f(items...), - unrolled_enumerate(itrs...), -) - -@inline unrolled_take(itr, ::Val{N}) where {N} = ntuple(i -> itr[i], Val(N)) -@inline unrolled_drop(itr, ::Val{N}) where {N} = - ntuple(i -> itr[N + i], Val(length(itr) - N)) -# When its second argument is a Val, ntuple is unrolled via Base.@ntuple. 
- -@static if hasfield(Method, :recursion_relation) - # Remove recursion limits for functions whose arguments are also functions. - for func in ( - unrolled_any, - unrolled_all, - unrolled_foreach, - unrolled_map, - unrolled_reduce_without_init, - unrolled_reduce, - unrolled_mapreduce, - unrolled_filter, - unrolled_split, - unrolled_flatmap, - unrolled_applyat, - ) - for method in methods(func) - method.recursion_relation = (_...) -> true - end - end -end +abstract type StaticSequence{N} end + +@inline Base.length(::StaticSequence{N}) where {N} = N +@inline Base.firstindex(::StaticSequence) = 1 +@inline Base.lastindex(itr::StaticSequence) = length(itr) +@inline Base.getindex(itr::StaticSequence, n::Integer) = + generic_getindex(itr, n) +@inline Base.iterate(itr::StaticSequence, n = 1) = + n > length(itr) ? nothing : (generic_getindex(itr, n), n + 1) + +include("StaticOneTo.jl") +include("StaticBitVector.jl") + +include("recursion_limits.jl") # This must be included at the end of the module. end diff --git a/src/generatively_unrolled_functions.jl b/src/generatively_unrolled_functions.jl new file mode 100644 index 0000000..91ec3af --- /dev/null +++ b/src/generatively_unrolled_functions.jl @@ -0,0 +1,60 @@ +@inline @generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = + Expr(:||, (:(f(generic_getindex(itr, $n))) for n in 1:N)...) +@inline gen_unrolled_any(f, itr) = _gen_unrolled_any(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = + Expr(:&&, (:(f(generic_getindex(itr, $n))) for n in 1:N)...) 
+@inline gen_unrolled_all(f, itr) = _gen_unrolled_all(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = + Expr(:block, (:(f(generic_getindex(itr, $n))) for n in 1:N)..., nothing) +@inline gen_unrolled_foreach(f, itr) = + _gen_unrolled_foreach(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = + Expr(:tuple, (:(f(generic_getindex(itr, $n))) for n in 1:N)...) +@inline gen_unrolled_map(f, itr) = _gen_unrolled_map(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_applyat(::Val{N}, f, n, itr) where {N} = Expr( + :block, + (:(n == $n && return f(generic_getindex(itr, $n))) for n in 1:N)..., + :(unrolled_applyat_bounds_error()), +) # This block gets optimized into a switch instruction during LLVM codegen. +@inline gen_unrolled_applyat(f, n, itr) = + _gen_unrolled_applyat(Val(length(itr)), f, n, itr) + +@inline @generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = + foldl( + init <: NoInit ? (2:N) : (1:N); + init = init <: NoInit ? :(generic_getindex(itr, 1)) : :init, + ) do prev_op_expr, n + :(op($prev_op_expr, generic_getindex(itr, $n))) + end # Use foldl instead of reduce to guarantee left associativity. +@inline gen_unrolled_reduce(op, itr, init) = + _gen_unrolled_reduce(Val(length(itr)), op, itr, init) + +@inline @generated function _gen_unrolled_accumulate( + ::Val{N}, + op, + itr, + init, + transform, +) where {N} + first_item_expr = :(generic_getindex(itr, 1)) + init_expr = init <: NoInit ? first_item_expr : :(op(init, $first_item_expr)) + transformed_exprs_and_op_exprs = + accumulate(1:N; init = (nothing, init_expr)) do (_, prev_op_expr), n + var = gensym() + op_expr = :(op($var, generic_getindex(itr, $(n + 1)))) + (:($var = $prev_op_expr; transform($var)), op_expr) + end + return Expr(:tuple, Iterators.map(first, transformed_exprs_and_op_exprs)...) 
+end +@inline gen_unrolled_accumulate(op, itr, init, transform) = + _gen_unrolled_accumulate(Val(length(itr)), op, itr, init, transform) + +# NOTE: The following is experimental and will likely be removed in the future. +@inline @generated val_unrolled_reduce(op, ::Val{N}, init) where {N} = + foldl(init <: NoInit ? (1:N) : (:init, 1:N...)) do prev_op_expr, item_expr + :(op($prev_op_expr, $item_expr)) + end # Use foldl instead of reduce to guarantee left associativity. diff --git a/src/recursion_limits.jl b/src/recursion_limits.jl new file mode 100644 index 0000000..9f9c279 --- /dev/null +++ b/src/recursion_limits.jl @@ -0,0 +1,56 @@ +# Remove recursion limits from functions that call themselves, and also from all +# functions whose arguments can be arbitrary functions (including themselves). +@static if hasfield(Method, :recursion_relation) + for func in ( + generic_getindex, + output_type_for_promotion, + _rec_unrolled_any, + _rec_unrolled_all, + _rec_unrolled_foreach, + _rec_unrolled_map, + _rec_unrolled_applyat, + _rec_unrolled_reduce, + _rec_unrolled_accumulate, + rec_unrolled_any, + rec_unrolled_all, + rec_unrolled_foreach, + rec_unrolled_map, + rec_unrolled_applyat, + rec_unrolled_reduce, + rec_unrolled_accumulate, + _gen_unrolled_any, + _gen_unrolled_all, + _gen_unrolled_foreach, + _gen_unrolled_map, + _gen_unrolled_applyat, + _gen_unrolled_reduce, + _gen_unrolled_accumulate, + gen_unrolled_any, + gen_unrolled_all, + gen_unrolled_foreach, + gen_unrolled_map, + gen_unrolled_applyat, + gen_unrolled_reduce, + gen_unrolled_accumulate, + val_unrolled_reduce, + unrolled_any, + unrolled_all, + unrolled_foreach, + unrolled_map_into_tuple, + unrolled_map_into, + unrolled_map, + unrolled_applyat, + unrolled_reduce, + unrolled_mapreduce, + unrolled_accumulate_into_tuple, + unrolled_accumulate_into, + unrolled_accumulate, + unrolled_filter, + unrolled_split, + unrolled_flatmap, + ) + for method in methods(func) + method.recursion_relation = Returns(true) + end + end 
+end diff --git a/src/recursively_unrolled_functions.jl b/src/recursively_unrolled_functions.jl new file mode 100644 index 0000000..db88bef --- /dev/null +++ b/src/recursively_unrolled_functions.jl @@ -0,0 +1,47 @@ +@inline _rec_unrolled_any(f) = false +@inline _rec_unrolled_any(f, item, items...) = + f(item) || _rec_unrolled_any(f, items...) +@inline rec_unrolled_any(f, itr) = _rec_unrolled_any(f, itr...) + +@inline _rec_unrolled_all(f) = true +@inline _rec_unrolled_all(f, item, items...) = + f(item) && _rec_unrolled_all(f, items...) +@inline rec_unrolled_all(f, itr) = _rec_unrolled_all(f, itr...) + +@inline _rec_unrolled_foreach(f) = nothing +@inline _rec_unrolled_foreach(f, item, items...) = + (f(item); _rec_unrolled_foreach(f, items...)) +@inline rec_unrolled_foreach(f, itr) = _rec_unrolled_foreach(f, itr...) + +@inline _rec_unrolled_map(f) = () +@inline _rec_unrolled_map(f, item, items...) = + (f(item), _rec_unrolled_map(f, items...)...) +@inline rec_unrolled_map(f, itr) = _rec_unrolled_map(f, itr...) + +@inline _rec_unrolled_applyat(f, offset_n) = unrolled_applyat_bounds_error() +@inline _rec_unrolled_applyat(f, offset_n, item, items...) = + offset_n == 1 ? f(item) : _rec_unrolled_applyat(f, offset_n - 1, items...) +@inline rec_unrolled_applyat(f, n, itr) = _rec_unrolled_applyat(f, n, itr...) + +@inline _rec_unrolled_reduce(op, prev_value) = prev_value +@inline _rec_unrolled_reduce(op, prev_value, item, items...) = + _rec_unrolled_reduce(op, op(prev_value, item), items...) +@inline rec_unrolled_reduce(op, itr, init) = + init isa NoInit ? _rec_unrolled_reduce(op, itr...) : + _rec_unrolled_reduce(op, init, itr...) + +@inline _rec_unrolled_accumulate(op, transform, prev_value) = + (transform(prev_value),) +@inline _rec_unrolled_accumulate(op, transform, prev_value, item, items...) 
= ( + transform(prev_value), + _rec_unrolled_accumulate(op, transform, op(prev_value, item), items...)..., +) +@inline rec_unrolled_accumulate(op, itr, init, transform) = + isempty(itr) ? () : + init isa NoInit ? _rec_unrolled_accumulate(op, transform, itr...) : + _rec_unrolled_accumulate( + op, + transform, + op(init, generic_getindex(itr, 1)), + unrolled_drop(itr, Val(1))..., + ) diff --git a/src/unrollable_iterator_interface.jl b/src/unrollable_iterator_interface.jl new file mode 100644 index 0000000..f17705f --- /dev/null +++ b/src/unrollable_iterator_interface.jl @@ -0,0 +1,205 @@ +""" + rec_unroll(itr) + +Whether to use recursive loop unrolling instead of generative loop unrolling for +the iterator `itr`. + +In general, recursive loop unrolling is faster to compile for small iterators, +but it becomes extremely slow to compile for long iterators, and it usually +generates suboptimal LLVM code for long iterators. On the other hand, generative +loop unrolling is slow to compile for small iterators, but its compilation time +does not grow as rapidly with respect to iterator size, and it always generates +optimal LLVM code. The default is currently to use recursive unrolling for +iterator lengths up to 16, and to use generative unrolling for longer iterators. +""" +@inline rec_unroll(itr) = length(itr) <= 16 + +""" + generic_getindex(itr, n) + +Identical to `getindex(itr, n)`, but with the added ability to handle lazy +iterator types defined in the standard library, such as `Base.Generator` and +`Base.Iterators.Enumerate`. 
+""" +@inline generic_getindex(itr, n) = getindex(itr, n) +@inline generic_getindex(itr::Base.Generator, n) = + itr.f(generic_getindex(itr.iter, n)) +@inline generic_getindex(itr::Base.Iterators.Enumerate, n) = + (n, generic_getindex(itr.itr, n)) +@inline generic_getindex(itr::Base.Iterators.Zip, n) = + unrolled_map_into_tuple(Base.Fix2(generic_getindex, n), itr.is) + +@inline first_item_type(itr) = + Base.promote_op(Base.Fix2(generic_getindex, 1), typeof(itr)) +@inline second_item_type(itr) = + Base.promote_op(Base.Fix2(generic_getindex, 2), typeof(itr)) + +""" + output_type_for_promotion(itr) + +The type of output that unrolled functions should try to generate for the input +iterator `itr`, or a `ConditionalOutputType` if the output type depends on the +type of items that need to be stored in it, or `NoOutputType()` if `itr` is a +lazy iterator without any associated output type. Defaults to `Tuple`. +""" +@inline output_type_for_promotion(_) = Tuple +@inline output_type_for_promotion(::NamedTuple{names}) where {names} = + NamedTuple{names} +@inline output_type_for_promotion(itr::Base.Generator) = + output_type_for_promotion(itr.iter) +@inline output_type_for_promotion(itr::Base.Iterators.Enumerate) = + output_type_for_promotion(itr.itr) +@inline output_type_for_promotion(itr::Base.Iterators.Zip) = + maybe_ambiguous_promoted_output_type(itr.is) + +""" + AmbiguousOutputType + +The result of `output_type_for_promotion` for iterators that do not have +well-defined output types. +""" +abstract type AmbiguousOutputType end + +""" + NoOutputType() + +The `AmbiguousOutputType` of lazy iterators. +""" +struct NoOutputType <: AmbiguousOutputType end + +""" + ConditionalOutputType(allowed_item_type, output_type, [fallback_type]) + +An `AmbiguousOutputType` that can have one of two possible values. 
If the first +item in the output is a subtype of `allowed_item_type`, the output will have the +type `output_type`; otherwise, it will have the type `fallback_type`, which is +set to `Tuple` by default. +""" +struct ConditionalOutputType{I, O, O′} <: AmbiguousOutputType end +@inline ConditionalOutputType( + allowed_item_type::Type, + output_type::Type, + fallback_type::Type = Tuple, +) = ConditionalOutputType{allowed_item_type, output_type, fallback_type}() + +@inline unambiguous_output_type(_, ::Type{O}) where {O} = O +@inline unambiguous_output_type(_, ::NoOutputType) = Tuple +@inline unambiguous_output_type( + get_first_item_type, + ::ConditionalOutputType{I, O, O′}, +) where {I, O, O′} = get_first_item_type() <: I ? O : O′ + +""" + output_promote_rule(output_type1, output_type2) + +The type of output that should be generated when two iterators do not have the +same `output_type_for_promotion`, or `Union{}` if these iterators should not be +used together. Only one method of `output_promote_rule` needs to be defined for +any pair of output types. + +By default, all types take precedence over `NoOutputType()`, and the conditional +part of any `ConditionalOutputType` takes precedence over an unconditional type +(so that only the `fallback_type` of any conditional type gets promoted). The +default result for all other pairs of unequal output types is `Union{}`. 
+""" +@inline output_promote_rule(_, _) = Union{} +@inline output_promote_rule(::Type{O}, ::Type{O}) where {O} = O +@inline output_promote_rule(::NoOutputType, output_type) = output_type +@inline output_promote_rule( + ::ConditionalOutputType{I, O, O′}, + ::Type{O′′}, +) where {I, O, O′, O′′} = + ConditionalOutputType(I, O, output_promote_rule(O′, O′′)) +@inline output_promote_rule( + ::Type{O′}, + ::ConditionalOutputType{I, O, O′′}, +) where {I, O, O′, O′′} = + ConditionalOutputType(I, O, output_promote_rule(O′, O′′)) +@inline output_promote_rule( + ::ConditionalOutputType{I, O, O′}, + ::ConditionalOutputType{I, O, O′′}, +) where {I, O, O′, O′′} = + ConditionalOutputType(I, O, output_promote_rule(O′, O′′)) + +@inline function output_promote_result(O1, O2) + O12 = output_promote_rule(O1, O2) + O21 = output_promote_rule(O2, O1) + O12 == O21 == Union{} && + error("output_promote_rule is undefined for $O1 and $O2") + (O12 == O21 || O21 == Union{}) && return O12 + O12 == Union{} && return O21 + error("output_promote_rule yields inconsistent results for $O1 and $O2: \ + $O12 for $O1 followed by $O2, versus $O21 for $O2 followed by $O1") +end + +@inline maybe_ambiguous_promoted_output_type(itrs) = + isempty(itrs) ? Tuple : # Generate a Tuple when given 0 inputs. 
+ unrolled_mapreduce(output_type_for_promotion, output_promote_result, itrs) + +@inline inferred_output_type(itr) = + unambiguous_output_type(output_type_for_promotion(itr)) do + @inline + first_item_type(itr) + end +@inline inferred_output_type(itr::Base.Generator) = + unambiguous_output_type(output_type_for_promotion(itr.iter)) do + @inline + Base.promote_op(itr.f, first_item_type(itr.iter)) + end +@inline inferred_output_type(itr::Base.Iterators.Enumerate) = + unambiguous_output_type(output_type_for_promotion(itr.itr)) do + @inline + Tuple{Int, first_item_type(itr.itr)} + end +@inline inferred_output_type(itr::Base.Iterators.Zip) = + unambiguous_output_type(maybe_ambiguous_promoted_output_type(itr.is)) do + @inline + Tuple{unrolled_map_into_tuple(first_item_type, itr.is)...} + end + +@inline promoted_output_type(itrs) = + unambiguous_output_type(maybe_ambiguous_promoted_output_type(itrs)) do + @inline + first_item_type(generic_getindex(itrs, 1)) + end + +@inline accumulate_output_type(op, itr, init, transform) = + unambiguous_output_type(output_type_for_promotion(itr)) do + @inline + no_init = init isa NoInit + arg1_type = no_init ? first_item_type(itr) : typeof(init) + arg2_type = no_init ? second_item_type(itr) : first_item_type(itr) + Base.promote_op(transform, Base.promote_op(op, arg1_type, arg2_type)) + end + +""" + constructor_from_tuple(output_type) + +A function that can be used to efficiently construct an output of type +`output_type` from a `Tuple`, or `identity` if such an output should not be +constructed from a `Tuple`. Defaults to `identity`, which also handles the case +where `output_type` is already `Tuple`. The `output_type` here is guaranteed to +be a `Type`, rather than a `ConditionalOutputType` or `NoOutputType`. + +Many statically sized iterators (e.g., `SVector`s) are essentially wrappers for +`Tuple`s, and their constructors for `Tuple`s can be reduced to no-ops. 
The main +exceptions are [`StaticOneTo`](@ref UnrolledUtilities.StaticOneTo)s and +[`StaticBitVector`](@ref UnrolledUtilities.StaticBitVector)s, which do not +provide constructors for `Tuple`s because there is no performance benefit to +making a lazy or low-storage data structure once a corresponding high-storage +data structure has already been constructed. +""" +@inline constructor_from_tuple(::Type) = identity +@inline constructor_from_tuple(::Type{NT}) where {NT <: NamedTuple} = NT + +""" + empty_output(output_type) + +An empty output of type `output_type`. Defaults to applying the +`constructor_from_tuple` for the given type to an empty `Tuple`. +""" +@inline empty_output(output_type) = constructor_from_tuple(output_type)(()) + +@inline inferred_empty(itr) = empty_output(inferred_output_type(itr)) + +@inline promoted_empty(itrs) = empty_output(promoted_output_type(itrs)) diff --git a/test/aqua.jl b/test/aqua.jl index d7becf1..ff1edd1 100644 --- a/test/aqua.jl +++ b/test/aqua.jl @@ -1,3 +1,4 @@ +using Test import Aqua, UnrolledUtilities # This is separate from all the other tests because Aqua.test_all checks for diff --git a/test/runtests.jl b/test/runtests.jl index 631181c..0cfddab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,9 @@ using SafeTestsets @safetestset "Test and Analyze" begin @time include("test_and_analyze.jl") - print_comparison_table() + for (title, comparison_table_dict) in comparison_table_dicts + print_comparison_table(title, comparison_table_dict) + end end @safetestset "Aqua" begin @time include("aqua.jl") diff --git a/test/test_and_analyze.jl b/test/test_and_analyze.jl index 70415e3..b1beb7d 100644 --- a/test/test_and_analyze.jl +++ b/test/test_and_analyze.jl @@ -6,45 +6,79 @@ using InteractiveUtils using UnrolledUtilities -comparison_table_dict = OrderedDict() +comparison_table_dicts = OrderedDict() -function print_comparison_table(io = stdout, generate_html = false) +function print_comparison_table(title, 
comparison_table_dict, io = stdout) table_data = mapreduce(vcat, collect(comparison_table_dict)) do (key, entries) stack(entry -> (key..., entry...), entries; dims = 1) end - highlighter(f, color) = - generate_html ? HtmlHighlighter(f, HtmlDecoration(; color)) : - Highlighter(f, Crayon(; foreground = Symbol(color))) - - better_performance_but_harder_to_compile = - highlighter(generate_html ? "royalblue" : "blue") do data, i, j - data[i, 4] != data[i, 5] && - (endswith(data[i, 6], "slower") || endswith(data[i, 7], "more")) - end - better_performance = - highlighter(generate_html ? "mediumseagreen" : "green") do data, i, j - data[i, 4] != data[i, 5] - end - mixed_compilation = - highlighter(generate_html ? "mediumorchid" : "magenta") do data, i, j - (endswith(data[i, 6], "slower") && endswith(data[i, 7], "less")) || - (endswith(data[i, 6], "faster") && endswith(data[i, 7], "more")) - end - harder_to_compile = - highlighter(generate_html ? "indianred" : "red") do data, i, j - endswith(data[i, 6], "slower") || endswith(data[i, 7], "more") - end - easier_to_compile = - highlighter(generate_html ? "darkturquoise" : "cyan") do data, i, j - endswith(data[i, 6], "faster") || endswith(data[i, 7], "less") + writing_to_docs = io isa IOStream + + color(color_str) = + writing_to_docs ? HtmlDecoration(; color = color_str) : + Crayon(; foreground = Symbol(color_str)) + highlighter_color(optimization, run_time, compile_time, allocs) = + if contains(optimization, "better") || + contains(optimization, "fewer allocs") && + !contains(run_time, "more") || + contains(optimization, "identical") && contains(run_time, "less") + # better performance + if !contains(run_time, "more") && + !contains(compile_time, "more") && + !contains(allocs, "more") + # similar or better run time, compilation, and total allocations + if contains(optimization, "better") + # better optimization + color(writing_to_docs ? 
"darkturquoise" : "cyan") + else + # faster run time or fewer allocations at run time + color(writing_to_docs ? "mediumseagreen" : "green") + end + else + # worse run time, compilation, or total allocations + if contains(optimization, "better") + # better optimization + color(writing_to_docs ? "royalblue" : "blue") + else + # faster run time or fewer allocations at run time + color(writing_to_docs ? "khaki" : "yellow") + end + end + elseif contains(optimization, "identical") && + contains(run_time, "similar") + # similar performance + if contains(compile_time, "less") && !contains(allocs, "more") || + !contains(compile_time, "more") && contains(allocs, "less") + # better compilation or total allocations + color(writing_to_docs ? "mediumorchid" : "magenta") + elseif contains(compile_time, "less") && contains(allocs, "more") || + contains(compile_time, "more") && contains(allocs, "less") + # mixed compilation and total allocations + color(writing_to_docs ? "silver" : "light_gray") + elseif contains(compile_time, "similar") && + contains(allocs, "similar") + # similar compilation and total allocations + color(writing_to_docs ? "gray" : "dark_gray") + else + # worse compilation or total allocations + color(writing_to_docs ? "indianred" : "red") + end + else + # worse performance + color(writing_to_docs ? "indianred" : "red") end - no_difference = - highlighter((data, i, j) -> true, generate_html ? "khaki" : "yellow") + highlighter = (writing_to_docs ? HtmlHighlighter : Highlighter)( + Returns(true), + (_, data, row, _) -> highlighter_color(data[row, 6:9]...), + ) + + # TODO: Why does Sys.maxrss() always seem to be 0 on Ubuntu systems? + has_rss = any(contains('['), table_data[:, 9]) other_kwargs = - generate_html ? + writing_to_docs ? 
(; backend = Val(:html), table_style = Dict( @@ -53,38 +87,86 @@ function print_comparison_table(io = stdout, generate_html = false) ), ) : (; + title, + title_alignment = :c, title_same_width_as_table = true, - columns_width = [45, 45, 0, 0, 0, 0, 0], + columns_width = [45, 45, 15, 10, 30, 25, 20, 20, has_rss ? 30 : 20], linebreaks = true, autowrap = true, crop = :none, ) + if writing_to_docs + println(io, "## $title") + println(io, "```@raw html") + println(io, "
") # 80% of viewport + end pretty_table( io, table_data; - title = "Comparison of UnrolledUtilities to Base and Base.Iterators", - title_alignment = :c, alignment = :l, header = [ "Unrolled Expression", "Reference Expression", - "Iterator Contents", - "Unrolled Performance", - "Reference Performance", - "Unrolled Compilation Time", - "Unrolled Compilation Memory", + "Itr Type", + "Itr Length", + "Itr Contents", + "Optimization", + "Run Time", + "Compilation Time", + "Total $(has_rss ? "GC [and RSS] " : "")Allocations", ], - highlighters = ( - better_performance_but_harder_to_compile, - better_performance, - mixed_compilation, - harder_to_compile, - easier_to_compile, - no_difference, - ), + highlighters = highlighter, other_kwargs..., ) + if writing_to_docs + println(io, "
") + println(io, "```") + else + println(io) + end +end + +function time_string(nanoseconds) + nanoseconds == 0 && return "$nanoseconds ns" + n_decimal_digits = floor(Int, log10(nanoseconds) + 1) + return if n_decimal_digits <= 3 + "$nanoseconds ns" + elseif n_decimal_digits <= 6 + "$(round(Int, nanoseconds / 10^3)) μs" + elseif n_decimal_digits <= 9 + "$(round(Int, nanoseconds / 10^6)) ms" + else + "$(round(Int, nanoseconds / 10^9)) s" + end +end + +function memory_string(bytes) + bytes == 0 && return "$bytes B" + n_binary_digits = floor(Int, log2(bytes) + 1) + return if n_binary_digits <= 10 + "$bytes B" + elseif n_binary_digits <= 20 + "$(round(Int, bytes / 2^10)) kB" + elseif n_binary_digits <= 30 + "$(round(Int, bytes / 2^20)) MB" + else + "$(round(Int, bytes / 2^30)) GB" + end +end + +function comparison_string(value1, value2, to_string, to_number = identity) + ratio = to_number(value1) / to_number(value2) + ratio_str = if ratio >= 2 + floored_ratio = ratio == Inf ? Inf : floor(Int, ratio) + "$floored_ratio times more" + elseif inv(ratio) >= 2 + floored_inv_ratio = ratio == 0 ? Inf : floor(Int, inv(ratio)) + "$floored_inv_ratio times less" + else + "similar" + end + return "$ratio_str ($(to_string(value1)) vs. $(to_string(value2)))" end function drop_line_numbers(expr) @@ -118,18 +200,59 @@ function code_instance(f, args...) end end -macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) +macro benchmark(expression) + return quote + prev_time = time_ns() + $(esc(expression)) + new_time = time_ns() + best_time = new_time - prev_time + + # Benchmark for at most 0.1 s (10^8 ns), ignoring the first call above. 
+ n_trials = 0 + start_time = new_time + while n_trials < 10^4 && new_time - start_time < 10^8 + prev_time = time_ns() + $(esc(expression)) + new_time = time_ns() + best_time = min(best_time, new_time - prev_time) + n_trials += 1 + end + + best_time + end +end + +macro test_unrolled( + args_expr, + unrolled_expr, + reference_expr, + itr_contents_str, + skip_allocations_test = false, + skip_type_stability_test = false, +) @assert Meta.isexpr(args_expr, :tuple) arg_names = args_expr.args @assert all(arg_name -> arg_name isa Symbol, arg_names) args = map(esc, arg_names) unrolled_expr_str = simplified_expression_string(unrolled_expr) reference_expr_str = simplified_expression_string(reference_expr) - expr_info_str = - length(args) == 1 ? "$unrolled_expr_str with 1 iterator that contains" : - "$unrolled_expr_str with $(length(args)) iterators that each contain" + contains_str = length(args) == 1 ? " that contains" : "s that each contain" quote - @info "Testing $($expr_info_str) $($(esc(contents_info_str)))" + itr_types = map(arg -> typeof(arg).name.wrapper, ($(args...),)) + itr_lengths = map(length, ($(args...),)) + + itr_type_str = + length(unique(itr_types)) == 1 ? string(itr_types[1]) : + join(itr_types, '/') + itr_length_str = + length(unique(itr_lengths)) == 1 ? string(itr_lengths[1]) : + join(itr_lengths, '/') + itr_str = + $(isempty(args)) ? "nothing" : + "$($(length(args))) $itr_type_str$($contains_str) $itr_length_str \ + $($(esc(itr_contents_str)))" + + @info "Testing $($unrolled_expr_str) with $itr_str" unrolled_func($(arg_names...)) = $(esc(unrolled_expr)) reference_func($(arg_names...)) = $(esc(reference_expr)) @@ -146,26 +269,27 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) reference_func_and_nothing($(args...)) # Test for allocations. 
- @test (@allocated unrolled_func_and_nothing($(args...))) == 0 - is_reference_non_allocating = - (@allocated reference_func_and_nothing($(args...))) == 0 + unrolled_run_memory = @allocated unrolled_func_and_nothing($(args...)) + reference_run_memory = @allocated reference_func_and_nothing($(args...)) + $(esc(skip_allocations_test)) || @test unrolled_run_memory == 0 # Test for type-stability. - @test_opt unrolled_func($(args...)) + is_unrolled_stable = + isempty(JET.get_reports(@report_opt unrolled_func($(args...)))) is_reference_stable = isempty(JET.get_reports(@report_opt reference_func($(args...)))) - - unrolled_instance = code_instance(unrolled_func, $(args...)) - reference_instance = code_instance(reference_func, $(args...)) + $(esc(skip_type_stability_test)) || @test_opt unrolled_func($(args...)) # Test for constant propagation. - is_unrolled_const = isdefined(unrolled_instance, :rettype_const) - Base.issingletontype(typeof(($(args...),))) && @test is_unrolled_const - is_reference_const = isdefined(reference_instance, :rettype_const) + is_unrolled_const = + isdefined(code_instance(unrolled_func, $(args...)), :rettype_const) + is_reference_const = + isdefined(code_instance(reference_func, $(args...)), :rettype_const) + # Base.issingletontype(typeof(($(args...),))) && @test is_unrolled_const buffer = IOBuffer() - # Check whether the functions are fully optimized out. + # Determine whether the functions are fully optimized out. args_type = Tuple{map(typeof, ($(args...),))...} code_llvm(buffer, unrolled_func, args_type; debuginfo = :none) is_unrolled_optimized_out = @@ -174,86 +298,115 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) is_reference_optimized_out = length(split(String(take!(buffer)), '\n')) == 5 + # Test the overall level of optimization. 
+ unrolled_opt_str, unrolled_opt_score = if unrolled_run_memory > 0 + "$(memory_string(unrolled_run_memory)) allocs", 1 / unrolled_run_memory + elseif !is_unrolled_stable + "type-unstable", 2 + elseif !is_unrolled_const && !is_unrolled_optimized_out + "type-stable", 3 + elseif !is_unrolled_optimized_out + "constant", 4 + else + "optimized out", 5 + end + reference_opt_str, reference_opt_score = if reference_run_memory > 0 + "$(memory_string(reference_run_memory)) allocs", + 1 / reference_run_memory + elseif !is_reference_stable + "type-unstable", 2 + elseif !is_reference_const && !is_reference_optimized_out + "type-stable", 3 + elseif !is_reference_optimized_out + "constant", 4 + else + "optimized out", 5 + end + $(esc(skip_type_stability_test)) || + @test unrolled_opt_score >= reference_opt_score + + # Measure the run times. + unrolled_run_time = @benchmark unrolled_func($(args...)) + reference_run_time = @benchmark reference_func($(args...)) + + # Measure the compilation times and memory allocations in separate + # processes to ensure that they are not under-counted. 
arg_name_strs = ($(map(string, arg_names)...),) arg_names_str = join(arg_name_strs, ", ") arg_definition_strs = map((name, value) -> "$name = $value", arg_name_strs, ($(args...),)) arg_definitions_str = join(arg_definition_strs, '\n') - unrolled_command_str = """ + command_str(func_str) = """ using UnrolledUtilities - unrolled_func($arg_names_str) = $($(string(unrolled_expr))) - $arg_definitions_str - stats1 = @timed unrolled_func($arg_names_str) - stats2 = @timed unrolled_func($arg_names_str) - print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes) - """ - reference_command_str = """ - reference_func($arg_names_str) = $($(string(reference_expr))) $arg_definitions_str - stats1 = @timed reference_func($arg_names_str) - stats2 = @timed reference_func($arg_names_str) - print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes) + Base.cumulative_compile_timing(true) + nanoseconds1 = Base.cumulative_compile_time_ns()[1] + rss_bytes_1 = Sys.maxrss() + Δgc_bytes = @allocated $func_str + rss_bytes_2 = Sys.maxrss() + nanoseconds2 = Base.cumulative_compile_time_ns()[1] + Base.cumulative_compile_timing(false) + Δnanoseconds = nanoseconds2 - nanoseconds1 + Δrss_bytes = rss_bytes_2 - rss_bytes_1 + print(Δnanoseconds, ", ", Δgc_bytes, ", ", Δrss_bytes) """ - # Get the unrolled function's time-to-first-run and its memory usage. 
+ unrolled_command_str = command_str($(string(unrolled_expr))) run(pipeline(`julia --project -e $unrolled_command_str`, buffer)) - unrolled_time, unrolled_memory = - parse.((Float64, Int), split(String(take!(buffer)), ',')) + unrolled_compile_time, unrolled_total_memory, unrolled_total_rss = + parse.((Int, Int, Int), split(String(take!(buffer)), ',')) # Make a new buffer to avoid a potential data race: - # https://discourse.julialang.org/t/iobuffer-becomes-not-writable-after-run/92323/3 + # discourse.julialang.org/t/iobuffer-becomes-not-writable-after-run/92323/3 close(buffer) buffer = IOBuffer() - # Get the reference function's time-to-first-run and its memory usage. + reference_command_str = command_str($(string(reference_expr))) run(pipeline(`julia --project -e $reference_command_str`, buffer)) - reference_time, reference_memory = - parse.((Float64, Int), split(String(take!(buffer)), ',')) + reference_compile_time, reference_total_memory, reference_total_rss = + parse.((Int, Int, Int), split(String(take!(buffer)), ',')) close(buffer) - # Record all relevant information in comparison_table_dict. 
- unrolled_performance_str = if !is_unrolled_const - "type-stable" - elseif !is_unrolled_optimized_out - "const return value" - else - "fully optimized out" - end - reference_performance_str = if !is_reference_non_allocating - "allocating" - elseif !is_reference_stable - "type-unstable" - elseif !is_reference_const - "type-stable" - elseif !is_reference_optimized_out - "const return value" - else - "fully optimized out" - end - time_ratio = unrolled_time / reference_time - time_ratio_str = if time_ratio >= 1.5 - "$(round(Int, time_ratio)) times slower" - elseif inv(time_ratio) >= 1.5 - "$(round(Int, inv(time_ratio))) times faster" - else - "similar" - end - memory_ratio = unrolled_memory / reference_memory - memory_ratio_str = if memory_ratio >= 1.5 - "$(round(Int, memory_ratio)) times more" - elseif inv(memory_ratio) >= 1.5 - "$(round(Int, inv(memory_ratio))) times less" + optimization_str = if unrolled_opt_score > reference_opt_score + if unrolled_opt_score <= 1 + "fewer allocs ($unrolled_opt_str vs. $reference_opt_str)" + else + "better ($unrolled_opt_str vs. $reference_opt_str)" + end + elseif unrolled_opt_score < reference_opt_score + "worse ($unrolled_opt_str vs. $reference_opt_str)" else - "similar" + "identical ($unrolled_opt_str)" end + run_time_str = comparison_string( + unrolled_run_time, + reference_run_time, + time_string, + ) + compile_time_str = comparison_string( + unrolled_compile_time, + reference_compile_time, + time_string, + ) + memory_str = comparison_string( + (unrolled_total_memory, unrolled_total_rss), + (reference_total_memory, reference_total_rss), + ((gc_bytes, rss_bytes),) -> + rss_bytes == 0 ? memory_string(gc_bytes) : + "$(memory_string(gc_bytes)) [$(memory_string(rss_bytes))]", + first, # Use GC value for comparison since RSS might be unavailable. 
+ ) + dict_key = ($unrolled_expr_str, $reference_expr_str) dict_entry = ( - $(esc(contents_info_str)), - unrolled_performance_str, - reference_performance_str, - time_ratio_str, - memory_ratio_str, + itr_type_str, + itr_length_str, + $(esc(itr_contents_str)), + optimization_str, + run_time_str, + compile_time_str, + memory_str, ) if dict_key in keys(comparison_table_dict) push!(comparison_table_dict[dict_key], dict_entry) @@ -263,160 +416,219 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) end end -@testset "empty iterators" begin - itr = () - str = "nothing" - @test_unrolled (itr,) unrolled_any(error, itr) any(error, itr) str - @test_unrolled (itr,) unrolled_all(error, itr) all(error, itr) str - @test_unrolled (itr,) unrolled_foreach(error, itr) foreach(error, itr) str - @test_unrolled (itr,) unrolled_map(error, itr, itr) map(error, itr, itr) str - @test_unrolled( - (itr,), - unrolled_reduce(error, itr; init = 0), - reduce(error, itr; init = 0), - str, - ) +tuple_of_tuples(num_tuples, min_tuple_length, singleton, identical) = + ntuple(num_tuples) do index + tuple_length = min_tuple_length + (identical ? 0 : (index - 1) % 7) + ntuple(singleton ? Val : identity, tuple_length) + end +function tuples_of_tuples_contents_str(itrs...) + str = "" + all(itr -> length(itr) > 1 && length(unique(itr)) == 1, itrs) && + (str *= "identical ") + all(itr -> length(itr) > 1 && length(unique(itr)) != 1, itrs) && + (str *= "distinct ") + all(itr -> all(isempty, itr), itrs) && (str *= "empty ") + all(itr -> all(!isempty, itr), itrs) && (str *= "nonempty ") + all(itr -> any(isempty, itr) && any(!isempty, itr), itrs) && + (str *= "empty & nonempty ") + all(itr -> Base.issingletontype(typeof(itr)), itrs) && (str *= "singleton ") + all(itr -> !Base.issingletontype(typeof(itr)), itrs) && + (str *= "non-singleton ") + str *= "Tuple" + all(itr -> length(itr) > 1, itrs) && (str *= "s") + return str end -for n in (1, 8, 32, 33, 128), identical in (n == 1 ? 
(true,) : (true, false)) - itr1 = ntuple(i -> ntuple(Val, identical ? 0 : (i - 1) % 7), n) - itr2 = ntuple(i -> ntuple(Val, identical ? 1 : (i - 1) % 7 + 1), n) - itr3 = ntuple(i -> ntuple(identity, identical ? 1 : (i - 1) % 7 + 1), n) - if n == 1 - str1 = "1 empty tuple" - str2 = "1 nonempty singleton tuple" - str3 = "1 nonempty non-singleton tuple" - str12 = "1 singleton tuple" - str23 = "1 nonempty tuple" - str123 = "1 tuple" - elseif identical - str1 = "$n empty tuples" - str2 = "$n identical nonempty singleton tuples" - str3 = "$n identical nonempty non-singleton tuples" - str12 = "$n identical singleton tuples" - str23 = "$n identical nonempty tuples" - str123 = "$n identical tuples" - else - str1 = "$n empty and nonempty singleton tuples" - str2 = "$n nonempty singleton tuples" - str3 = "$n nonempty non-singleton tuples" - str12 = "$n singleton tuples" - str23 = "$n nonempty tuples" - str123 = "$n tuples" - end - @testset "iterators of $str123" begin - for (itr, str) in ((itr1, str1), (itr2, str2), (itr3, str3)) - @test_unrolled (itr,) unrolled_any(isempty, itr) any(isempty, itr) str - @test_unrolled (itr,) unrolled_any(!isempty, itr) any(!isempty, itr) str +# NOTE: In the tests below, random numbers are meant to emulate values that +# cannot be inferred during compilation. 
+ +title = "Isolated Unrolled Functions" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +for itr in ( + tuple_of_tuples(1, 0, true, true), + tuple_of_tuples(1, 1, true, true), + tuple_of_tuples(1, 1, false, true), + map(n -> tuple_of_tuples(n, 0, true, true), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, true, true), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, false, true), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 0, true, false), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, true, false), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, false, false), (8, 32, 33, 128))..., +) + str = tuples_of_tuples_contents_str(itr) + itr_description = "a Tuple that contains $(length(itr)) $str" + @testset "individual unrolled functions of $itr_description" begin + @test_unrolled (itr,) unrolled_any(isempty, itr) any(isempty, itr) str + @test_unrolled( + (itr,), + unrolled_any(x -> length(x) == rand(8:10), itr), + any(x -> length(x) == rand(8:10), itr), + str, + ) - @test_unrolled (itr,) unrolled_all(isempty, itr) all(isempty, itr) str - @test_unrolled (itr,) unrolled_all(!isempty, itr) all(!isempty, itr) str + @test_unrolled (itr,) unrolled_all(isempty, itr) all(isempty, itr) str + @test_unrolled( + (itr,), + unrolled_all(x -> length(x) == rand(8:10), itr), + all(x -> length(x) == rand(8:10), itr), + str, + ) - @test_unrolled( - (itr,), - unrolled_foreach(x -> @assert(length(x) <= 7), itr), - foreach(x -> @assert(length(x) <= 7), itr), - str, - ) + @test_unrolled( + (itr,), + unrolled_foreach(x -> @assert(length(x) <= 7), itr), + foreach(x -> @assert(length(x) <= 7), itr), + str, + ) - @test_unrolled (itr,) unrolled_map(length, itr) map(length, itr) str + @test_unrolled (itr,) unrolled_map(length, itr) map(length, itr) str - @test_unrolled (itr,) unrolled_reduce(tuple, itr) reduce(tuple, itr) str - @test_unrolled( - (itr,), - unrolled_reduce(tuple, itr; init = ()), - reduce(tuple, itr; init = ()), - str, 
- ) + @test_unrolled( + (itr,), + unrolled_applyat(length, rand(1:7:length(itr)), itr), + length(itr[rand(1:7:length(itr))]), + str, + ) + + @test_unrolled (itr,) unrolled_reduce(tuple, itr) reduce(tuple, itr) str + @test_unrolled( + (itr,), + unrolled_reduce(tuple, itr; init = ()), + reduce(tuple, itr; init = ()), + str, + ) + + @test_unrolled( + (itr,), + unrolled_mapreduce(length, +, itr), + mapreduce(length, +, itr), + str, + ) + @test_unrolled( + (itr,), + unrolled_mapreduce(length, +, itr; init = 0), + mapreduce(length, +, itr; init = 0), + str, + ) + if length(itr) <= 33 @test_unrolled( (itr,), - unrolled_mapreduce(length, +, itr), - mapreduce(length, +, itr), + unrolled_accumulate(tuple, itr), + accumulate(tuple, itr), str, ) @test_unrolled( (itr,), - unrolled_mapreduce(length, +, itr; init = 0), - mapreduce(length, +, itr; init = 0), + unrolled_accumulate(tuple, itr; init = ()), + accumulate(tuple, itr; init = ()), str, ) + end # These can take half a minute to compile when the length is 128. - @test_unrolled (itr,) unrolled_zip(itr) Tuple(zip(itr)) str + @test_unrolled (itr,) unrolled_push(itr, itr[1]) (itr..., itr[1]) str + @test_unrolled (itr,) unrolled_append(itr, itr) (itr..., itr...) 
str - @test_unrolled (itr,) unrolled_enumerate(itr) Tuple(enumerate(itr)) str + @test_unrolled( + (itr,), + unrolled_take(itr, Val(length(itr) ÷ 2)), + itr[1:(length(itr) ÷ 2)], + str, + ) + @test_unrolled( + (itr,), + unrolled_drop(itr, Val(length(itr) ÷ 2)), + itr[(length(itr) ÷ 2 + 1):end], + str, + ) - @test_unrolled (itr,) unrolled_in(nothing, itr) (nothing in itr) str - @test_unrolled (itr,) unrolled_in(itr[1], itr) (itr[1] in itr) str - @test_unrolled (itr,) unrolled_in(itr[end], itr) (itr[end] in itr) str + @test_unrolled (itr,) unrolled_in(nothing, itr) (nothing in itr) str + @test_unrolled (itr,) unrolled_in(itr[1], itr) (itr[1] in itr) str + @test_unrolled (itr,) unrolled_in(itr[end], itr) (itr[end] in itr) str - # unrolled_unique is only type-stable for singletons - if Base.issingletontype(typeof(itr)) - @test_unrolled (itr,) unrolled_unique(itr) Tuple(unique(itr)) str - end + @test_unrolled( + (itr,), + unrolled_unique(itr), + Tuple(unique(itr)), + str, + !Base.issingletontype(typeof(itr)), + !Base.issingletontype(typeof(itr)), + ) # unrolled_unique is type-unstable for non-singleton values - @test_unrolled( - (itr,), - unrolled_filter(!isempty, itr), - filter(!isempty, itr), - str, - ) + @test_unrolled( + (itr,), + unrolled_filter(!isempty, itr), + filter(!isempty, itr), + str, + ) - @test_unrolled( - (itr,), - unrolled_split(isempty, itr), - (filter(isempty, itr), filter(!isempty, itr)), - str, - ) + @test_unrolled( + (itr,), + unrolled_split(isempty, itr), + (filter(isempty, itr), filter(!isempty, itr)), + str, + ) - @test_unrolled( - (itr,), - unrolled_flatten(itr), - Tuple(Iterators.flatten(itr)), - str, - ) + @test_unrolled( + (itr,), + unrolled_flatten(itr), + Tuple(Iterators.flatten(itr)), + str, + ) - @test_unrolled( - (itr,), - unrolled_flatmap(reverse, itr), - Tuple(Iterators.flatmap(reverse, itr)), - str, - ) + @test_unrolled( + (itr,), + unrolled_flatmap(reverse, itr), + Tuple(Iterators.flatmap(reverse, itr)), + str, + ) + if length(itr) 
<= 33 @test_unrolled( (itr,), - unrolled_product(itr), - Tuple(Iterators.product(itr)), + unrolled_product(itr, itr), + Tuple(Iterators.product(itr, itr)), str, ) - + end + if length(itr) <= 8 @test_unrolled( (itr,), - unrolled_applyat( - x -> @assert(length(x) <= 7), - rand(1:length(itr)), - itr, - ), - @assert(length(itr[rand(1:length(itr))]) <= 7), + unrolled_product(itr, itr, itr), + Tuple(Iterators.product(itr, itr, itr)), str, ) + end # This can take several minutes to compile when the length is 32. + end +end - if n > 1 - @test_unrolled( - (itr,), - unrolled_take(itr, Val(7)), - itr[1:7], - str, - ) - @test_unrolled( - (itr,), - unrolled_drop(itr, Val(7)), - itr[8:end], - str, - ) - end - end - +title = "Nested Unrolled Functions" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +for (itr1, itr2, itr3) in ( + ( + tuple_of_tuples(1, 0, true, true), + tuple_of_tuples(1, 1, true, true), + tuple_of_tuples(1, 1, false, true), + ), + zip( + map(n -> tuple_of_tuples(n, 0, true, true), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, true, true), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, false, true), (8, 32, 33, 128)), + )..., + zip( + map(n -> tuple_of_tuples(n, 0, true, false), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, true, false), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, false, false), (8, 32, 33, 128)), + )..., +) + str3 = tuples_of_tuples_contents_str(itr3) + str12 = tuples_of_tuples_contents_str(itr1, itr2) + str23 = tuples_of_tuples_contents_str(itr2, itr3) + str123 = tuples_of_tuples_contents_str(itr1, itr2, itr3) + itr_description = "Tuples that contain $(length(itr1)) $str123" + @testset "nested unrolled functions of $itr_description" begin @test_unrolled( (itr3,), unrolled_any(x -> unrolled_reduce(+, x) > 7, itr3), @@ -434,11 +646,11 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? 
(true,) : (true, false)) @test_unrolled( (itr1, itr2), unrolled_foreach( - (x1, x2) -> @assert(length(x1) < length(x2)), + (x1, x2) -> @assert(x1 == unrolled_take(x2, Val(length(x1)))), itr1, itr2, ), - foreach((x1, x2) -> @assert(length(x1) < length(x2)), itr1, itr2), + foreach((x1, x2) -> @assert(x1 == x2[1:length(x1)]), itr1, itr2), str12, ) @test_unrolled( @@ -455,13 +667,13 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false)) @test_unrolled( (itr1, itr2), unrolled_applyat( - (x1, x2) -> @assert(length(x1) < length(x2)), + (x1, x2) -> @assert(x1 == unrolled_take(x2, Val(length(x1)))), rand(1:length(itr1)), itr1, itr2, ), let n = rand(1:length(itr1)) - @assert(length(itr1[n]) < length(itr2[n])) + @assert(itr1[n] == itr2[n][1:length(itr1[n])]) end, str12, ) @@ -478,53 +690,27 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false)) end, str23, ) - - @test_unrolled( - (itr1, itr2), - unrolled_zip(itr1, itr2), - Tuple(zip(itr1, itr2)), - str12, - ) - @test_unrolled( - (itr1, itr2, itr3), - unrolled_zip(itr1, itr2, itr3), - Tuple(zip(itr1, itr2, itr3)), - str123, - ) - - # unrolled_product can take several minutes to compile when n is large - if n <= 33 - @test_unrolled( - (itr1, itr2), - unrolled_product(itr1, itr2), - Tuple(Iterators.product(itr1, itr2)), - str12, - ) - end - if n <= 8 - @test_unrolled( - (itr1, itr2, itr3), - unrolled_product(itr1, itr2, itr3), - Tuple(Iterators.product(itr1, itr2, itr3)), - str123, - ) - end end end nested_iterator(depth, n, inner_n) = depth == 1 ? 
ntuple(identity, n) : - ntuple(inner_n) do _ - nested_iterator(depth - 1, Int(n / inner_n), inner_n) - end + ntuple( + Returns(nested_iterator(depth - 1, Int(n / inner_n), inner_n)), + inner_n, + ) + +title = "Recursive Unrolled Functions" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) for n in (8, 32, 128) - @testset "iterators of $n values in nested tuples" begin + itr_description = "a Tuple that contains $n values in nested Tuples" + @testset "recursive unrolled functions of $itr_description" begin for depth in (2, 3, 4:2:(Int(log2(n)) + 1)...) itr = nested_iterator(depth, n, 2) - str = "$n values in nested tuples of depth $depth" + str = "$itr_description of depth $depth" # In the following definitions, use var"#self#" to avoid boxing: - # https://discourse.julialang.org/t/performant-recursive-anonymous-functions/90984/5 + # discourse.julialang.org/t/performant-recursive-anonymous-functions/90984/5 @test_unrolled( (itr,), map( @@ -561,3 +747,254 @@ for n in (8, 32, 128) end end end + +title = "Nested Unrolled Closures" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +@testset "nested unrolled closures of Tuples vs. 
StaticBitVectors" begin + for (itr, skip_allocations_test) in ( + (ntuple(Returns(true), 32), false), + (ntuple(Returns(true), 33), true), + (StaticBitVector{256}(true), false), + (StaticBitVector{257}(true), true), + ) + @test_unrolled( + (itr,), + unrolled_reduce( + (itr′, i) -> Base.setindex(itr′, !itr′[i], i), + StaticOneTo(length(itr)); + init = itr, + ), + reduce( + (itr′, i) -> Base.setindex(itr′, !itr′[i], i), + StaticOneTo(length(itr)); + init = itr, + ), + "Bools", + skip_allocations_test, + ) + @test_unrolled( + (itr,), + unrolled_reduce( + (itr′, i) -> unrolled_reduce( + (itr′′, j) -> + Base.setindex(itr′′, !itr′′[min(i, j)], j), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + reduce( + (itr′, i) -> reduce( + (itr′′, j) -> + Base.setindex(itr′′, !itr′′[min(i, j)], j), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + "Bools", + skip_allocations_test, + ) + if length(itr) <= 256 + @test_unrolled( + (itr,), + unrolled_reduce( + (itr′, i) -> unrolled_reduce( + (itr′′, j) -> unrolled_reduce( + (itr′′′, k) -> Base.setindex( + itr′′′, + !itr′′′[min(i, j, k)], + k, + ), + StaticOneTo(length(itr′′)); + init = itr′′, + ), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + reduce( + (itr′, i) -> reduce( + (itr′′, j) -> reduce( + (itr′′′, k) -> Base.setindex( + itr′′′, + !itr′′′[min(i, j, k)], + k, + ), + StaticOneTo(length(itr′′)); + init = itr′′, + ), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + "Bools", + skip_allocations_test, + ) + end # The StaticBitVector{257} allocates over 2 GB for this test. 
+ end +end + +title = "Empty Iterators" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +@testset "unrolled functions of an empty Tuple" begin + itr = () + str = "nothing" + @test_unrolled (itr,) unrolled_any(error, itr) any(error, itr) str + @test_unrolled (itr,) unrolled_all(error, itr) all(error, itr) str + @test_unrolled (itr,) unrolled_foreach(error, itr) foreach(error, itr) str + @test_unrolled (itr,) unrolled_map(error, itr) map(error, itr) str + @test_throws "init" unrolled_reduce(error, itr) + @test_unrolled( + (itr,), + unrolled_reduce(error, itr; init = 0), + reduce(error, itr; init = 0), + str, + ) + @test_unrolled( + (itr,), + unrolled_accumulate(error, itr), + accumulate(error, itr), + str, + ) + @test_unrolled( + (itr,), + unrolled_accumulate(error, itr; init = 0), + accumulate(error, itr; init = 0), + str, + ) +end + +title = "Very Long Iterators" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +@testset "unrolled functions of Tuples vs. StaticOneTos" begin + for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(8186)) + @test_unrolled (itr,) unrolled_reduce(+, itr) reduce(+, itr) "Ints" + @test_unrolled( + (itr,), + unrolled_mapreduce(log, +, itr), + mapreduce(log, +, itr), + "Ints", + ) + end # These can each take 40 seconds to compile for ntuple(identity, 8186). + for itr in (ntuple(identity, 8187), StaticOneTo(8187)) + @test_throws "gc handles" unrolled_reduce(+, itr) + @test_throws "gc handles" unrolled_mapreduce(log, +, itr) + end + # TODO: Why does the compiler throw an error when generating functions that + # get unrolled into more than 8186 lines of LLVM code? + + for itr in (StaticOneTo(8186), StaticOneTo(8187)) + @test_unrolled( + (itr,), + unrolled_reduce(+, Val(length(itr))), + reduce(+, itr), + "Ints", + ) + end + @test_throws "gc handles" unrolled_reduce(+, Val(8188)) + # TODO: Why is the limit 8187 for the Val version of unrolled_reduce? 
+end + +title = "Generative vs. Recursive Unrolling" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +for itr in ( + tuple_of_tuples(1, 0, true, true), + tuple_of_tuples(1, 1, true, true), + tuple_of_tuples(1, 1, false, true), + map(n -> tuple_of_tuples(n, 0, true, true), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, true, true), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, false, true), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 0, true, false), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, true, false), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, false, false), (8, 16, 32, 33, 128, 256))..., +) + str = tuples_of_tuples_contents_str(itr) + itr_description = "a Tuple that contains $(length(itr)) $str" + @testset "generative vs. recursive unrolling of $itr_description" begin + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_any(isempty, itr), + UnrolledUtilities.rec_unrolled_any(isempty, itr), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_all(isempty, itr), + UnrolledUtilities.rec_unrolled_all(isempty, itr), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_foreach( + x -> @assert(length(x) <= 7), + itr, + ), + UnrolledUtilities.rec_unrolled_foreach( + x -> @assert(length(x) <= 7), + itr, + ), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_map(length, itr), + UnrolledUtilities.rec_unrolled_map(length, itr), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_applyat( + length, + rand(1:7:length(itr)), + itr, + ), + UnrolledUtilities.rec_unrolled_applyat( + length, + rand(1:7:length(itr)), + itr, + ), + str, + ) + + if length(itr) <= 33 + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_reduce(tuple, itr, ()), + UnrolledUtilities.rec_unrolled_reduce(tuple, itr, ()), + str, + ) + + @test_unrolled( + (itr,), + 
UnrolledUtilities.gen_unrolled_accumulate( + tuple, + itr, + (), + identity, + ), + UnrolledUtilities.rec_unrolled_accumulate( + tuple, + itr, + (), + identity, + ), + str, + ) + end # These can take over a minute to compile when the length is 128. + end +end