diff --git a/Project.toml b/Project.toml
index 62f5f0a..646c778 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,6 +5,13 @@ version = "0.1.2"
 
 [compat]
 julia = "1.10"
+StaticArrays = "1"
+
+[weakdeps]
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[extensions]
+UnrolledUtilitiesStaticArraysExt = "StaticArrays"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/docs/src/index.md b/docs/src/index.md
index 8aaec38..10ac6bc 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,16 +1,26 @@
+```@meta
+CurrentModule = UnrolledUtilities
+```
+
 # UnrolledUtilities.jl
 
-A collection of generated functions in which all loops are unrolled and inlined:
+## Unrolled Functions
+
+This package exports the following functions, in which all loops are unrolled
+and inlined:
 - `unrolled_any(f, itr)`: similar to `any`
 - `unrolled_all(f, itr)`: similar to `all`
 - `unrolled_foreach(f, itrs...)`: similar to `foreach`
 - `unrolled_map(f, itrs...)`: similar to `map`
 - `unrolled_reduce(op, itr; [init])`: similar to `reduce`
 - `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce`
-- `unrolled_zip(itrs...)`: similar to `zip`
-- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
-  handle multiple iterators
+- `unrolled_accumulate(op, itr; [init], [transform])`: similar to `accumulate`,
+  but with an optional `transform` function applied to every accumulated value
+- `unrolled_mapaccumulate(f, op, itrs...; [init], [transform])`: a combination
+  of `unrolled_map` and `unrolled_accumulate`, analogous to `unrolled_mapreduce`
 - `unrolled_in(item, itr)`: similar to `in`
+- `unrolled_push(itr, item)`: similar to `push!`, but non-mutating
+- `unrolled_append(itr1, itr2)`: similar to `append!`, but non-mutating
 - `unrolled_unique(itr)`: similar to `unique`
 - `unrolled_filter(f, itr)`: similar to `filter`
 - `unrolled_split(f, itr)`: similar to `(filter(f, itr), filter(!f, itr))`, but
@@ -42,33 +52,77 @@ iterators have singleton element types (and when the result of calling `f`
 and/or `op` on these elements is inferrable). However, they can also be much
 more expensive to compile than their counterparts from `Base` and
 `Base.Iterators`, in which case they should not be used unless there is a clear
-performance benefit. Some notable exceptions to this are `unrolled_zip`,
-`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than
-`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation.
+performance benefit. Two notable exceptions to this are `unrolled_take` and
+`unrolled_drop`, which are faster to compile than `Iterators.take` and
+`Iterators.drop`.
+
+## Interface
+
+These functions can be used to unroll loops over all iterators with statically
+inferrable lengths.
+Compatibility with any such iterator type can be added through the following
+interface:
+
+```@docs
+length_from_type
+target_output_type
+target_output_type_for_promotion
+output_promote_rule
+eltype_restriction
+output_constructor
+empty_output
+```
+
+## Lazy and Low-Storage Iterators
+
+The interface above is used to provide built-in compatibility with `Tuple`s and
+`SVector`s (from [StaticArrays.jl](https://github.com/JuliaArrays/StaticArrays.jl)),
+and also with subtypes of `StaticSequence`:
+
+```@docs
+StaticSequence
+LazyMap
+LazySequence
+BitSequence
+```
+
+This package also exports several "lazy" functions that generate `LazyMap`s:
+- `lazy_map(f, itrs...)`: similar to `map`
+- `lazy_zip(itrs...)`: similar to `zip`
+- `lazy_enumerate(itrs...)`: similar to `enumerate`, but with the ability to
+  handle multiple iterators
+
+When used in conjunction with a `LazySequence` or `BitSequence`, these functions
+can result in significantly lower register pressure than `unrolled_map` or
+similarly unrolled versions of `zip` and `enumerate`.
+
+## When to Unroll
 
 For a more precise indication of whether you should use `UnrolledUtilities`,
 please consult the autogenerated [Comparison Table](@ref). This table contains a
-comprehensive set of potential use cases, each with a measurement of performance
-optimization, the time required for compilation, and the memory usage during
-compilation. Most cases involve simple functions `f` and/or `op`, but the last
-few demonstrate the benefits of unrolling with non-trivial recursive functions.
+comprehensive set of potential use cases, along with a few measurements that
+summarize their performance, compilation, and allocations:
+- overall level of optimization (type stability, constant propagation, etc.)
+- run time (best of several trial measurements)
+- compilation time (as reported by the compiler)
+- memory usage during compilation and first run (as reported by the garbage
+  collector and, when possible, the Julia process's resident set size estimator)
 
 The rows of the table are highlighted as follows:
-- green indicates an improvement in performance and either no change in
-  compilation or easier compilation (i.e., either similar or smaller values of
-  compilation time and memory usage)
-- dark blue indicates an improvement in performance and harder compilation
-  (i.e., larger values of compilation time and/or memory usage)
-- light blue indicates no change in performance and easier compilation
-- yellow indicates no change in performance and no change in compilation
-- magenta indicates no change in performance, an increase in compilation time,
-  and a decrease in compilation memory usage
-- red indicates no change in performance and harder compilation
+- green indicates an improvement in performance and either an improvement or
+  no change in compilation and allocations
+- dark blue indicates an improvement in performance and either slower
+  compilation or more allocations
+- light blue indicates no change in performance and either faster compilation or
+  fewer allocations
+- magenta indicates no change in performance and either faster compilation with
+  more allocations or slower compilation with fewer allocations
+- yellow indicates no change in performance, compilation, or allocations
+- red indicates a deterioration in performance, or no change in
+  performance and either slower compilation or more allocations
 
 Rows highlighted in green and blue present a clear advantage for unrolling,
-whereas those highlighted in yellow, magenta, and red either have no clear
-advantage, or they have a clear disadvantage. It is recommended that you only
-unroll when your use case is similar to a row in the first category.
+whereas those highlighted in magenta, yellow, and red either have no clear
+advantage or have a clear disadvantage. It is recommended that you only unroll
+when your use case is similar to a row in the first category.
 
 The table is also printed out by this package's unit tests, so these
 measurements can be compared across different operating systems by checking the
diff --git a/ext/UnrolledUtilitiesStaticArraysExt.jl b/ext/UnrolledUtilitiesStaticArraysExt.jl
new file mode 100644
index 0000000..09d2bf3
--- /dev/null
+++ b/ext/UnrolledUtilitiesStaticArraysExt.jl
@@ -0,0 +1,10 @@
+module UnrolledUtilitiesStaticArraysExt
+
+import UnrolledUtilities
+import StaticArrays: SVector
+
+UnrolledUtilities.length_from_type(::Type{<:SVector{N}}) where {N} = N
+UnrolledUtilities.target_output_type(::SVector) = SVector
+UnrolledUtilities.output_constructor(::Type{SVector}) = SVector
+
+end
diff --git a/src/BitSequence.jl b/src/BitSequence.jl
new file mode 100644
index 0000000..77076dd
--- /dev/null
+++ b/src/BitSequence.jl
@@ -0,0 +1,150 @@
+"""
+    BitSequence{N, [U]}(f)
+    BitSequence{N, [U]}([bit])
+
+A statically-sized analogue of `BitVector` with `Unsigned` chunks of type `U`,
+which can be constructed using either a function `f(n)` or a constant `bit`. By
+default, `U` is set to `UInt8` and `bit` is set to `false`.
+
+Efficient methods are provided for `unrolled_map`, `unrolled_accumulate`,
+`unrolled_take`, and `unrolled_drop`, though the methods for `unrolled_map` and
+`unrolled_accumulate` only apply when their outputs consist of `Bool`s. All
+other unrolled functions that need to construct non-empty iterators convert
+`BitSequence`s into `Tuple`s.
+"""
+struct BitSequence{N, U <: Unsigned, I <: NTuple{<:Any, U}} <: StaticSequence{N}
+    ints::I
+end
+BitSequence{N, U}(ints::I) where {N, U <: Unsigned, I <: NTuple{<:Any, U}} =
+    BitSequence{N, U, I}(ints)
+BitSequence{N}(args...) where {N} = BitSequence{N, UInt8}(args...)
+
+function BitSequence{N, U}(bit::Bool = false) where {N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    int = bit ? ~zero(U) : zero(U)
+    ints = ntuple(_ -> int, Val(n_ints))
+    return BitSequence{N, U}(ints)
+end
+
+function BitSequence{N, U}(f) where {N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = ntuple(Val(n_ints)) do int_index
+        @inline
+        first_index = n_bits_per_int * (int_index - 1) + 1
+        unrolled_reduce(
+            LazySequence{min(n_bits_per_int, N - first_index + 1)}(0);
+            init = zero(U),
+        ) do int, bit_offset
+            int | U(f(first_index + bit_offset)::Bool) << bit_offset
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+target_output_type(::BitSequence{<:Any, U}) where {U} = BitSequence{<:Any, U}
+
+output_promote_rule(::Type{B}, ::Type{O}) where {B <: BitSequence, O} = O
+output_promote_rule(::Type{B}, ::Type{Tuple}) where {B <: BitSequence} = Tuple
+output_promote_rule(::Type{B}, ::Type{LazySequence}) where {B <: BitSequence} =
+    B
+
+eltype_restriction(::Type{<:BitSequence}) = Bool
+
+empty_output(::Type{BitSequence{<:Any, U}}) where {U} = BitSequence{0, U}()
+
+@inline function unrolled_map_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    f,
+    itrs...,
+) where {U}
+    lazy_itr = lazy_map(f, itrs...)
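+    # Each mapped Bool is computed on demand via Base.Fix1(getindex, lazy_itr)
+    # while the integer chunks are filled in, so no intermediate Tuple of Bools
+    # needs to be materialized.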
+    N = inferred_length(lazy_itr)
+    return BitSequence{N, U}(Base.Fix1(getindex, lazy_itr))
+end
+
+@inline function unrolled_accumulate_into_target(
+    ::Type{BitSequence{<:Any, U}},
+    op,
+    itr,
+    init,
+    transform,
+) where {U}
+    N = inferred_length(itr)
+    (N == 0 && init isa NoInit) &&
+        error("unrolled_accumulate requires an init value for empty iterators")
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = unrolled_accumulate_into_tuple(
+        LazySequence{n_ints}();
+        init = (nothing, init),
+        transform = first,
+    ) do (_, init_value_for_new_int), int_index
+        @inline
+        first_index = n_bits_per_int * (int_index - 1) + 1
+        unrolled_reduce(
+            LazySequence{min(n_bits_per_int, N - first_index + 1)}(0);
+            init = (zero(U), init_value_for_new_int),
+        ) do (int, prev_value), bit_offset
+            item = itr[first_index + bit_offset]
+            new_value =
+                first_index + bit_offset == 1 && prev_value isa NoInit ?
+                item : op(prev_value, item)
+            (int | U(transform(new_value)::Bool) << bit_offset, new_value)
+        end
+    end
+    return BitSequence{N, U}(ints)
+end
+
+@inline function unrolled_take(
+    itr::BitSequence{<:Any, U},
+    ::Val{N},
+) where {N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N, n_bits_per_int)
+    ints = unrolled_take(itr.ints, Val(n_ints))
+    return BitSequence{N, U}(ints)
+end
+
+@inline function unrolled_drop(
+    itr::BitSequence{N_old, U},
+    ::Val{N},
+) where {N_old, N, U}
+    n_bits_per_int = 8 * sizeof(U)
+    n_ints = cld(N_old - N, n_bits_per_int)
+    n_dropped_ints = length(itr.ints) - n_ints
+    bit_offset = N - n_bits_per_int * n_dropped_ints
+    ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints))
+    ints = if bit_offset == 0
+        ints_without_offset
+    else
+        cur_ints = ints_without_offset
+        next_ints = unrolled_push(unrolled_drop(cur_ints, Val(1)), nothing)
+        unrolled_map_into_tuple(cur_ints, next_ints) do cur_int, next_int
+            isnothing(next_int) ? cur_int >> bit_offset :
+            cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset)
+        end
+    end
+    return BitSequence{N_old - N, U}(ints)
+end
+
+@inline function int_index_and_bit_offset(itr, n)
+    int_offset, bit_offset = divrem(n - 1, 8 * sizeof(eltype(itr.ints)))
+    return (int_offset + 1, bit_offset)
+end
+
+@inline function Base.getindex(itr::BitSequence, n::Integer)
+    int_index, bit_offset = int_index_and_bit_offset(itr, n)
+    int = itr.ints[int_index]
+    return Bool(int >> bit_offset & one(int))
+end
+
+@inline function Base.setindex(itr::BitSequence, bit::Bool, n::Integer)
+    int_index, bit_offset = int_index_and_bit_offset(itr, n)
+    int = itr.ints[int_index]
+    int′ = int & ~(one(int) << bit_offset) | typeof(int)(bit) << bit_offset
+    return typeof(itr)(Base.setindex(itr.ints, int′, int_index))
+end
+
+@inline Base.eltype(::BitSequence) = Bool
diff --git a/src/LazyMap.jl b/src/LazyMap.jl
new file mode 100644
index 0000000..83a82f4
--- /dev/null
+++ b/src/LazyMap.jl
@@ -0,0 +1,56 @@
+"""
+    LazyMap{N}(f, itrs...)
+
+A lazy and statically-sized analogue of a `Base.AbstractBroadcasted` object
+whose elements and `target_output_type` are consistent with
+`unrolled_map(f, itrs...)`.
+
+Efficient methods are provided for `unrolled_take` and `unrolled_drop`. All
+other unrolled functions that need to construct non-empty iterators convert
+`LazyMap`s into their `output_type`s.
+"""
+struct LazyMap{N, F, I} <: StaticSequence{N}
+    f::F
+    itrs::I
+end
+LazyMap{N}(f, itrs...) where {N} = LazyMap{N, typeof(f), typeof(itrs)}(f, itrs)
+
+target_output_type(itr::LazyMap) = output_type_of_map(itr.f, itr.itrs...)
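+
+# For example (a hypothetical session, not part of the package's tests), a
+# LazyMap computes its elements on demand instead of storing them:
+#     lazy_map(abs2, (1, 2, 3))[3] == 9
+#     lazy_zip((1, 2), (3, 4))[1] == (1, 3)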
+
+# Ignore eltype restrictions during the promotion process, until the final step.
+target_output_type_for_promotion(itr::LazyMap) =
+    promoted_target_output_type(itr.itrs)
+
+@inline unrolled_fix2(f, arg, itrs) =
+    unrolled_map_into_tuple((@inline itr -> f(itr, arg)), itrs)
+
+@inline unrolled_take(itr::LazyMap, ::Val{N}) where {N} =
+    LazyMap{N}(itr.f, unrolled_fix2(unrolled_take, Val(N), itr.itrs)...)
+
+@inline unrolled_drop(itr::LazyMap{N_old}, ::Val{N}) where {N_old, N} =
+    LazyMap{N_old - N}(itr.f, unrolled_fix2(unrolled_drop, Val(N), itr.itrs)...)
+
+# Work around the recursion limit for getindex to handle chains of LazyMaps.
+@inline Base.getindex(itr::LazyMap, n::Integer) = lazy_map_getindex(itr, n)
+@inline lazy_map_getindex(itr, n) = getindex(itr, n)
+@inline lazy_map_getindex(itr::LazyMap, n) =
+    itr.f(unrolled_fix2(lazy_map_getindex, n, itr.itrs)...)
+@static if hasfield(Method, :recursion_relation)
+    for method in methods(lazy_map_getindex)
+        method.recursion_relation = (_...) -> true
+    end
+end
+
+@inline Base.eltype(itr::LazyMap) =
+    Base.promote_op(itr.f, unrolled_map_into_tuple(eltype, itr.itrs)...)
+
+################################################################################
+
+@inline lazy_map(f, itr) = LazyMap{inferred_length(itr)}(f, itr)
+@inline lazy_map(f, itrs...) = LazyMap{minimum_length(itrs)}(f, itrs...)
+# The first method avoids the recursion lazy_map → minimum_length → lazy_map.
+
+@inline lazy_zip(itrs...) = lazy_map(tuple, itrs...)
+
+@inline lazy_enumerate(itrs...) =
+    lazy_zip(LazySequence{minimum_length(itrs)}(), itrs...)
diff --git a/src/LazySequence.jl b/src/LazySequence.jl
new file mode 100644
index 0000000..9fa03cb
--- /dev/null
+++ b/src/LazySequence.jl
@@ -0,0 +1,34 @@
+"""
+    LazySequence{N}(f)
+    LazySequence{N}([start])
+
+A lazy analogue of `ntuple(f, Val(N))`, or a lazy and statically-sized analogue
+of `start:(start - 1 + N)`. By default, `start` is set to 1.
+
+Efficient methods are provided for `unrolled_take` and `unrolled_drop`. All
+other unrolled functions that need to construct non-empty iterators convert
+`LazySequence`s into `Tuple`s.
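+
+For example, `LazySequence{3}(10)` represents the sequence `(10, 11, 12)`
+without storing it, so `LazySequence{3}(10)[2] == 11`.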
+""" +struct LazySequence{N, F} <: StaticSequence{N} + f::F +end +LazySequence{N}(f = identity) where {N} = LazySequence{N, typeof(f)}(f) +LazySequence{N}(start::Number) where {N} = + LazySequence{N}(Base.Fix1(+, start - one(start))) + +target_output_type(::LazySequence) = LazySequence + +output_promote_rule(::Type{LazySequence}, ::Type{O}) where {O} = O +output_promote_rule(::Type{LazySequence}, ::Type{Tuple}) = Tuple + +empty_output(::Type{LazySequence}) = LazySequence{0}() + +@inline unrolled_take(itr::LazySequence, ::Val{N}) where {N} = + LazySequence{N}(itr.f) + +@inline unrolled_drop(itr::LazySequence{N_old}, ::Val{N}) where {N_old, N} = + LazySequence{N_old - N}(n -> itr.f(n + N)) + +@inline Base.getindex(itr::LazySequence, n::Integer) = itr.f(n) + +@inline Base.eltype(itr::LazySequence) = Base.promote_op(itr.f, Int) diff --git a/src/UnrolledUtilities.jl b/src/UnrolledUtilities.jl index dc69559..70d3c78 100644 --- a/src/UnrolledUtilities.jl +++ b/src/UnrolledUtilities.jl @@ -6,9 +6,11 @@ export unrolled_any, unrolled_map, unrolled_reduce, unrolled_mapreduce, - unrolled_zip, - unrolled_enumerate, + unrolled_accumulate, + unrolled_mapaccumulate, unrolled_in, + unrolled_push, + unrolled_append, unrolled_unique, unrolled_filter, unrolled_split, @@ -17,107 +19,140 @@ export unrolled_any, unrolled_product, unrolled_applyat, unrolled_take, - unrolled_drop + unrolled_drop, + lazy_map, + lazy_zip, + lazy_enumerate, + LazySequence, + BitSequence -inferred_length(::Type{<:NTuple{N, Any}}) where {N} = N -# We could also add support for statically-sized iterators that are not Tuples. +struct NoInit end # Analogue of Base._InitialValue for reduction/accumulation. -f_exprs(itr_type) = (:(f(itr[$n])) for n in 1:inferred_length(itr_type)) -@inline @generated unrolled_any(f, itr) = Expr(:||, f_exprs(itr)...) -@inline @generated unrolled_all(f, itr) = Expr(:&&, f_exprs(itr)...) +include("generated_functions.jl") +include("unrollable_interface.jl") -function zipped_f_exprs(itr_types) - L = length(itr_types) - L == 0 && error("unrolled functions need at least one iterator as input") - N = minimum(inferred_length, itr_types) - return (:(f($((:(itrs[$l][$n]) for l in 1:L)...))) for n in 1:N) -end -@inline @generated unrolled_foreach(f, itrs...) = - Expr(:block, zipped_f_exprs(itrs)..., nothing) -@inline @generated unrolled_map(f, itrs...) = - Expr(:tuple, zipped_f_exprs(itrs)...) - -function nested_op_expr(itr_type) - N = inferred_length(itr_type) - N == 0 && error("unrolled_reduce needs an `init` value for empty iterators") - item_exprs = (:(itr[$n]) for n in 1:N) - return reduce((expr1, expr2) -> :(op($expr1, $expr2)), item_exprs) -end -@inline @generated unrolled_reduce_without_init(op, itr) = nested_op_expr(itr) +@inline unrolled_map_into_target(output_type, f, itrs...) = + output_constructor(output_type)(unrolled_map_into_tuple(f, itrs...)) +@inline unrolled_map(f, itrs...) = + unrolled_map_into_target(output_type_of_map(f, itrs...), f, itrs...) -struct NoInit end @inline unrolled_reduce(op, itr; init = NoInit()) = - unrolled_reduce_without_init(op, init isa NoInit ? itr : (init, itr...)) - + unrolled_reduce(op, itr, init) @inline unrolled_mapreduce(f, op, itrs...; init = NoInit()) = - unrolled_reduce(op, unrolled_map(f, itrs...); init) + unrolled_reduce(op, lazy_map(f, itrs...), init) -@inline unrolled_zip(itrs...) = unrolled_map(tuple, itrs...) - -@inline unrolled_enumerate(itrs...) = - unrolled_zip(ntuple(identity, Val(length(itrs[1]))), itrs...) 
+@inline unrolled_accumulate_into_target(output_type, op, itr, init, transform) =
+    output_constructor(output_type)(
+        unrolled_accumulate_into_tuple(op, itr, init, transform),
+    )
+@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) =
+    unrolled_accumulate_into_target(
+        output_type_of_accumulate(op, itr, init, transform),
+        op,
+        itr,
+        init,
+        transform,
+    )
+@inline unrolled_mapaccumulate(f, op, itrs...; kwargs...) =
+    unrolled_accumulate(op, lazy_map(f, itrs...); kwargs...)
 
 @inline unrolled_in(item, itr) = unrolled_any(Base.Fix1(===, item), itr)
 # Using === instead of == or isequal improves type stability for singletons.
 
+@inline unrolled_push(itr, item) = inferred_constructor(itr)((itr..., item))
+
+@inline unrolled_append(itr1, itr2) =
+    promoted_constructor((itr1, itr2))((itr1..., itr2...))
+
 @inline unrolled_unique(itr) =
-    unrolled_reduce(itr; init = ()) do unique_items, item
+    unrolled_reduce(itr; init = inferred_empty(itr)) do unique_items, item
         @inline
-        unrolled_in(item, unique_items) ? unique_items : (unique_items..., item)
+        unrolled_in(item, unique_items) ? unique_items :
+        unrolled_push(unique_items, item)
     end
 
 @inline unrolled_filter(f, itr) =
-    unrolled_reduce(itr; init = ()) do filtered_items, item
+    unrolled_reduce(itr; init = inferred_empty(itr)) do filtered_items, item
         @inline
-        f(item) ? (filtered_items..., item) : filtered_items
+        f(item) ? unrolled_push(filtered_items, item) : filtered_items
     end
 
-@inline unrolled_split(f, itr) =
-    unrolled_reduce(itr; init = ((), ())) do (f_items, not_f_items), item
+@inline function unrolled_split(f, itr)
+    init = (inferred_empty(itr), inferred_empty(itr))
+    return unrolled_reduce(itr; init) do (f_items, not_f_items), item
         @inline
-        f(item) ? ((f_items..., item), not_f_items) :
-        (f_items, (not_f_items..., item))
+        f(item) ? (unrolled_push(f_items, item), not_f_items) :
+        (f_items, unrolled_push(not_f_items, item))
     end
+end
 
 @inline unrolled_flatten(itr) =
-    unrolled_reduce((item1, item2) -> (item1..., item2...), itr; init = ())
+    unrolled_reduce(unrolled_append, itr; init = promoted_empty(itr))
 
-@inline unrolled_flatmap(f, itrs...) =
-    unrolled_flatten(unrolled_map(f, itrs...))
+@inline unrolled_flatmap(f, itrs...) = unrolled_flatten(lazy_map(f, itrs...))
 
 @inline unrolled_product(itrs...) =
-    unrolled_reduce(itrs; init = ((),)) do product_itr, itr
+    unrolled_reduce(itrs; init = (promoted_empty(itrs),)) do product_itr, itr
         @inline
         unrolled_flatmap(itr) do item
             @inline
-            unrolled_map(product_tuple -> (product_tuple..., item), product_itr)
+            unrolled_map_into_tuple(Base.Fix2(unrolled_push, item), product_itr)
         end
     end
 
 @inline unrolled_applyat(f, n, itrs...) = unrolled_foreach(
-    (i, items...) -> i == n && f(items...),
-    unrolled_enumerate(itrs...),
+    (n′, items...) -> n′ == n && f(items...),
+    lazy_enumerate(itrs...),
 )
 
-@inline unrolled_take(itr, ::Val{N}) where {N} = ntuple(i -> itr[i], Val(N))
-@inline unrolled_drop(itr, ::Val{N}) where {N} =
-    ntuple(i -> itr[N + i], Val(length(itr) - N))
-# When its second argument is a Val, ntuple is unrolled via Base.@ntuple.
+@inline unrolled_take(itr, ::Val{N}) where {N} =
+    inferred_constructor(itr)(ntuple(Base.Fix1(getindex, itr), Val(N)))
+
+@inline unrolled_drop(itr, ::Val{N}) where {N} = inferred_constructor(itr)(
+    ntuple(n -> itr[N + n], Val(inferred_length(itr) - N)),
+)
+
+"""
+    StaticSequence{N}
+
+A statically-sized iterator of length `N` for which some unrolled functions have
+lazy or low-storage implementations.
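+
+Subtypes are expected to provide `Base.getindex`; `length_from_type`, `length`,
+`firstindex`, `lastindex`, and `iterate` are defined for all `StaticSequence`s
+below.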
+""" +abstract type StaticSequence{N} end + +length_from_type(::Type{<:StaticSequence{N}}) where {N} = N + +Base.firstindex(itr::StaticSequence) = 1 +Base.lastindex(itr::StaticSequence) = inferred_length(itr) +Base.length(itr::StaticSequence) = inferred_length(itr) + +@inline Base.iterate(itr::StaticSequence, index = 1) = + index > inferred_length(itr) ? nothing : (itr[index], index + 1) + +include("LazyMap.jl") +include("LazySequence.jl") +include("BitSequence.jl") @static if hasfield(Method, :recursion_relation) # Remove recursion limits for functions whose arguments are also functions. for func in ( + unrolled_map_into_tuple, + unrolled_map_into_target, + unrolled_accumulate_into_tuple, + unrolled_accumulate_into_target, unrolled_any, unrolled_all, unrolled_foreach, unrolled_map, - unrolled_reduce_without_init, unrolled_reduce, unrolled_mapreduce, + unrolled_accumulate, + unrolled_mapaccumulate, unrolled_filter, unrolled_split, unrolled_flatmap, unrolled_applyat, + lazy_map, ) for method in methods(func) method.recursion_relation = (_...) -> true diff --git a/src/generated_functions.jl b/src/generated_functions.jl new file mode 100644 index 0000000..acd71c6 --- /dev/null +++ b/src/generated_functions.jl @@ -0,0 +1,41 @@ +f_exprs(itr_type) = (:(f(itr[$n])) for n in 1:length_from_type(itr_type)) +@inline @generated unrolled_any(f, itr) = Expr(:||, f_exprs(itr)...) +@inline @generated unrolled_all(f, itr) = Expr(:&&, f_exprs(itr)...) + +@inline unrolled_any(itr) = unrolled_any(identity, itr) +@inline unrolled_all(itr) = unrolled_all(identity, itr) + +function zipped_f_exprs(itr_types) + L = length(itr_types) + N = L == 0 ? 0 : minimum(length_from_type, itr_types) # match minimum_length + return (:(f($((:(itrs[$l][$n]) for l in 1:L)...))) for n in 1:N) +end +@inline @generated unrolled_foreach(f, itrs...) = + Expr(:block, zipped_f_exprs(itrs)..., nothing) +@inline @generated unrolled_map_into_tuple(f, itrs...) = + Expr(:tuple, zipped_f_exprs(itrs)...) + +function nested_op_expr(itr_type, init_type) + N = length_from_type(itr_type) + (N == 0 && init_type == NoInit) && + error("unrolled_reduce requires an init value for empty iterators") + init_expr = init_type == NoInit ? :(itr[1]) : :init + n_range = init_type == NoInit ? (2:N) : (1:N) + return foldl((expr, n) -> :(op($expr, itr[$n])), n_range; init = init_expr) +end +@inline @generated unrolled_reduce(op, itr, init) = nested_op_expr(itr, init) + +function transformed_sequential_op_exprs(itr_type, init_type) + N = length_from_type(itr_type) + (N == 0 && init_type == NoInit) && + error("unrolled_accumulate requires an init value for empty iterators") + init_op_expr = init_type == NoInit ? :(itr[1]) : :(op(init, itr[1])) + transformed_exprs_and_next_op_exprs = + accumulate(1:N; init = (nothing, init_op_expr)) do (_, op_expr), n + var = gensym() + (:($var = $op_expr; transform($var)), :(op($var, itr[$(n + 1)]))) + end + return map(first, transformed_exprs_and_next_op_exprs) +end +@inline @generated unrolled_accumulate_into_tuple(op, itr, init, transform) = + Expr(:tuple, transformed_sequential_op_exprs(itr, init)...) 
diff --git a/src/unrollable_interface.jl b/src/unrollable_interface.jl
new file mode 100644
index 0000000..b0a319c
--- /dev/null
+++ b/src/unrollable_interface.jl
@@ -0,0 +1,158 @@
+#=
+To use an unrollable iterator of type T with this interface, follow these steps:
+- Add a method for length_from_type(T)
+- If every unrolled function that needs to construct an iterator when given an
+  iterator of type T can return a Tuple, stop here; otherwise, to return a
+  non-Tuple iterator when possible, follow these steps:
+    - Add a method for target_output_type(::T) = O
+    - If the output type used for promotion should be some other type O′, add a
+      method for target_output_type_for_promotion(::T) = O′
+    - If an output of type O can be used together with an output of type O′, add
+      a method for output_promote_rule(O, O′)
+    - If an output of type O can only store elements of a certain type, and if
+      it should be replaced with a Tuple when given elements of any other type,
+      add a method for eltype_restriction(O)
+    - If an output of type O can be efficiently constructed from a Tuple, add a
+      method for output_constructor(O)
+    - If an output of type O cannot be efficiently constructed from a Tuple,
+      follow these steps:
+        - Add a method for empty_output(O)
+        - Add a method for every unrolled function that can be optimized to
+          handle outputs of type O
+        - Check that every other unrolled function that needs to construct an
+          iterator can return a Tuple instead of an output of type O
+=#
+
+"""
+    length_from_type(itr_type)
+
+The length of an iterator of type `itr_type`.
+"""
+length_from_type(::Type{<:NTuple{N, Any}}) where {N} = N
+length_from_type(::Type{T}) where {T} = error(
+    "UnrolledUtilities.length_from_type must be defined for the following \
+     type before iterators of this type can be unrolled: $T",
+)
+
+@inline inferred_length(itr) = length_from_type(typeof(itr))
+
+@inline minimum_length(itrs) =
+    length(itrs) == 0 ? 0 : unrolled_mapreduce(inferred_length, min, itrs)
+
+"""
+    target_output_type(itr)
+
+The type of output that unrolled functions should try to generate for the input
+iterator `itr`. By default, this is set to `Tuple`.
+"""
+target_output_type(_) = Tuple
+
+"""
+    target_output_type_for_promotion(itr)
+
+An alternative to `target_output_type(itr)` used within the promotion process.
+This type of output might not be able to store all of the elements in `itr`
+(i.e., it might have an `eltype_restriction` that makes it incompatible with
+`itr`), but lazy wrappers around `itr` could still use it as their output type.
+"""
+target_output_type_for_promotion(itr) = target_output_type(itr)
+
+"""
+    output_promote_rule(output_type1, output_type2)
+
+The type of output that should be generated when two iterators have different
+target output types, or `Union{}` if these output types should not be combined.
+Only one method of `output_promote_rule` needs to be defined for any given pair
+of output types.
+
+The built-in promotion rules are
+[`LazySequence`](@ref UnrolledUtilities.LazySequence) <
+[`BitSequence`](@ref UnrolledUtilities.BitSequence) < `Tuple` < `Any`. By
+default, the result for any other pair of distinct output types is `Union{}`.
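+
+For example, the method defined in BitSequence.jl that lets `BitSequence`
+outputs be replaced by any other output type is
+`output_promote_rule(::Type{B}, ::Type{O}) where {B <: BitSequence, O} = O`.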
+""" +output_promote_rule(::Type, ::Type) = Union{} +output_promote_rule(::Type{O}, ::Type{O}) where {O} = O +output_promote_rule(::Type{O}, ::Type{Tuple}) where {O} = O + +@inline function output_promote_result(O1, O2) + O12 = output_promote_rule(O1, O2) + O21 = output_promote_rule(O2, O1) + O12 == O21 == Union{} && + error("output_promote_rule is undefined for types $O1 and $O2") + (O12 == O21 || O21 == Union{}) && return O12 + O12 == Union{} && return O21 + error("output_promote_rule yields inconsistent results for types $O1 \ + and $O2: $O12 for ($O1, $O2) and $O21 for ($O2, $O1)") +end + +@inline promoted_target_output_type(itrs) = + length(itrs) == 0 ? Tuple : # Make an empty Tuple when given 0 iterators. + unrolled_mapreduce( + target_output_type_for_promotion, + output_promote_result, + itrs, + ) + +""" + eltype_restriction(output_type) + +The most general element type that can be stored in an output of type +`output_type`. By default, this is assumed to be `Any`. + +If `output_promote_rule` specifies that a particular type of output should be +constructed, but that type cannot store all of the elements that need to be in +the output, a `Tuple` will be generated instead of the promoted output type. +""" +eltype_restriction(::Type) = Any + +@inline function output_type_of_map(f, itrs...) + output_type = promoted_target_output_type(itrs) + allowed_eltype = eltype_restriction(output_type) + allowed_eltype == Any && return output_type + result_eltype = Base.promote_op(f, unrolled_map_into_tuple(eltype, itrs)...) + return result_eltype <: allowed_eltype ? output_type : Tuple +end + +@inline function output_type_of_accumulate(op, itr, init, transform) + output_type = target_output_type_for_promotion(itr) + allowed_eltype = eltype_restriction(output_type) + allowed_eltype == Any && return output_type + first_value_type = init isa NoInit ? eltype(itr) : typeof(init) + untransformed_type = Base.promote_op(op, first_value_type, eltype(itr)) + result_eltype = Base.promote_op(transform, untransformed_type) + return result_eltype <: allowed_eltype ? output_type : Tuple +end + +""" + output_constructor(output_type) + +A function that can be used to efficiently construct an output of type +`output_type` from a `Tuple`, or `identity` if such an output should not be +constructed from a `Tuple`. By default, this is set to `identity`, which also +handles the case where `output_type` is already `Tuple`. + +Many unrollable iterators (e.g., `SVector`s) are essentially wrappers for +`Tuple`s, and their constructors for `Tuple`s can be reduced to no-ops. The main +exceptions are [`StaticSequence`](@ref UnrolledUtilities.StaticSequence)s, which +do not provide constructors for `Tuple`s because there is no performance benefit +to converting a high-storage data structure into a low-storage data structure +after it has already been constructed. +""" +output_constructor(::Type) = identity + +@inline inferred_constructor(itr) = output_constructor(target_output_type(itr)) + +@inline promoted_constructor(itrs) = + output_constructor(promoted_target_output_type(itrs)) + +""" + empty_output(output_type) + +An empty output of type `output_type`. By default, this applies the +`output_constructor` for `output_type` to an empty `Tuple`. 
+""" +@inline empty_output(output_type) = output_constructor(output_type)(()) + +@inline inferred_empty(itr) = empty_output(target_output_type(itr)) + +@inline promoted_empty(itrs) = empty_output(promoted_target_output_type(itrs)) diff --git a/test/aqua.jl b/test/aqua.jl index d7becf1..ff1edd1 100644 --- a/test/aqua.jl +++ b/test/aqua.jl @@ -1,3 +1,4 @@ +using Test import Aqua, UnrolledUtilities # This is separate from all the other tests because Aqua.test_all checks for diff --git a/test/test_and_analyze.jl b/test/test_and_analyze.jl index 70415e3..5006e21 100644 --- a/test/test_and_analyze.jl +++ b/test/test_and_analyze.jl @@ -18,30 +18,43 @@ function print_comparison_table(io = stdout, generate_html = false) generate_html ? HtmlHighlighter(f, HtmlDecoration(; color)) : Highlighter(f, Crayon(; foreground = Symbol(color))) - better_performance_but_harder_to_compile = - highlighter(generate_html ? "royalblue" : "blue") do data, i, j - data[i, 4] != data[i, 5] && - (endswith(data[i, 6], "slower") || endswith(data[i, 7], "more")) + better_performance_but_worse_compilation_or_allocations = + highlighter(generate_html ? "royalblue" : "blue") do data, i, _ + ( + contains(data[i, 4], "better") && + !contains(data[i, 5], "more") || contains(data[i, 5], "less") + ) && (contains(data[i, 6], "more") || contains(data[i, 7], "more")) end better_performance = - highlighter(generate_html ? "mediumseagreen" : "green") do data, i, j - data[i, 4] != data[i, 5] + highlighter(generate_html ? "mediumseagreen" : "green") do data, i, _ + contains(data[i, 4], "better") && !contains(data[i, 5], "more") || + contains(data[i, 5], "less") end - mixed_compilation = - highlighter(generate_html ? "mediumorchid" : "magenta") do data, i, j - (endswith(data[i, 6], "slower") && endswith(data[i, 7], "less")) || - (endswith(data[i, 6], "faster") && endswith(data[i, 7], "more")) + similar_performance_but_mixed_compilation_and_allocations = + highlighter(generate_html ? "mediumorchid" : "magenta") do data, i, _ + contains(data[i, 5], "similar") && ( + contains(data[i, 6], "more") && contains(data[i, 7], "less") || + contains(data[i, 6], "less") && contains(data[i, 7], "more") + ) + end + worse_performance_or_compilation_or_allocations = + highlighter(generate_html ? "indianred" : "red") do data, i, _ + contains(data[i, 5], "more") || + contains(data[i, 6], "more") || + contains(data[i, 7], "more") end - harder_to_compile = - highlighter(generate_html ? "indianred" : "red") do data, i, j - endswith(data[i, 6], "slower") || endswith(data[i, 7], "more") + better_compilation_or_allocations = + highlighter(generate_html ? "darkturquoise" : "cyan") do data, i, _ + contains(data[i, 6], "less") || contains(data[i, 7], "less") end - easier_to_compile = - highlighter(generate_html ? "darkturquoise" : "cyan") do data, i, j - endswith(data[i, 6], "faster") || endswith(data[i, 7], "less") + no_measurable_difference = + highlighter(generate_html ? "khaki" : "yellow") do data, i, _ + @assert contains(data[i, 4], "similar") && + contains(data[i, 5], "similar") && + contains(data[i, 6], "similar") && + contains(data[i, 7], "similar") + true end - no_difference = - highlighter((data, i, j) -> true, generate_html ? "khaki" : "yellow") other_kwargs = generate_html ? 
@@ -54,7 +67,7 @@ function print_comparison_table(io = stdout, generate_html = false)
         ) :
         (;
             title_same_width_as_table = true,
-            columns_width = [45, 45, 0, 0, 0, 0, 0],
+            columns_width = [45, 45, 30, 25, 20, 20, 30],
             linebreaks = true,
            autowrap = true,
             crop = :none,
@@ -70,23 +83,66 @@ function print_comparison_table(io = stdout, generate_html = false)
             "Unrolled Expression",
             "Reference Expression",
             "Iterator Contents",
-            "Unrolled Performance",
-            "Reference Performance",
-            "Unrolled Compilation Time",
-            "Unrolled Compilation Memory",
+            "Optimization",
+            "Run Time",
+            "Compilation Time",
+            any(contains('['), table_data[:, 7]) ?
+            "Total GC [and RSS] Allocations" : "Total Allocations",
         ],
         highlighters = (
-            better_performance_but_harder_to_compile,
+            better_performance_but_worse_compilation_or_allocations,
             better_performance,
-            mixed_compilation,
-            harder_to_compile,
-            easier_to_compile,
-            no_difference,
+            similar_performance_but_mixed_compilation_and_allocations,
+            worse_performance_or_compilation_or_allocations,
+            better_compilation_or_allocations,
+            no_measurable_difference,
        ),
         other_kwargs...,
     )
 end
 
+function time_string(nanoseconds)
+    nanoseconds == 0 && return "$nanoseconds ns"
+    n_decimal_digits = floor(Int, log10(nanoseconds) + 1)
+    return if n_decimal_digits <= 3
+        "$nanoseconds ns"
+    elseif n_decimal_digits <= 6
+        "$(nanoseconds ÷ 10^3) μs"
+    elseif n_decimal_digits <= 9
+        "$(nanoseconds ÷ 10^6) ms"
+    else
+        "$(nanoseconds ÷ 10^9) s"
+    end
+end
+
+function memory_string(bytes)
+    bytes == 0 && return "$bytes B"
+    n_binary_digits = floor(Int, log2(bytes) + 1)
+    return if n_binary_digits <= 10
+        "$bytes B"
+    elseif n_binary_digits <= 20
+        "$(bytes ÷ 2^10) kB"
+    elseif n_binary_digits <= 30
+        "$(bytes ÷ 2^20) MB"
+    else
+        "$(bytes ÷ 2^30) GB"
+    end
+end
+
+function comparison_string(value1, value2, to_string, to_number = identity)
+    ratio = to_number(value1) / to_number(value2)
+    summary_str = if ratio >= 1.5
+        rounded_ratio = ratio == Inf ? Inf : round(Int, ratio)
+        "$rounded_ratio times more"
+    elseif inv(ratio) >= 1.5
+        rounded_inv_ratio = ratio == 0 ? Inf : round(Int, inv(ratio))
+        "$rounded_inv_ratio times less"
+    else
+        "similar"
+    end
+    return "$summary_str ($(to_string(value1)) vs. $(to_string(value2)))"
+end
+
 function drop_line_numbers(expr)
     expr isa Expr || return expr
     new_args = map(drop_line_numbers, expr.args)
@@ -118,7 +174,35 @@ function code_instance(f, args...)
     end
 end
 
-macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str)
+macro benchmark(expression)
+    return quote
+        prev_time = time_ns()
+        $(esc(expression))
+        new_time = time_ns()
+        best_time = new_time - prev_time
+
+        # Benchmark for at most 0.1 s (10^8 ns), ignoring the first call above.
+        n_trials = 0
+        start_time = new_time
+        while n_trials < 10^4 && new_time - start_time < 10^8
+            prev_time = time_ns()
+            $(esc(expression))
+            new_time = time_ns()
+            best_time = min(best_time, new_time - prev_time)
+            n_trials += 1
+        end
+
+        best_time
+    end
+end
+
+macro test_unrolled(
+    args_expr,
+    unrolled_expr,
+    reference_expr,
+    contents_info_str,
+    skip_allocations_test = false,
+)
     @assert Meta.isexpr(args_expr, :tuple)
     arg_names = args_expr.args
     @assert all(arg_name -> arg_name isa Symbol, arg_names)
@@ -146,26 +230,27 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str)
             reference_func_and_nothing($(args...))
 
             # Test for allocations.
-            @test (@allocated unrolled_func_and_nothing($(args...))) == 0
+            is_unrolled_non_allocating =
+                (@allocated unrolled_func_and_nothing($(args...))) == 0
             is_reference_non_allocating =
                 (@allocated reference_func_and_nothing($(args...))) == 0
+            $(esc(skip_allocations_test)) || @test is_unrolled_non_allocating
 
             # Test for type-stability.
             @test_opt unrolled_func($(args...))
             is_reference_stable =
                 isempty(JET.get_reports(@report_opt reference_func($(args...))))
 
-            unrolled_instance = code_instance(unrolled_func, $(args...))
-            reference_instance = code_instance(reference_func, $(args...))
-
             # Test for constant propagation.
-            is_unrolled_const = isdefined(unrolled_instance, :rettype_const)
+            is_unrolled_const =
+                isdefined(code_instance(unrolled_func, $(args...)), :rettype_const)
             Base.issingletontype(typeof(($(args...),))) && @test is_unrolled_const
-            is_reference_const = isdefined(reference_instance, :rettype_const)
+            is_reference_const =
+                isdefined(code_instance(reference_func, $(args...)), :rettype_const)
 
             buffer = IOBuffer()
 
-            # Check whether the functions are fully optimized out.
+            # Determine whether the functions are fully optimized out.
             args_type = Tuple{map(typeof, ($(args...),))...}
             code_llvm(buffer, unrolled_func, args_type; debuginfo = :none)
             is_unrolled_optimized_out =
@@ -174,86 +259,105 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str)
             is_reference_optimized_out =
                 length(split(String(take!(buffer)), '\n')) == 5
 
+            # Test the overall level of optimization.
+            unrolled_opt_str, unrolled_opt_score = if !is_unrolled_non_allocating
+                "allocating", 1
+            elseif !is_unrolled_const && !is_unrolled_optimized_out
+                "type-stable", 3
+            elseif !is_unrolled_optimized_out
+                "constant", 4
+            else
+                "optimized out", 5
+            end
+            reference_opt_str, reference_opt_score = if !is_reference_non_allocating
+                "allocating", 1
+            elseif !is_reference_stable
+                "type-unstable", 2
+            elseif !is_reference_const && !is_reference_optimized_out
+                "type-stable", 3
+            elseif !is_reference_optimized_out
+                "constant", 4
+            else
+                "optimized out", 5
+            end
+            @test unrolled_opt_score >= reference_opt_score
+
+            # Measure the run times.
+            unrolled_run_time = @benchmark unrolled_func($(args...))
+            reference_run_time = @benchmark reference_func($(args...))
+
+            # Measure the compilation times and memory usages in separate processes
+            # to ensure that they are not under-counted.
             arg_name_strs = ($(map(string, arg_names)...),)
             arg_names_str = join(arg_name_strs, ", ")
             arg_definition_strs =
                 map((name, value) -> "$name = $value", arg_name_strs, ($(args...),))
             arg_definitions_str = join(arg_definition_strs, '\n')
-            unrolled_command_str = """
+            command_str(func_str) = """
                 using UnrolledUtilities
-                unrolled_func($arg_names_str) = $($(string(unrolled_expr)))
+                func_and_nothing($arg_names_str) = ($func_str; nothing)
                 $arg_definitions_str
-                stats1 = @timed unrolled_func($arg_names_str)
-                stats2 = @timed unrolled_func($arg_names_str)
-                print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes)
-                """
-            reference_command_str = """
-                reference_func($arg_names_str) = $($(string(reference_expr)))
-                $arg_definitions_str
-                stats1 = @timed reference_func($arg_names_str)
-                stats2 = @timed reference_func($arg_names_str)
-                print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes)
+                begin # The following block is based on @time from "base/timing.jl".
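+                    # Compile timing is enabled around the first (forced) call,
+                    # so the deltas below isolate compilation in this process.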
+                    Base.Experimental.@force_compile
+                    Base.cumulative_compile_timing(true)
+                    time_ns_1 = Base.cumulative_compile_time_ns()[1]
+                    rss_bytes_1 = Sys.maxrss()
+                    Δgc_bytes = @allocated func_and_nothing($arg_names_str)
+                    rss_bytes_2 = Sys.maxrss()
+                    time_ns_2 = Base.cumulative_compile_time_ns()[1]
+                    Base.cumulative_compile_timing(false)
+                end
+                Δtime_ns = time_ns_2 - time_ns_1
+                Δrss_bytes = rss_bytes_2 - rss_bytes_1
+                print(Δtime_ns, ", ", Δgc_bytes, ", ", Δrss_bytes)
                 """
-
-            # Get the unrolled function's time-to-first-run and its memory usage.
+            unrolled_command_str = command_str($(string(unrolled_expr)))
             run(pipeline(`julia --project -e $unrolled_command_str`, buffer))
-            unrolled_time, unrolled_memory =
-                parse.((Float64, Int), split(String(take!(buffer)), ','))
+            unrolled_compile_time, unrolled_memory_gc, unrolled_memory_rss =
+                parse.((Int, Int, Int), split(String(take!(buffer)), ','))
 
             # Make a new buffer to avoid a potential data race:
             # https://discourse.julialang.org/t/iobuffer-becomes-not-writable-after-run/92323/3
             close(buffer)
             buffer = IOBuffer()
-
-            # Get the reference function's time-to-first-run and its memory usage.
+            reference_command_str = command_str($(string(reference_expr)))
             run(pipeline(`julia --project -e $reference_command_str`, buffer))
-            reference_time, reference_memory =
-                parse.((Float64, Int), split(String(take!(buffer)), ','))
+            reference_compile_time, reference_memory_gc, reference_memory_rss =
+                parse.((Int, Int, Int), split(String(take!(buffer)), ','))
             close(buffer)
 
-            # Record all relevant information in comparison_table_dict.
-            unrolled_performance_str = if !is_unrolled_const
-                "type-stable"
-            elseif !is_unrolled_optimized_out
-                "const return value"
-            else
-                "fully optimized out"
-            end
-            reference_performance_str = if !is_reference_non_allocating
-                "allocating"
-            elseif !is_reference_stable
-                "type-unstable"
-            elseif !is_reference_const
-                "type-stable"
-            elseif !is_reference_optimized_out
-                "const return value"
-            else
-                "fully optimized out"
-            end
-            time_ratio = unrolled_time / reference_time
-            time_ratio_str = if time_ratio >= 1.5
-                "$(round(Int, time_ratio)) times slower"
-            elseif inv(time_ratio) >= 1.5
-                "$(round(Int, inv(time_ratio))) times faster"
-            else
-                "similar"
-            end
-            memory_ratio = unrolled_memory / reference_memory
-            memory_ratio_str = if memory_ratio >= 1.5
-                "$(round(Int, memory_ratio)) times more"
-            elseif inv(memory_ratio) >= 1.5
-                "$(round(Int, inv(memory_ratio))) times less"
-            else
-                "similar"
-            end
+            optimization_str =
+                unrolled_opt_str == reference_opt_str ?
+                "similar ($unrolled_opt_str)" :
+                "better ($unrolled_opt_str vs. $reference_opt_str)"
+            run_time_str = comparison_string(
+                unrolled_run_time,
+                reference_run_time,
+                time_string,
+            )
+            compile_time_str = comparison_string(
+                unrolled_compile_time,
+                reference_compile_time,
+                time_string,
+            )
+            memory_str = comparison_string(
+                (unrolled_memory_gc, unrolled_memory_rss),
+                (reference_memory_gc, reference_memory_rss),
+                ((gc_bytes, rss_bytes),) ->
+                    rss_bytes == 0 ? memory_string(gc_bytes) :
+                    "$(memory_string(gc_bytes)) [$(memory_string(rss_bytes))]",
+                first, # Use GC value for comparison since RSS might be unavailable.
+            )
+            # TODO: Why does Sys.maxrss() seem to always return 0 on Ubuntu systems?
+
             dict_key = ($unrolled_expr_str, $reference_expr_str)
             dict_entry = (
                 $(esc(contents_info_str)),
-                unrolled_performance_str,
-                reference_performance_str,
-                time_ratio_str,
-                memory_ratio_str,
+                optimization_str,
+                run_time_str,
+                compile_time_str,
+                memory_str,
             )
             if dict_key in keys(comparison_table_dict)
                 push!(comparison_table_dict[dict_key], dict_entry)
@@ -342,10 +446,6 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false))
         str,
     )
 
-    @test_unrolled (itr,) unrolled_zip(itr) Tuple(zip(itr)) str
-
-    @test_unrolled (itr,) unrolled_enumerate(itr) Tuple(enumerate(itr)) str
-
     @test_unrolled (itr,) unrolled_in(nothing, itr) (nothing in itr) str
     @test_unrolled (itr,) unrolled_in(itr[1], itr) (itr[1] in itr) str
     @test_unrolled (itr,) unrolled_in(itr[end], itr) (itr[end] in itr) str
@@ -479,19 +579,6 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false))
         str23,
     )
 
-    @test_unrolled(
-        (itr1, itr2),
-        unrolled_zip(itr1, itr2),
-        Tuple(zip(itr1, itr2)),
-        str12,
-    )
-    @test_unrolled(
-        (itr1, itr2, itr3),
-        unrolled_zip(itr1, itr2, itr3),
-        Tuple(zip(itr1, itr2, itr3)),
-        str123,
-    )
-
     # unrolled_product can take several minutes to compile when n is large
     if n <= 33
         @test_unrolled(
@@ -561,3 +648,112 @@ for n in (8, 32, 128)
         end
     end
 end
+
+@testset "Tuple vs. BitSequence for nested reductions" begin
+    for (itr, skip_allocations_test) in (
+        (ntuple(_ -> true, Val(32)), false),
+        (ntuple(_ -> true, Val(33)), true),
+        (BitSequence{256}(true), false),
+        (BitSequence{257}(true), true),
+    )
+        n = length(itr)
+        indices = LazySequence{n}()
+        @test_unrolled(
+            (itr, indices),
+            unrolled_reduce(
+                (itr′, i) -> Base.setindex(itr′, !itr′[i], i),
+                indices;
+                init = itr,
+            ),
+            reduce(
+                (itr′, i) -> Base.setindex(itr′, !itr′[i], i),
+                indices;
+                init = itr,
+            ),
+            "$n bits$(itr isa BitSequence ? " stored in a BitSequence" : "")",
+            skip_allocations_test,
+        )
+        @test_unrolled(
+            (itr, indices),
+            unrolled_reduce(
+                (itr′, i) -> unrolled_reduce(
+                    (itr′′, j) ->
+                        Base.setindex(itr′′, !itr′′[min(i, j)], j),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            reduce(
+                (itr′, i) -> reduce(
+                    (itr′′, j) ->
+                        Base.setindex(itr′′, !itr′′[min(i, j)], j),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            "$n bits$(itr isa BitSequence ? " stored in a BitSequence" : "")",
+            skip_allocations_test,
+        )
+        length(itr) == 257 || @test_unrolled(
+            (itr, indices),
+            unrolled_reduce(
+                (itr′, i) -> unrolled_reduce(
+                    (itr′′, j) -> unrolled_reduce(
+                        (itr′′′, k) -> Base.setindex(
+                            itr′′′,
+                            !itr′′′[min(i, j, k)],
+                            k,
+                        ),
+                        indices;
+                        init = itr′′,
+                    ),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            reduce(
+                (itr′, i) -> reduce(
+                    (itr′′, j) -> reduce(
+                        (itr′′′, k) -> Base.setindex(
+                            itr′′′,
+                            !itr′′′[min(i, j, k)],
+                            k,
+                        ),
+                        indices;
+                        init = itr′′,
+                    ),
+                    indices;
+                    init = itr′,
+                ),
+                indices;
+                init = itr,
+            ),
+            "$n bits$(itr isa BitSequence ? " stored in a BitSequence" : "")",
+            skip_allocations_test,
+        ) # Skip this test for BitSequence{257} because it allocates over 2 GB.
+    end
+end
+
+# We cannot generate an unrolled function with more than 8187 calls to getindex.
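+# (Past this limit, code generation fails with an error about running out of
+# "gc handles", which the tests below confirm.)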
+@testset "maximum unrollable N" begin
+    itr = LazySequence{8187}()
+    @test_unrolled(
+        (itr,),
+        unrolled_mapreduce(abs2, +, itr),
+        mapreduce(abs2, +, itr),
+        "8187 integers stored in a LazySequence",
+    )
+
+    @test_throws "gc handles" unrolled_mapreduce(abs2, +, LazySequence{8188}())
+    @test_throws "gc handles" unrolled_mapreduce(
+        abs2,
+        +,
+        ntuple(identity, Val(8188)),
+    )
+end